diff --git a/.github/workflows/ifu.yml b/.github/workflows/ifu.yml
new file mode 100644
index 000000000000..82c13c6a12a3
--- /dev/null
+++ b/.github/workflows/ifu.yml
@@ -0,0 +1,63 @@
+name: IntegrateFromUpstream
+on:
+#  schedule:
+#    # verified via crontab.guru website. “At 06:55 on Monday.”
+#    - cron: '55 6 * * 1'
+  workflow_dispatch:
+    inputs:
+      message:
+        description: 'Reason for manual trigger'
+        required: false
+        default: 'refresh branch'
+jobs:
+  IntegrateFromUpstream:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - name: Get Current Date
+        id: date
+        run: echo "::set-output name=date::$(date +'%Y-%m-%d')"
+      - name: Extract branch name
+        id: extract_branch
+        shell: bash
+        run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})"
+      - name: Fetch and Merge
+        id: fetch_and_merge
+        run: |
+          echo "Reason for trigger: ${{ github.event.inputs.message }}"
+          echo "Actor for trigger: ${{ github.actor }}"
+          git config user.name github-actions
+          git config user.email github-actions@github.com
+          git remote add upstream https://github.com/microsoft/DeepSpeed
+          git fetch upstream master
+          git merge upstream/master
+          # Since we use our own fork of DeepSpeedExamples, ignore theirs
+          git checkout HEAD DeepSpeedExamples
+      - name: Create Pull Request
+        id: create_pull_request
+        uses: jithunnair-amd/create-pull-request@v3
+        with:
+#          token: ${{ secrets.PAT }}
+          branch: IFU-${{ steps.extract_branch.outputs.branch }}-${{ steps.date.outputs.date }}
+          title: IFU-${{ steps.extract_branch.outputs.branch }}-${{ steps.date.outputs.date }}
+          assignees: rraminen
+          reviewers: jithunnair-amd
+          delete-branch: true
+      - name: Send email
+        uses: jithunnair-amd/action-send-mail@v3.1.0
+        if: always()
+        with:
+          server_address: smtp.gmail.com
+          server_port: 465
+          secure: true
+          username: ${{ secrets.GMAIL_USERNAME }}
+          password: ${{ secrets.GMAIL_PASSWORD }}
+          subject: IFU to ${{ steps.extract_branch.outputs.branch }} branch of ${{ github.repository }}
+          to: Jithun.Nair@amd.com, RamyaSai.Ramineni@amd.com
+          from: ${{ secrets.GMAIL_USERNAME }}
+          html_body: |
+            Fetch and Merge: ${{ steps.fetch_and_merge.outcome }}
+            Create Pull Request: ${{ steps.create_pull_request.outcome }}
+            Pull request: ${{ steps.create_pull_request.outputs.pull-request-url }}
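
The Fetch and Merge step above can be previewed locally before the workflow opens a PR. A minimal shell sketch, assuming the fork is already checked out; the error guard and the "|| true" on git remote add are local-use additions, not part of the workflow:

    set -euo pipefail

    # Mirror the workflow's Fetch and Merge step against microsoft/DeepSpeed.
    git config user.name github-actions
    git config user.email github-actions@github.com
    git remote add upstream https://github.com/microsoft/DeepSpeed || true
    git fetch upstream master
    git merge upstream/master

    # The fork pins its own DeepSpeedExamples submodule, so keep our pointer.
    git checkout HEAD DeepSpeedExamples
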
diff --git a/.gitmodules b/.gitmodules index 37adb6f39e5c..072bdc50817e 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,4 +1,3 @@ [submodule "DeepSpeedExamples"] path = DeepSpeedExamples - url = https://github.com/microsoft/DeepSpeedExamples - branch = master + url = https://github.com/ROCmSoftwarePlatform/DeepSpeedExamples.git diff --git a/DeepSpeedExamples b/DeepSpeedExamples index 25d73cf73fb3..36846da89d5b 160000 --- a/DeepSpeedExamples +++ b/DeepSpeedExamples @@ -1 +1 @@ -Subproject commit 25d73cf73fb3dc66faefa141b7319526555be9fc +Subproject commit 36846da89d5be7e13465f95be7074b4ccd5898cd diff --git a/csrc/includes/cublas_wrappers.h b/csrc/includes/cublas_wrappers.h index 19d726c3bcd3..9bb6cc30f6ae 100644 --- a/csrc/includes/cublas_wrappers.h +++ b/csrc/includes/cublas_wrappers.h @@ -5,7 +5,9 @@ #include #include #include +#ifndef __HIP_PLATFORM_HCC__ #include +#endif #include int cublas_gemm_ex(cublasHandle_t handle, @@ -19,7 +21,11 @@ int cublas_gemm_ex(cublasHandle_t handle, const float* A, const float* B, float* C, +#ifdef __HIP_PLATFORM_HCC__ + rocblas_gemm_algo algo = rocblas_gemm_algo_standard); +#else cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT); +#endif int cublas_gemm_ex(cublasHandle_t handle, cublasOperation_t transa, @@ -32,7 +38,11 @@ int cublas_gemm_ex(cublasHandle_t handle, const __half* A, const __half* B, __half* C, +#ifdef __HIP_PLATFORM_HCC__ + rocblas_gemm_algo algo = rocblas_gemm_algo_standard); +#else cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP); +#endif int cublas_strided_batched_gemm(cublasHandle_t handle, int m, @@ -49,7 +59,11 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, int stride_B, int stride_C, int batch, +#ifdef __HIP_PLATFORM_HCC__ + rocblas_gemm_algo algo = rocblas_gemm_algo_standard); +#else cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT); +#endif int cublas_strided_batched_gemm(cublasHandle_t handle, int m, @@ -66,4 +80,8 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, int stride_B, int stride_C, int batch, +#ifdef __HIP_PLATFORM_HCC__ + rocblas_gemm_algo algo = rocblas_gemm_algo_standard); +#else cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP); +#endif diff --git a/csrc/includes/custom_cuda_layers.h b/csrc/includes/custom_cuda_layers.h index eb10234d11f2..57d372f6ee14 100644 --- a/csrc/includes/custom_cuda_layers.h +++ b/csrc/includes/custom_cuda_layers.h @@ -5,7 +5,11 @@ #include #include +#ifdef __HIP_PLATFORM_HCC__ +#include +#else #include +#endif #include #include "context.h" diff --git a/csrc/includes/feed_forward.h b/csrc/includes/feed_forward.h index 7b7379d9b998..3a59d56ee6cd 100644 --- a/csrc/includes/feed_forward.h +++ b/csrc/includes/feed_forward.h @@ -43,7 +43,11 @@ class FeedForward { weights, input_ptr, out, +#ifdef __HIP_PLATFORM_HCC__ + rocblas_gemm_algo(config_.gemm_algos[0])); +#else cublasGemmAlgo_t(config_.gemm_algos[0])); +#endif } void Backward(int bsz, const T* out_grad, @@ -68,7 +72,11 @@ class FeedForward { input_ptr, out_grad, weights_grad, +#ifdef __HIP_PLATFORM_HCC__ + rocblas_gemm_algo(config_.gemm_algos[1])); +#else cublasGemmAlgo_t(config_.gemm_algos[1])); +#endif cublas_gemm_ex(_cublasHandle, CUBLAS_OP_N, @@ -81,7 +89,11 @@ class FeedForward { weights, out_grad, inp_grad_out, +#ifdef __HIP_PLATFORM_HCC__ + rocblas_gemm_algo(config_.gemm_algos[2])); +#else cublasGemmAlgo_t(config_.gemm_algos[2])); +#endif launch_fuse_transpose_bias_kernel(out_grad, bias_grad, bsz, config_.outputSize, stream); } diff --git a/csrc/includes/gemm_test.h b/csrc/includes/gemm_test.h index 
b920896b419e..647861c09afd 100644 --- a/csrc/includes/gemm_test.h +++ b/csrc/includes/gemm_test.h @@ -2,7 +2,9 @@ #pragma once #include +#ifndef __HIP_PLATFORM_HCC__ #include +#endif #include #include #include @@ -58,7 +60,11 @@ class GemmTest { B, A, C, +#ifdef __HIP_PLATFORM_HCC__ + static_cast(algo)); +#else static_cast(algo)); +#endif }); int algo_bw1 = Run(loops, [=](int algo) { @@ -73,7 +79,11 @@ class GemmTest { A, C, B, +#ifdef __HIP_PLATFORM_HCC__ + static_cast(algo)); +#else static_cast(algo)); +#endif }); int algo_bw2 = Run(loops, [=](int algo) { @@ -88,7 +98,11 @@ class GemmTest { B, C, A, +#ifdef __HIP_PLATFORM_HCC__ + static_cast(algo)); +#else static_cast(algo)); +#endif }); return std::array({algo_fw, algo_bw1, algo_bw2}); @@ -100,8 +114,13 @@ class GemmTest { float fast_latency = (std::numeric_limits::max)(); int fast_algo = 0; +#ifdef __HIP_PLATFORM_HCC__ + for (int algo = (int)rocblas_gemm_algo_standard; + algo <= (int)rocblas_gemm_algo_standard; +#else for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP; +#endif algo++) { int warm_up = 5; for (int i = 0; i < warm_up; ++i) f(algo); @@ -186,7 +205,11 @@ class StridedGemmTest { stride_b, stride_c, bsz, +#ifdef __HIP_PLATFORM_HCC__ + static_cast(algo)); +#else static_cast(algo)); +#endif }); int algo_bw1 = Run(loops, [=](int algo) { @@ -216,7 +239,11 @@ class StridedGemmTest { stride_b, stride_c, bsz, +#ifdef __HIP_PLATFORM_HCC__ + static_cast(algo)); +#else static_cast(algo)); +#endif }); int algo_bw2 = Run(loops, [=](int algo) { @@ -243,7 +270,11 @@ class StridedGemmTest { stride_b, stride_c, bsz, +#ifdef __HIP_PLATFORM_HCC__ + static_cast(algo)); +#else static_cast(algo)); +#endif }); return std::array({algo_fw, algo_bw1, algo_bw2}); @@ -255,8 +286,13 @@ class StridedGemmTest { float fast_latency = (std::numeric_limits::max)(); int fast_algo = 0; +#ifdef __HIP_PLATFORM_HCC__ + for (int algo = (int)rocblas_gemm_algo_standard; + algo <= (int)rocblas_gemm_algo_standard; +#else for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP; +#endif algo++) { int warm_up = 5; for (int i = 0; i < warm_up; ++i) f(algo); diff --git a/csrc/includes/general_kernels.h b/csrc/includes/general_kernels.h index 588cf2aaa048..62416f0124dc 100644 --- a/csrc/includes/general_kernels.h +++ b/csrc/includes/general_kernels.h @@ -3,7 +3,11 @@ #include #include +#ifdef __HIP_PLATFORM_HCC__ +#include +#else #include +#endif #include #include "context.h" diff --git a/csrc/includes/strided_batch_gemm.h b/csrc/includes/strided_batch_gemm.h index 44a1b313b986..6233e3e06240 100644 --- a/csrc/includes/strided_batch_gemm.h +++ b/csrc/includes/strided_batch_gemm.h @@ -72,7 +72,11 @@ class StridedBatchGemm { stride_b, stride_c, bsz, +#ifdef __HIP_PLATFORM_HCC__ + rocblas_gemm_algo(_config.gemm_algos[0])); +#else cublasGemmAlgo_t(_config.gemm_algos[0])); +#endif } void ForwardPlusSave(T* output, const T* _buffer_a, const T* _buffer_b, cublasHandle_t handle) @@ -96,7 +100,11 @@ class StridedBatchGemm { stride_b, stride_c, _config.batch_size, +#ifdef __HIP_PLATFORM_HCC__ + rocblas_gemm_algo(_config.gemm_algos[0])); +#else cublasGemmAlgo_t(_config.gemm_algos[0])); +#endif k_buf = _buffer_a; q_buf = _buffer_b; @@ -136,7 +144,11 @@ class StridedBatchGemm { stride_b, stride_c, bsz, +#ifdef __HIP_PLATFORM_HCC__ + rocblas_gemm_algo(_config.gemm_algos[1])); +#else cublasGemmAlgo_t(_config.gemm_algos[1])); +#endif // A need to transpose. cublasOperation_t op_a = (_config.op_A == CUBLAS_OP_T ? 
CUBLAS_OP_N : CUBLAS_OP_T); @@ -161,7 +173,11 @@ class StridedBatchGemm { stride_b, stride_c, bsz, +#ifdef __HIP_PLATFORM_HCC__ + rocblas_gemm_algo(_config.gemm_algos[2])); +#else cublasGemmAlgo_t(_config.gemm_algos[2])); +#endif } inline int GetN() const { return _config.k; } diff --git a/csrc/lamb/fused_lamb_cuda_kernel.cu b/csrc/lamb/fused_lamb_cuda_kernel.cu index 0448a45368b9..10a17e98a13d 100644 --- a/csrc/lamb/fused_lamb_cuda_kernel.cu +++ b/csrc/lamb/fused_lamb_cuda_kernel.cu @@ -14,7 +14,11 @@ #include //#include +#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305 +#include +#else #include +#endif #include #include diff --git a/csrc/quantization/quantizer.cu b/csrc/quantization/quantizer.cu index c48ae38969e3..f79c3ecb1e12 100644 --- a/csrc/quantization/quantizer.cu +++ b/csrc/quantization/quantizer.cu @@ -5,7 +5,7 @@ namespace cg = cooperative_groups; __global__ void qunatize_kernel(__half* vals, int group_size, int num_bits) { -#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); @@ -206,7 +206,7 @@ __global__ void sr_qunatize_kernel(__half* vals, int num_bits, std::pair seed) { -#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); @@ -484,7 +484,7 @@ template void launch_sr_qunatize_kernel(__half* vals, __global__ void qunatize_kernel_asym(__half* vals, int group_size, int num_bits) { -#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); @@ -729,7 +729,7 @@ __global__ void sr_qunatize_kernel_asym(__half* vals, int num_bits, std::pair seed) { -#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); diff --git a/csrc/transformer/cublas_wrappers.cu b/csrc/transformer/cublas_wrappers.cu index 72b62386ea6d..d94fc5169efb 100644 --- a/csrc/transformer/cublas_wrappers.cu +++ b/csrc/transformer/cublas_wrappers.cu @@ -1,5 +1,19 @@ #include "cublas_wrappers.h" +#ifdef __HIP_PLATFORM_HCC__ +int cublas_gemm_ex(rocblas_handle handle, + rocblas_operation transa, + rocblas_operation transb, + int m, + int n, + int k, + const float* alpha, + const float* beta, + const float* A, + const float* B, + float* C, + rocblas_gemm_algo algo) +#else int cublas_gemm_ex(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, @@ -12,7 +26,34 @@ int cublas_gemm_ex(cublasHandle_t handle, const float* B, float* C, cublasGemmAlgo_t algo) +#endif { +#ifdef __HIP_PLATFORM_HCC__ + rocblas_status status = rocblas_gemm_ex(handle, + transa, + transb, + m, + n, + k, + (const void*)alpha, + (const void*)A, + rocblas_datatype_f32_r, + (transa == rocblas_operation_none) ? m : k, + (const void*)B, + rocblas_datatype_f32_r, + (transb == rocblas_operation_none) ? 
k : n, + (const void*)beta, + C, + rocblas_datatype_f32_r, + m, + C, + rocblas_datatype_f32_r, + m, + rocblas_datatype_f32_r, + algo, + 0, + 0); +#else cublasStatus_t status = cublasGemmEx(handle, transa, transb, @@ -32,8 +73,13 @@ int cublas_gemm_ex(cublasHandle_t handle, m, CUDA_R_32F, algo); +#endif +#ifdef __HIP_PLATFORM_HCC__ + if (status != rocblas_status_success) { +#else if (status != CUBLAS_STATUS_SUCCESS) { +#endif fprintf(stderr, "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n", m, @@ -45,6 +91,20 @@ int cublas_gemm_ex(cublasHandle_t handle, return 0; } +#ifdef __HIP_PLATFORM_HCC__ +int cublas_gemm_ex(rocblas_handle handle, + rocblas_operation transa, + rocblas_operation transb, + int m, + int n, + int k, + const float* alpha, + const float* beta, + const __half* A, + const __half* B, + __half* C, + rocblas_gemm_algo algo) +#else int cublas_gemm_ex(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, @@ -57,7 +117,34 @@ int cublas_gemm_ex(cublasHandle_t handle, const __half* B, __half* C, cublasGemmAlgo_t algo) +#endif { +#ifdef __HIP_PLATFORM_HCC__ + rocblas_status status = rocblas_gemm_ex(handle, + transa, + transb, + m, + n, + k, + (const void*)alpha, + (const void*)A, + rocblas_datatype_f16_r , + (transa == rocblas_operation_none) ? m : k, + (const void*)B, + rocblas_datatype_f16_r, + (transb == rocblas_operation_none) ? k : n, + (const void*)beta, + (void*)C, + rocblas_datatype_f16_r, + m, + (void*)C, + rocblas_datatype_f16_r, + m, + rocblas_datatype_f32_r, + algo, + 0, + 0); +#else cublasStatus_t status = cublasGemmEx(handle, transa, transb, @@ -77,8 +164,13 @@ int cublas_gemm_ex(cublasHandle_t handle, m, CUDA_R_32F, algo); +#endif +#ifdef __HIP_PLATFORM_HCC__ + if (status != rocblas_status_success) { +#else if (status != CUBLAS_STATUS_SUCCESS) { +#endif fprintf(stderr, "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n", m, @@ -90,6 +182,24 @@ int cublas_gemm_ex(cublasHandle_t handle, return 0; } +#ifdef __HIP_PLATFORM_HCC__ +int cublas_strided_batched_gemm(rocblas_handle handle, + int m, + int n, + int k, + const float* alpha, + const float* beta, + const float* A, + const float* B, + float* C, + rocblas_operation op_A, + rocblas_operation op_B, + int stride_A, + int stride_B, + int stride_C, + int batch, + rocblas_gemm_algo algo) +#else int cublas_strided_batched_gemm(cublasHandle_t handle, int m, int n, @@ -106,7 +216,39 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, int stride_C, int batch, cublasGemmAlgo_t algo) +#endif { +#ifdef __HIP_PLATFORM_HCC__ + rocblas_status status = rocblas_gemm_strided_batched_ex(handle, + op_A, + op_B, + m, + n, + k, + alpha, + A, + rocblas_datatype_f32_r, + (op_A == rocblas_operation_none) ? m : k, + stride_A, + B, + rocblas_datatype_f32_r, + (op_B == rocblas_operation_none) ? k : n, + stride_B, + beta, + C, + rocblas_datatype_f32_r, + m, + stride_C, + C, + rocblas_datatype_f32_r, + m, + stride_C, + batch, + rocblas_datatype_f32_r, + algo, + 0, + 0); +#else cublasStatus_t status = cublasGemmStridedBatchedEx(handle, op_A, op_B, @@ -130,8 +272,13 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, batch, CUDA_R_32F, algo); +#endif +#ifdef __HIP_PLATFORM_HCC__ if (status != CUBLAS_STATUS_SUCCESS) { +#else + if (status != rocblas_status_success) { +#endif fprintf(stderr, "!!!! kernel execution error. 
(batch: %d, m: %d, n: %d, k: %d, error: %d) \n", batch, @@ -144,6 +291,24 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, return 0; } +#ifdef __HIP_PLATFORM_HCC__ +int cublas_strided_batched_gemm(rocblas_handle handle, + int m, + int n, + int k, + const float* alpha, + const float* beta, + const __half* A, + const __half* B, + __half* C, + rocblas_operation op_A, + rocblas_operation op_B, + int stride_A, + int stride_B, + int stride_C, + int batch, + rocblas_gemm_algo algo) +#else int cublas_strided_batched_gemm(cublasHandle_t handle, int m, int n, @@ -160,7 +325,39 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, int stride_C, int batch, cublasGemmAlgo_t algo) +#endif { +#ifdef __HIP_PLATFORM_HCC__ + rocblas_status status = rocblas_gemm_strided_batched_ex(handle, + op_A, + op_B, + m, + n, + k, + alpha, + A, + rocblas_datatype_f16_r, + (op_A == rocblas_operation_none) ? m : k, + stride_A, + B, + rocblas_datatype_f16_r, + (op_B == rocblas_operation_none) ? k : n, + stride_B, + beta, + C, + rocblas_datatype_f16_r, + m, + stride_C, + C, + rocblas_datatype_f16_r, + m, + stride_C, + batch, + rocblas_datatype_f32_r, + algo, + 0, + 0); +#else cublasStatus_t status = cublasGemmStridedBatchedEx(handle, op_A, op_B, @@ -184,8 +381,13 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, batch, CUDA_R_32F, algo); +#endif +#ifdef __HIP_PLATFORM_HCC__ + if (status != rocblas_status_success) { +#else if (status != CUBLAS_STATUS_SUCCESS) { +#endif fprintf(stderr, "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n", m, diff --git a/csrc/transformer/ds_transformer_cuda.cpp b/csrc/transformer/ds_transformer_cuda.cpp index e729e15a3912..bc9cf59ec902 100644 --- a/csrc/transformer/ds_transformer_cuda.cpp +++ b/csrc/transformer/ds_transformer_cuda.cpp @@ -140,7 +140,9 @@ BertTransformerLayer::~BertTransformerLayer() template void BertTransformerLayer::Initialize() { +#ifndef __HIP_PLATFORM_HCC__ if (std::is_same::value) cublasSetMathMode(_cublasHandle, CUBLAS_TENSOR_OP_MATH); +#endif } template diff --git a/csrc/transformer/gelu_kernels.cu b/csrc/transformer/gelu_kernels.cu index 12048006266e..dbb8828ce977 100644 --- a/csrc/transformer/gelu_kernels.cu +++ b/csrc/transformer/gelu_kernels.cu @@ -60,7 +60,7 @@ __global__ void gelu_kernel(const float* input, float* vals, int row_stride, int __global__ void gelu_kernel(const __half* input, __half* vals, int row_stride, int iterations) { -#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) int row = blockIdx.x; int id = threadIdx.x; int loop_stride = blockDim.x; @@ -131,7 +131,7 @@ __global__ void fused_bias_gelu(const __half* input, int row_stride, int iterations) { -#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) int row = blockIdx.x; int id = threadIdx.x; int loop_stride = blockDim.x; @@ -214,7 +214,7 @@ __global__ void d_gelu_func(__half* d_output, int row_stride, int iterations) { -#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) int row = blockIdx.x; int id = threadIdx.x; int loop_stride = blockDim.x; diff --git a/csrc/transformer/inference/csrc/dequantize.cu b/csrc/transformer/inference/csrc/dequantize.cu index ddf7a958822a..43d50f00c058 100644 --- a/csrc/transformer/inference/csrc/dequantize.cu +++ b/csrc/transformer/inference/csrc/dequantize.cu @@ -46,7 +46,7 @@ __global__ void dequantize_kernel(__half* output, unsigned groups, unsigned merge_count) { -#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ >= 700 || 
defined(__HIP_PLATFORM_HCC__) unsigned merge_hidden = hidden_dim >> merge_count; unsigned quantization_stride = (merge_hidden * output_size) / groups; diff --git a/csrc/transformer/inference/csrc/gelu.cu b/csrc/transformer/inference/csrc/gelu.cu index fc3faacc54e8..1737855f614d 100755 --- a/csrc/transformer/inference/csrc/gelu.cu +++ b/csrc/transformer/inference/csrc/gelu.cu @@ -39,7 +39,7 @@ __global__ void fused_bias_gelu(__half* input, int total_count, int intermediate_size) { -#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) float2* input_cast = reinterpret_cast(input); const float2* bias_cast = reinterpret_cast(bias); @@ -117,7 +117,7 @@ __global__ void fused_bias_add(float* input, const float* bias, int total_count, __global__ void fused_bias_add(__half* input, const __half* bias, int total_count, int hidden_size) { -#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) float2* input_cast = reinterpret_cast(input); const float2* bias_cast = reinterpret_cast(bias); @@ -195,7 +195,7 @@ __global__ void fused_bias_residual(__half* input, int total_count, int intermediate_size) { -#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) float2* input_cast = reinterpret_cast(input); const float2* residual_cast = reinterpret_cast(residual); diff --git a/csrc/transformer/inference/csrc/normalize.cu b/csrc/transformer/inference/csrc/normalize.cu index ecd73154f37f..dc0f6be01144 100755 --- a/csrc/transformer/inference/csrc/normalize.cu +++ b/csrc/transformer/inference/csrc/normalize.cu @@ -85,7 +85,7 @@ __global__ void fused_bias_residual_layer_norm(__half* output, float epsilon, int row_stride) { -#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) int iteration_stride = blockDim.x; int iterations = row_stride / iteration_stride; @@ -287,7 +287,7 @@ __global__ void fused_residual_layer_norm(__half* norm, int row_stride, bool preLN) { -#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) int iteration_stride = blockDim.x; cg::thread_block b = cg::this_thread_block(); diff --git a/csrc/transformer/inference/csrc/softmax.cu b/csrc/transformer/inference/csrc/softmax.cu index cee509965106..b347945df636 100644 --- a/csrc/transformer/inference/csrc/softmax.cu +++ b/csrc/transformer/inference/csrc/softmax.cu @@ -37,7 +37,7 @@ __global__ void attn_softmax_v2(__half* vals, int num_seq, float scale) { -#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile g = cg::tiled_partition(b); diff --git a/csrc/transformer/normalize_kernels.cu b/csrc/transformer/normalize_kernels.cu index 366e93724638..c9bc4a46ee5e 100644 --- a/csrc/transformer/normalize_kernels.cu +++ b/csrc/transformer/normalize_kernels.cu @@ -121,7 +121,7 @@ __global__ void fused_bias_residual_layer_norm(__half* vals, __half* means, int row_stride) { -#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) int iteration_stride = blockDim.x; int iterations = row_stride / iteration_stride; @@ -404,7 +404,7 @@ __global__ void fused_bias_residual_layer_norm(__half* vals, __half* vars, int row_stride) { -#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) int iteration_stride = blockDim.x; int iterations = row_stride / iteration_stride; diff --git a/csrc/transformer/softmax_kernels.cu b/csrc/transformer/softmax_kernels.cu index 
be776b0c074d..a4d84c37dd3b 100644 --- a/csrc/transformer/softmax_kernels.cu +++ b/csrc/transformer/softmax_kernels.cu @@ -142,7 +142,7 @@ __global__ void attn_softmax(__half* vals, int seq_length, int iterations) { -#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) __shared__ float partialSum[MAX_WARP_NUM]; int warp_num = blockDim.x >> 5; diff --git a/csrc/transformer/transform_kernels.cu b/csrc/transformer/transform_kernels.cu index 7d8a27eeeb43..b68d70f67ae1 100755 --- a/csrc/transformer/transform_kernels.cu +++ b/csrc/transformer/transform_kernels.cu @@ -96,7 +96,7 @@ __global__ void transform_0213<__half>(__half* output, int heads, int head_ext) { -#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) int d0_stride = hidden_dim * seq_length; int d1_stride = hidden_dim; @@ -219,7 +219,7 @@ __global__ void bias_add_transform_0213<__half>(__half* output, int heads, int head_ext) { -#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) int d0_stride = hidden_dim * seq_length; int d1_stride = hidden_dim; @@ -289,7 +289,7 @@ __global__ void bias_add_transform_0213_v2(__half* output, int seq_length, int heads) { -#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) __shared__ float4 in_data[3072]; int d0_stride = hidden_dim * seq_length; @@ -451,7 +451,7 @@ __global__ void transform4d_0213<__half>(__half* out, int hidden_dim, int head_ext) { -#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) int d0_stride = hidden_dim * (seq_length / head_ext); int d1_stride = hidden_dim; @@ -487,7 +487,7 @@ __global__ void transform4d_0213_v2(__half* out, int seq_length, int hidden_dim) { -#if __CUDA_ARCH__ >= 700 +#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) __shared__ float4 in_data[3072]; int d0_stride = hidden_dim * seq_length; diff --git a/deepspeed/env_report.py b/deepspeed/env_report.py index b14ac4464835..7166104fde95 100644 --- a/deepspeed/env_report.py +++ b/deepspeed/env_report.py @@ -85,6 +85,8 @@ def debug_report(): torch.__version__), ("torch cuda version", torch.version.cuda), + ("torch hip version", + torch.version.hip), ("nvcc version", nvcc_version()), ("deepspeed install path", @@ -93,7 +95,7 @@ def debug_report(): f"{deepspeed.__version__}, {deepspeed.__git_hash__}, {deepspeed.__git_branch__}" ), ("deepspeed wheel compiled w.", - f"torch {torch_info['version']}, cuda {torch_info['cuda_version']}"), + f"torch {torch_info['version']}, cuda {torch_info['cuda_version']}, hip {torch_info['hip_version']}"), ] print("DeepSpeed general environment info:") for name, value in report: diff --git a/deepspeed/git_version_info.py b/deepspeed/git_version_info.py index f04982c74f0d..a806475c397b 100644 --- a/deepspeed/git_version_info.py +++ b/deepspeed/git_version_info.py @@ -14,4 +14,4 @@ from .ops.op_builder import ALL_OPS installed_ops = dict.fromkeys(ALL_OPS.keys(), False) compatible_ops = dict.fromkeys(ALL_OPS.keys(), False) - torch_info = {'version': "0.0", "cuda_version": "0.0"} + torch_info = {'version': "0.0", "cuda_version": "0.0", "hip_version": "0.0"} diff --git a/deepspeed/ops/__init__.py b/deepspeed/ops/__init__.py index ea5bb7b046fa..dac5378a101c 100755 --- a/deepspeed/ops/__init__.py +++ b/deepspeed/ops/__init__.py @@ -1,6 +1,8 @@ from . import adam from . import lamb -from . 
import sparse_attention +from ..git_version_info_installed import installed_ops as __installed_ops__ +if __installed_ops__['sparse_attn']: + from . import sparse_attention from . import transformer from .transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm new file mode 100644 index 000000000000..95f9233ff21e --- /dev/null +++ b/docker/Dockerfile.rocm @@ -0,0 +1,183 @@ +FROM rocm/pytorch:latest + + +############################################################################## +# Temporary Installation Directory +############################################################################## +ENV STAGE_DIR=/tmp +RUN mkdir -p ${STAGE_DIR} + +############################################################################## +# Installation/Basic Utilities +############################################################################## +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + software-properties-common build-essential autotools-dev \ + nfs-common pdsh \ + cmake g++ gcc \ + curl wget vim tmux emacs less unzip \ + htop iftop iotop ca-certificates openssh-client openssh-server \ + rsync iputils-ping net-tools sudo \ + llvm-9-dev + +############################################################################## +# Installation Latest Git +############################################################################## +RUN add-apt-repository ppa:git-core/ppa -y && \ + apt-get update && \ + apt-get install -y git && \ + git --version + +############################################################################## +# Client Liveness & Uncomment Port 22 for SSH Daemon +############################################################################## +# Keep SSH client alive from server side +RUN echo "ClientAliveInterval 30" >> /etc/ssh/sshd_config +RUN cp /etc/ssh/sshd_config ${STAGE_DIR}/sshd_config && \ + sed "0,/^#Port 22/s//Port 22/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config + +############################################################################## +# Mellanox OFED +############################################################################## +#ENV MLNX_OFED_VERSION=4.6-1.0.1.1 +#RUN apt-get install -y libnuma-dev +#RUN cd ${STAGE_DIR} && \ +# wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64.tgz | tar xzf - && \ +# cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64 && \ +# ./mlnxofedinstall --user-space-only --without-fw-update --all -q && \ +# cd ${STAGE_DIR} && \ +# rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64* + +############################################################################## +# OPENMPI +############################################################################## +#ENV OPENMPI_BASEVERSION=4.0 +#ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.1 +#RUN cd ${STAGE_DIR} && \ +# wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \ +# cd openmpi-${OPENMPI_VERSION} && \ +# ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \ +# make -j"$(nproc)" install && \ +# ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \ +# # Sanity check: +# test -f /usr/local/mpi/bin/mpic++ && \ +# cd ${STAGE_DIR} && \ +# rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION} +#ENV PATH=/usr/local/mpi/bin:${PATH} \ +# 
LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH} +## Create a wrapper for OpenMPI to allow running as root by default +#RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \ +# echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \ +# echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \ +# chmod a+x /usr/local/mpi/bin/mpirun + +############################################################################## +# Python +############################################################################## +ENV DEBIAN_FRONTEND=noninteractive +ENV PYTHON_VERSION=3.6 +RUN apt-get install -y python3.6 python3.6-dev && \ + rm -f /usr/bin/python && \ + ln -s /usr/bin/python3.6 /usr/bin/python && \ + curl -O https://bootstrap.pypa.io/get-pip.py && \ + python get-pip.py && \ + rm get-pip.py && \ + pip install --upgrade pip && \ + # Print python an pip version + python -V && pip -V +RUN pip install pyyaml +RUN pip install ipython + +############################################################################## +# TensorFlow +############################################################################## +RUN pip install tensorflow-rocm + +############################################################################## +# Some Packages +############################################################################## +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + libsndfile-dev \ + libjpeg-dev \ + libpng-dev \ + screen +RUN pip install psutil \ + yappi \ + cffi \ + ipdb \ + pandas \ + matplotlib \ + py3nvml \ + pyarrow \ + graphviz \ + astor \ + boto3 \ + tqdm \ + sentencepiece \ + msgpack \ + requests \ + pandas \ + sphinx \ + sphinx_rtd_theme \ + scipy \ + numpy \ + sklearn \ + scikit-learn \ + mpi4py \ + h5py + +############################################################################## +## SSH daemon port inside container cannot conflict with host OS port +############################################################################### +ENV SSH_PORT=2222 +RUN cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \ + sed "0,/^#Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config + +############################################################################## +# PyTorch +############################################################################## +#ENV PYTORCH_VERSION=1.2.0 +#ENV TORCHVISION_VERSION=0.4.0 +#ENV TENSORBOARDX_VERSION=1.8 +#RUN pip install torch==${PYTORCH_VERSION} +#RUN pip install torchvision==${TORCHVISION_VERSION} +#RUN pip install tensorboardX==${TENSORBOARDX_VERSION} + +############################################################################## +# PyYAML build issue +# https://stackoverflow.com/a/53926898 +############################################################################## +RUN rm -rf /usr/lib/python3/dist-packages/yaml && \ + rm -rf /usr/lib/python3/dist-packages/PyYAML-* + +############################################################################## +## CuPy installation +############################################################################### +RUN git clone https://github.com/ROCmSoftwarePlatform/cupy ${STAGE_DIR}/cupy +RUN cd ${STAGE_DIR}/cupy && \ + git submodule update --init && \ + CUPY_INSTALL_USE_HIP=1 ROCM_HOME=/opt/rocm pip install -e . 
--no-cache-dir -vvvv +RUN rm -rf ${STAGE_DIR}/cupy + +############################################################################## +## Add deepspeed user +############################################################################### +# Add a deepspeed user with user id 8877 +#RUN useradd --create-home --uid 8877 deepspeed +#RUN useradd --create-home --uid 1000 --shell /bin/bash deepspeed +#RUN usermod -aG sudo deepspeed +#RUN echo "deepspeed ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers +# # Change to non-root privilege +#USER deepspeed + +############################################################################## +# DeepSpeed +############################################################################## +RUN git clone https://github.com/ROCmSoftwarePlatform/DeepSpeed.git ${STAGE_DIR}/DeepSpeed +RUN cd ${STAGE_DIR}/DeepSpeed && \ + git checkout . && \ + git checkout master && \ + DS_BUILD_FUSED_ADAM=1 DS_BUILD_FUSED_LAMB=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_TRANSFORMER=1 DS_BUILD_STOCHASTIC_TRANSFORMER=1 DS_BUILD_UTILS=1 ./install.sh --allow_sudo +RUN rm -rf ${STAGE_DIR}/DeepSpeed +RUN cd ~ && python -c "import deepspeed; print(deepspeed.__version__)" diff --git a/install.sh b/install.sh index 7c26883d6db0..a03455efa2e3 100755 --- a/install.sh +++ b/install.sh @@ -156,7 +156,7 @@ python setup.py $VERBOSE bdist_wheel if [ "$local_only" == "1" ]; then echo "Installing deepspeed" - $PIP_SUDO pip uninstall -y deepspeed +# $PIP_SUDO pip uninstall -y deepspeed $PIP_SUDO $PIP_INSTALL dist/deepspeed*.whl ds_report else @@ -171,7 +171,11 @@ else tmp_wheel_path="/tmp/deepspeed_wheels" pdsh -w $hosts "if [ -d $tmp_wheel_path ]; then rm $tmp_wheel_path/*; else mkdir -pv $tmp_wheel_path; fi" - pdcp -w $hosts requirements/requirements.txt ${tmp_wheel_path}/ + if [ -e "/opt/rocm" ]; then + pdcp -w $hosts requirements/requirements-rocm.txt ${tmp_wheel_path}/ + else + pdcp -w $hosts requirements/requirements.txt ${tmp_wheel_path}/ + fi echo "Installing deepspeed" pdsh -w $hosts "$PIP_SUDO pip uninstall -y deepspeed" diff --git a/op_builder/__init__.py b/op_builder/__init__.py index f19ed916c332..bf315489044e 100755 --- a/op_builder/__init__.py +++ b/op_builder/__init__.py @@ -9,7 +9,7 @@ from .stochastic_transformer import StochasticTransformerBuilder from .utils import UtilsBuilder from .async_io import AsyncIOBuilder -from .builder import get_default_compute_capatabilities +from .builder import get_default_compute_capatabilities, is_rocm_pytorch from .transformer_inference import InferenceBuilder from .quantizer import QuantizerBuilder diff --git a/op_builder/builder.py b/op_builder/builder.py index e061072a88b9..f2a96534d815 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -17,6 +17,10 @@ DEFAULT_TORCH_EXTENSION_PATH = "/tmp/torch_extensions" DEFAULT_COMPUTE_CAPABILITIES = "6.0;6.1;7.0" +is_rocm_pytorch = False +if torch.__version__ >= '1.5': + from torch.utils.cpp_extension import ROCM_HOME + is_rocm_pytorch = True if ((torch.version.hip is not None) and (ROCM_HOME is not None)) else False def installed_cuda_version(): import torch.utils.cpp_extension @@ -66,17 +70,31 @@ def assert_no_cuda_mismatch(): def assert_torch_info(torch_info): install_torch_version = torch_info['version'] install_cuda_version = torch_info['cuda_version'] + install_hip_version = torch_info['hip_version'] + + if not is_rocm_pytorch: + current_cuda_version = ".".join(torch.version.cuda.split('.')[:2]) + else: + current_hip_version = ".".join(torch.version.hip.split('.')[:2]) - current_cuda_version = 
".".join(torch.version.cuda.split('.')[:2]) current_torch_version = ".".join(torch.__version__.split('.')[:2]) - if install_cuda_version != current_cuda_version or install_torch_version != current_torch_version: - raise RuntimeError( - "PyTorch and CUDA version mismatch! DeepSpeed ops were compiled and installed " - "with a different version than what is being used at runtime. Please re-install " - f"DeepSpeed or switch torch versions. DeepSpeed install versions: " - f"torch={install_torch_version}, cuda={install_cuda_version}, runtime versions:" - f"torch={current_torch_version}, cuda={current_cuda_version}") + if not is_rocm_pytorch: + if install_cuda_version != current_cuda_version or install_torch_version != current_torch_version: + raise RuntimeError( + "PyTorch and CUDA version mismatch! DeepSpeed ops were compiled and installed " + "with a different version than what is being used at runtime. Please re-install " + f"DeepSpeed or switch torch versions. DeepSpeed install versions: " + f"torch={install_torch_version}, cuda={install_cuda_version}, runtime versions:" + f"torch={current_torch_version}, cuda={current_cuda_version}") + else: + if install_hip_version != current_hip_version or install_torch_version != current_torch_version: + raise RuntimeError( + "PyTorch and HIP version mismatch! DeepSpeed ops were compiled and installed " + "with a different version than what is being used at runtime. Please re-install " + f"DeepSpeed or switch torch versions. DeepSpeed install versions: " + f"torch={install_torch_version}, hip={install_hip_version}, runtime versions:" + f"torch={current_torch_version}, hip={current_hip_version}") class OpBuilder(ABC): @@ -227,7 +245,7 @@ def jit_load(self, verbose=True): f"Unable to JIT load the {self.name} op due to ninja not being installed." 
) - if isinstance(self, CUDAOpBuilder): + if isinstance(self, CUDAOpBuilder) and not is_rocm_pytorch: assert_no_cuda_mismatch() self.jit_mode = True @@ -241,9 +259,10 @@ def jit_load(self, verbose=True): os.makedirs(ext_path, exist_ok=True) start_build = time.time() + sources = [self.deepspeed_src_path(path) for path in self.sources()] op_module = load( name=self.name, - sources=[self.deepspeed_src_path(path) for path in self.sources()], + sources=sources, extra_include_paths=[ self.deepspeed_src_path(path) for path in self.include_paths() ], @@ -331,7 +350,8 @@ def is_compatible(self): def builder(self): from torch.utils.cpp_extension import CUDAExtension - assert_no_cuda_mismatch() + if not is_rocm_pytorch: + assert_no_cuda_mismatch() return CUDAExtension(name=self.absolute_name(), sources=self.sources(), include_dirs=self.include_paths(), diff --git a/op_builder/cpu_adam.py b/op_builder/cpu_adam.py index 75fa042613fc..d5a8216d2095 100644 --- a/op_builder/cpu_adam.py +++ b/op_builder/cpu_adam.py @@ -5,7 +5,7 @@ import sys import torch import subprocess -from .builder import CUDAOpBuilder +from .builder import CUDAOpBuilder, is_rocm_pytorch class CPUAdamBuilder(CUDAOpBuilder): @@ -26,8 +26,15 @@ def sources(self): return ['csrc/adam/cpu_adam.cpp', 'csrc/adam/custom_cuda_kernel.cu'] def include_paths(self): - CUDA_INCLUDE = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include") - return ['csrc/includes', CUDA_INCLUDE] + if not is_rocm_pytorch: + CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")] + else: + CUDA_INCLUDE = [ + os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include"), + os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include", "rocrand"), + os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include", "hiprand"), + ] + return ['csrc/includes'] + CUDA_INCLUDE def simd_width(self): if not self.command_exists('lscpu'): @@ -40,14 +47,17 @@ def simd_width(self): result = subprocess.check_output('lscpu', shell=True) result = result.decode('utf-8').strip().lower() if 'genuineintel' in result: - if 'avx512' in result: + if not is_rocm_pytorch and 'avx512' in result: return '-D__AVX512__' elif 'avx2' in result: return '-D__AVX256__' return '-D__SCALAR__' def cxx_args(self): - CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64") + if not is_rocm_pytorch: + CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64") + else: + CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.ROCM_HOME, "lib") SIMD_WIDTH = self.simd_width() return [ @@ -66,11 +76,20 @@ def cxx_args(self): def nvcc_args(self): args = [ '-O3', - '--use_fast_math', - '-std=c++14', - '-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__', - '-U__CUDA_NO_HALF2_OPERATORS__' + '-std=c++14' ] - args += self.compute_capability_args() + if is_rocm_pytorch: + args += [ + '-U__HIP_NO_HALF_OPERATORS__', + '-U__HIP_NO_HALF_CONVERSIONS__', + '-U__HIP_NO_HALF2_OPERATORS__' + ] + else: + args += [ + '--use_fast_math', + '-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__', + '-U__CUDA_NO_HALF2_OPERATORS__' + ] + args += self.compute_capability_args() return args diff --git a/op_builder/fused_adam.py b/op_builder/fused_adam.py index 0340ed02a8fb..6fa240ac3a3f 100644 --- a/op_builder/fused_adam.py +++ b/op_builder/fused_adam.py @@ -2,7 +2,7 @@ Copyright 2020 The Microsoft DeepSpeed Team """ import torch -from .builder import CUDAOpBuilder +from .builder import CUDAOpBuilder, is_rocm_pytorch class FusedAdamBuilder(CUDAOpBuilder): @@ -19,14 
+19,14 @@ def sources(self): return ['csrc/adam/fused_adam_frontend.cpp', 'csrc/adam/multi_tensor_adam.cu'] def include_paths(self): - return ['csrc/includes'] + return ['csrc/includes', 'csrc/adam'] def cxx_args(self): args = super().cxx_args() return args + self.version_dependent_macros() def nvcc_args(self): - return ['-lineinfo', - '-O3', - '--use_fast_math' - ] + self.version_dependent_macros() + self.compute_capability_args() + nvcc_flags=['-O3'] + self.version_dependent_macros() + if not is_rocm_pytorch: + nvcc_flags.extend(['-lineinfo', '--use_fast_math'] + self.compute_capability_args()) + return nvcc_flags diff --git a/op_builder/fused_lamb.py b/op_builder/fused_lamb.py index 4c73ecb404e3..cb60412baea7 100644 --- a/op_builder/fused_lamb.py +++ b/op_builder/fused_lamb.py @@ -2,7 +2,7 @@ Copyright 2020 The Microsoft DeepSpeed Team """ import torch -from .builder import CUDAOpBuilder +from .builder import CUDAOpBuilder, is_rocm_pytorch class FusedLambBuilder(CUDAOpBuilder): @@ -26,7 +26,7 @@ def cxx_args(self): return args + self.version_dependent_macros() def nvcc_args(self): - return ['-lineinfo', - '-O3', - '--use_fast_math' - ] + self.version_dependent_macros() + self.compute_capability_args() + nvcc_flags=['-O3'] + self.version_dependent_macros() + if not is_rocm_pytorch: + nvcc_flags.extend(['-lineinfo', '--use_fast_math'] + self.compute_capability_args()) + return nvcc_flags diff --git a/op_builder/transformer.py b/op_builder/transformer.py index a94c6da62e39..606d0be255ef 100644 --- a/op_builder/transformer.py +++ b/op_builder/transformer.py @@ -2,7 +2,7 @@ Copyright 2020 The Microsoft DeepSpeed Team """ import torch -from .builder import CUDAOpBuilder +from .builder import CUDAOpBuilder, is_rocm_pytorch class TransformerBuilder(CUDAOpBuilder): @@ -29,16 +29,29 @@ def sources(self): ] def include_paths(self): - return ['csrc/includes'] + includes = ['csrc/includes'] + if is_rocm_pytorch: + from torch.utils.cpp_extension import ROCM_HOME + includes += ['{}/hiprand/include'.format(ROCM_HOME), '{}/rocrand/include'.format(ROCM_HOME)] + return includes def nvcc_args(self): args = [ '-O3', - '--use_fast_math', '-std=c++14', - '-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__', - '-U__CUDA_NO_HALF2_OPERATORS__' ] - - return args + self.compute_capability_args() + if is_rocm_pytorch: + args += [ + '-U__HIP_NO_HALF_OPERATORS__', + '-U__HIP_NO_HALF_CONVERSIONS__', + '-U__HIP_NO_HALF2_OPERATORS__' + ] + else: + args += [ + '--use_fast_math', + '-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__', + '-U__CUDA_NO_HALF2_OPERATORS__' + ] + args += self.compute_capability_args() + return args diff --git a/requirements/requirements-rocm.txt b/requirements/requirements-rocm.txt new file mode 100644 index 000000000000..e0e1d0c482ce --- /dev/null +++ b/requirements/requirements-rocm.txt @@ -0,0 +1,7 @@ +#torch>=1.2 +#torchvision>=0.4.0 +tqdm +tensorboardX==1.8 +ninja +numpy +psutil diff --git a/setup.py b/setup.py index 654b983eec81..27374a7f748b 100755 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ raise ImportError('Unable to import torch, please visit https://pytorch.org/ ' 'to see how to properly install torch on your system.') -from op_builder import ALL_OPS, get_default_compute_capatabilities +from op_builder import ALL_OPS, get_default_compute_capatabilities, is_rocm_pytorch def fetch_requirements(path): @@ -38,6 +38,9 @@ def fetch_requirements(path): install_requires = fetch_requirements('requirements/requirements.txt') +if is_rocm_pytorch: + print("NOTE: 
Please manually install torch and torchvision packages for ROCm") + install_requires = fetch_requirements('requirements/requirements-rocm.txt') extras_require = { '1bit_adam': fetch_requirements('requirements/requirements-1bit-adam.txt'), 'readthedocs': fetch_requirements('requirements/requirements-readthedocs.txt'), @@ -47,7 +50,10 @@ def fetch_requirements(path): # If MPI is available add 1bit-adam requirements if torch.cuda.is_available(): if shutil.which('ompi_info') or shutil.which('mpiname'): - cupy = f"cupy-cuda{torch.version.cuda.replace('.','')[:3]}" + if is_rocm_pytorch: + cupy = "cupy" + else: + cupy = f"cupy-cuda{torch.version.cuda.replace('.','')[:3]}" extras_require['1bit_adam'].append(cupy) # Make an [all] extra that installs all needed dependencies @@ -170,9 +176,13 @@ def create_dir_symlink(src, dest): torch_version = ".".join([TORCH_MAJOR, TORCH_MINOR]) # Set cuda_version to 0.0 if cpu-only cuda_version = "0.0" +# Set hip_version to 0.0 if cpu-only +hip_version = "0.0" if torch.version.cuda is not None: cuda_version = ".".join(torch.version.cuda.split('.')[:2]) -torch_info = {"version": torch_version, "cuda_version": cuda_version} +if torch.version.hip is not None: + hip_version = ".".join(torch.version.hip.split('.')[:2]) +torch_info = {"version": torch_version, "cuda_version": cuda_version, "hip_version": hip_version} print(f"version={version_str}, git_hash={git_hash}, git_branch={git_branch}") with open('deepspeed/git_version_info_installed.py', 'w') as fd: @@ -206,7 +216,7 @@ def create_dir_symlink(src, dest): extras_require=extras_require, packages=find_packages(exclude=["docker", "third_party"]), - include_package_data=True, +# include_package_data=True, #FIXME scripts=[ 'bin/deepspeed', 'bin/deepspeed.pt', diff --git a/tests/unit/common.py b/tests/unit/common.py index f92b1058aa92..316fcf227232 100644 --- a/tests/unit/common.py +++ b/tests/unit/common.py @@ -8,10 +8,24 @@ import deepspeed import pytest +from functools import wraps +import unittest # Worker timeout *after* the first worker has completed. DEEPSPEED_UNIT_WORKER_TIMEOUT = 120 +TEST_WITH_ROCM = os.getenv('DEEPSPEED_TEST_WITH_ROCM', '0') == '1' + +def skipIfRocm(reason="test doesn't currently work on the ROCm stack"): + def decorator(fn): + @wraps(fn) + def wrapper(*args, **kwargs): + if TEST_WITH_ROCM: + raise unittest.SkipTest(reason) + else: + fn(*args, **kwargs) + return wrapper + return decorator def distributed_test(world_size=2, backend='nccl'): """A decorator for executing a function (e.g., a unit test) in a distributed manner. 
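
The skipIfRocm decorator added to tests/unit/common.py is gated purely by the DEEPSPEED_TEST_WITH_ROCM environment variable rather than by hardware detection, so a ROCm CI job must opt in explicitly. A usage sketch; the pytest selection below is only illustrative:

    # Enable the ROCm skip list before running the unit tests.
    export DEEPSPEED_TEST_WITH_ROCM=1
    pytest tests/unit/test_config.py tests/unit/test_onebit.py

    # Without the variable the decorator is a no-op and the wrapped tests run normally.
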
diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py index 01004a0fa867..bf207fcb80b2 100755 --- a/tests/unit/test_config.py +++ b/tests/unit/test_config.py @@ -3,7 +3,7 @@ import pytest import json import argparse -from common import distributed_test +from common import distributed_test, skipIfRocm from simple_model import SimpleModel, create_config_from_dict, random_dataloader import torch.distributed as dist @@ -56,6 +56,7 @@ def _batch_assert(status, ds_config, batch, micro_batch, gas, success): (2,32,8,2,True), (2,33,17,2,False), (2,32,18,1,False)]) # yapf: disable +@skipIfRocm() def test_batch_config(num_ranks, batch, micro_batch, gas, success): @distributed_test(world_size=2) def _test_batch_config(num_ranks, batch, micro_batch, gas, success): diff --git a/tests/unit/test_cuda_backward.py b/tests/unit/test_cuda_backward.py index 62ccdbdc68d9..fbc70447a39d 100755 --- a/tests/unit/test_cuda_backward.py +++ b/tests/unit/test_cuda_backward.py @@ -8,6 +8,7 @@ import time import copy from torch import nn +from common import skipIfRocm from modelingpreln import BertEncoder as BertEncoderPreln from modeling import BertEncoder as BertEncoderPostln from modeling import BertConfig, BertLayerNorm @@ -268,6 +269,7 @@ def run_backward(ds_config, seq_len, atol=1e-2, verbose=False): #(3,128,51,2,24,False,False, 0.1), #(3,128,54,2,24,False,True, 0.2), ]) # yapf: disable +@skipIfRocm() def test_backward(batch_size, hidden_size, seq_len, diff --git a/tests/unit/test_dist.py b/tests/unit/test_dist.py index 04b97031b3e5..18a74b0a16fd 100644 --- a/tests/unit/test_dist.py +++ b/tests/unit/test_dist.py @@ -5,7 +5,6 @@ import pytest - @distributed_test(world_size=3) def test_init(): assert dist.is_initialized() diff --git a/tests/unit/test_onebit.py b/tests/unit/test_onebit.py index 9796a70953f8..919042d04c7c 100644 --- a/tests/unit/test_onebit.py +++ b/tests/unit/test_onebit.py @@ -14,7 +14,7 @@ from deepspeed.runtime.pipe.topology import PipeDataParallelTopology, PipeModelDataParallelTopology PipeTopo = PipeDataParallelTopology from deepspeed.runtime.pipe.module import PipelineModule, LayerSpec -from common import distributed_test +from common import distributed_test, skipIfRocm from simple_model import SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict, create_deepspeed_args from test_pipe import AlexNetPipe, train_cifar @@ -25,6 +25,7 @@ allow_module_level=True) +@skipIfRocm("Skipped for now as cupy is not available on ROCm") def test_onebitadam_fp16_basic(tmpdir): config_dict = { "train_batch_size": 2, @@ -68,6 +69,7 @@ def _test_onebitadam_fp16_basic(args, model, hidden_dim): _test_onebitadam_fp16_basic(args=args, model=model, hidden_dim=hidden_dim) +@skipIfRocm("Skipped for now as cupy is not available on ROCm") def test_onebitadam_fp32_basic(tmpdir): config_dict = { "train_batch_size": 2, @@ -107,6 +109,7 @@ def _test_onebitadam_fp32_basic(args, model, hidden_dim): _test_onebitadam_fp32_basic(args=args, model=model, hidden_dim=hidden_dim) +@skipIfRocm("Skipped for now as cupy is not available on ROCm") def test_onebitadam_exp_avg_mask(tmpdir): config_dict = { "train_batch_size": 2, @@ -168,6 +171,7 @@ def _test_onebitadam_exp_avg_mask(args, model, hidden_dim): _test_onebitadam_exp_avg_mask(args=args, model=model, hidden_dim=hidden_dim) +@skipIfRocm("Skipped for now as cupy is not available on ROCm") def test_onebitadam_checkpointing(tmpdir): config_dict = { "train_batch_size": 2, @@ -849,6 +853,7 @@ def _helper(topo, tmpdir, steps=500): _helper(topo, tmpdir) 
+@skipIfRocm("Skipped for now as cupy is not available on ROCm") def test_compressed_allreduce_basic(tmpdir): @distributed_test(world_size=[1, 2]) def _test_compressed_allreduce_basic(): diff --git a/tests/unit/test_pipe_module.py b/tests/unit/test_pipe_module.py index a29d22a2a954..af87546c4ee3 100644 --- a/tests/unit/test_pipe_module.py +++ b/tests/unit/test_pipe_module.py @@ -14,7 +14,7 @@ from deepspeed.pipe import PipelineModule, LayerSpec from deepspeed.utils import RepeatingLoader -from common import distributed_test +from common import distributed_test, skipIfRocm from simple_model import args_from_dict HIDDEN_DIM = 32 @@ -56,6 +56,7 @@ def simple_args(tmpdir): return args +@skipIfRocm() def test_pipe_module_sequential(sequential_model, simple_args): batch_input = torch.randn(1, HIDDEN_DIM) diff --git a/tests/unit/test_topology.py b/tests/unit/test_topology.py index 176363688de4..e907af06427c 100644 --- a/tests/unit/test_topology.py +++ b/tests/unit/test_topology.py @@ -7,7 +7,7 @@ from deepspeed.runtime.pipe.topology import ProcessTopology as Topo from deepspeed.runtime.pipe.topology import _prime_factors -from common import distributed_test +from common import distributed_test, skipIfRocm def test_topology_2d(): @@ -157,6 +157,7 @@ def test_topology_comm_list(): assert topo.get_axis_comm_lists('jeff') == [] +@skipIfRocm() @distributed_test(world_size=4) def test_grid_pipe_data(): topo = Topo(axes=['pipe', 'data'], dims=[2, 2]) @@ -182,7 +183,6 @@ def test_grid_pipe_data(): data_group = grid.dp_group assert torch.all(rank_tensor == sum(data_group)) - @distributed_test(world_size=4) def test_stage_to_global(): topo = Topo(axes=['pipe', 'data'], dims=[2, 2])