diff --git a/apex/contrib/csrc/layer_norm/hip_bfloat162.h b/apex/contrib/csrc/layer_norm/hip_bfloat162.h
new file mode 100644
index 000000000..2d1d79f8b
--- /dev/null
+++ b/apex/contrib/csrc/layer_norm/hip_bfloat162.h
@@ -0,0 +1,287 @@
+/**
+ * MIT License
+ *
+ * Copyright (c) 2019 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*!\file
+ * \brief hip_bfloat162.h provides struct for hip_bfloat162 typedef
+ */
+
+#ifndef _hip_bfloat162_H_
+#define _hip_bfloat162_H_
+
+#if __cplusplus < 201103L || !defined(__HIPCC__)
+
+// If this is a C compiler, C++ compiler below C++11, or a host-only compiler, we only
+// include a minimal definition of hip_bfloat162
+
+#include <stdint.h>
+/*! \brief Struct to represent a 16 bit brain floating point number. */
+typedef struct
+{
+    uint16_t data;
+    uint16_t data2;
+} hip_bfloat162;
+
+#else // __cplusplus < 201103L || !defined(__HIPCC__)
+
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <hip/hip_runtime.h>
+#include <hip/hip_bfloat16.h>
+#include <ostream>
+#include <type_traits>
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wshadow"
+struct hip_bfloat162
+{
+    uint16_t data;
+    uint16_t data2;
+
+    enum truncate_t
+    {
+        truncate
+    };
+
+    __host__ __device__ hip_bfloat162() = default;
+
+    // round upper 16 bits of IEEE float to convert to bfloat16
+    explicit __host__ __device__ hip_bfloat162(float f)
+        : data(float_to_bfloat16(f))
+    {
+    }
+
+    explicit __host__ __device__ hip_bfloat162(float f, truncate_t)
+        : data(truncate_float_to_bfloat16(f))
+    {
+    }
+
+    // zero extend lower 16 bits of bfloat16 to convert to IEEE float
+    __host__ __device__ operator float() const
+    {
+        union
+        {
+            uint32_t int32;
+            float    fp32;
+        } u = {uint32_t(data) << 16};
+        return u.fp32;
+    }
+
+    static  __host__ __device__ hip_bfloat162 round_to_bfloat16(float f)
+    {
+        hip_bfloat162 output;
+        output.data = float_to_bfloat16(f);
+        return output;
+    }
+
+    static  __host__ __device__ hip_bfloat162 round_to_bfloat16(float f, truncate_t)
+    {
+        hip_bfloat162 output;
+        output.data = truncate_float_to_bfloat16(f);
+        return output;
+    }
+
+private:
+    static __host__ __device__ uint16_t float_to_bfloat16(float f)
+    {
+        union
+        {
+            float    fp32;
+            uint32_t int32;
+        } u = {f};
+        if(~u.int32 & 0x7f800000)
+        {
+            // When the exponent bits are not all 1s, then the value is zero, normal,
+            // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
+            // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd).
+            // This causes the bfloat16's mantissa to be incremented by 1 if the 16
+            // least significant bits of the float mantissa are greater than 0x8000,
+            // or if they are equal to 0x8000 and the least significant bit of the
+            // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when
+            // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already
+            // has the value 0x7f, then incrementing it causes it to become 0x00 and
+            // the exponent is incremented by one, which is the next higher FP value
+            // to the unrounded bfloat16 value. When the bfloat16 value is subnormal
+            // with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up
+            // to a normal value with an exponent of 0x01 and a mantissa of 0x00.
+            // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F,
+            // incrementing it causes it to become an exponent of 0xFF and a mantissa
+            // of 0x00, which is Inf, the next higher value to the unrounded value.
+            u.int32 += 0x7fff + ((u.int32 >> 16) & 1); // Round to nearest, round to even
+        }
+        else if(u.int32 & 0xffff)
+        {
+            // When all of the exponent bits are 1, the value is Inf or NaN.
+            // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
+            // mantissa bit. Quiet NaN is indicated by the most significant mantissa
+            // bit being 1. Signaling NaN is indicated by the most significant
+            // mantissa bit being 0 but some other bit(s) being 1. If any of the
+            // lower 16 bits of the mantissa are 1, we set the least significant bit
+            // of the bfloat16 mantissa, in order to preserve signaling NaN in case
+            // the bloat16's mantissa bits are all 0.
+            u.int32 |= 0x10000; // Preserve signaling NaN
+        }
+        return uint16_t(u.int32 >> 16);
+    }
+
+    // Truncate instead of rounding, preserving SNaN
+    static __host__ __device__ uint16_t truncate_float_to_bfloat16(float f)
+    {
+        union
+        {
+            float    fp32;
+            uint32_t int32;
+        } u = {f};
+        return uint16_t(u.int32 >> 16) | (!(~u.int32 & 0x7f800000) && (u.int32 & 0xffff));
+    }
+};
+#pragma clang diagnostic pop
+
+typedef struct
+{
+    uint16_t data;
+    uint16_t data2;
+} hip_bfloat162_public;
+
+static_assert(std::is_standard_layout<hip_bfloat162>{},
+              "hip_bfloat162 is not a standard layout type, and thus is "
+              "incompatible with C.");
+
+static_assert(std::is_trivial<hip_bfloat162>{},
+              "hip_bfloat162 is not a trivial type, and thus is "
+              "incompatible with C.");
+
+static_assert(sizeof(hip_bfloat162) == sizeof(hip_bfloat162_public)
+                  && offsetof(hip_bfloat162, data) == offsetof(hip_bfloat162_public, data),
+              "internal hip_bfloat162 does not match public hip_bfloat162");
+
+inline std::ostream& operator<<(std::ostream& os, const hip_bfloat162& bf16)
+{
+    return os << float(bf16);
+}
+inline __host__ __device__ hip_bfloat162 operator+(hip_bfloat162 a)
+{
+    return a;
+}
+inline __host__ __device__ hip_bfloat162 operator-(hip_bfloat162 a)
+{
+    a.data ^= 0x8000;
+    return a;
+}
+inline __host__ __device__ hip_bfloat162 operator+(hip_bfloat162 a, hip_bfloat162 b)
+{
+    return hip_bfloat162(float(a) + float(b));
+}
+inline __host__ __device__ hip_bfloat162 operator-(hip_bfloat162 a, hip_bfloat162 b)
+{
+    return hip_bfloat162(float(a) - float(b));
+}
+inline __host__ __device__ hip_bfloat162 operator*(hip_bfloat162 a, hip_bfloat162 b)
+{
+    return hip_bfloat162(float(a) * float(b));
+}
+inline __host__ __device__ hip_bfloat162 operator/(hip_bfloat162 a, hip_bfloat162 b)
+{
+    return hip_bfloat162(float(a) / float(b));
+}
+inline __host__ __device__ bool operator<(hip_bfloat162 a, hip_bfloat162 b)
+{
+    return float(a) < float(b);
+}
+inline __host__ __device__ bool operator==(hip_bfloat162 a, hip_bfloat162 b)
+{
+    return float(a) == float(b);
+}
+inline __host__ __device__ bool operator>(hip_bfloat162 a, hip_bfloat162 b)
+{
+    return b < a;
+}
+inline __host__ __device__ bool operator<=(hip_bfloat162 a, hip_bfloat162 b)
+{
+    return !(a > b);
+}
+inline __host__ __device__ bool operator!=(hip_bfloat162 a, hip_bfloat162 b)
+{
+    return !(a == b);
+}
+inline __host__ __device__ bool operator>=(hip_bfloat162 a, hip_bfloat162 b)
+{
+    return !(a < b);
+}
+inline __host__ __device__ hip_bfloat162& operator+=(hip_bfloat162& a, hip_bfloat162 b)
+{
+    return a = a + b;
+}
+inline __host__ __device__ hip_bfloat162& operator-=(hip_bfloat162& a, hip_bfloat162 b)
+{
+    return a = a - b;
+}
+inline __host__ __device__ hip_bfloat162& operator*=(hip_bfloat162& a, hip_bfloat162 b)
+{
+    return a = a * b;
+}
+inline __host__ __device__ hip_bfloat162& operator/=(hip_bfloat162& a, hip_bfloat162 b)
+{
+    return a = a / b;
+}
+inline __host__ __device__ hip_bfloat162& operator++(hip_bfloat162& a)
+{
+    return a += hip_bfloat162(1.0f);
+}
+inline __host__ __device__ hip_bfloat162& operator--(hip_bfloat162& a)
+{
+    return a -= hip_bfloat162(1.0f);
+}
+inline __host__ __device__ hip_bfloat162 operator++(hip_bfloat162& a, int)
+{
+    hip_bfloat162 orig = a;
+    ++a;
+    return orig;
+}
+inline __host__ __device__ hip_bfloat162 operator--(hip_bfloat162& a, int)
+{
+    hip_bfloat162 orig = a;
+    --a;
+    return orig;
+}
+
+namespace std
+{
+    constexpr __host__ __device__ bool isinf(hip_bfloat162 a)
+    {
+        return !(~a.data & 0x7f80) && !(a.data & 0x7f);
+    }
+    constexpr __host__ __device__ bool isnan(hip_bfloat162 a)
+    {
+        return !(~a.data & 0x7f80) && +(a.data & 0x7f);
+    }
+    constexpr __host__ __device__ bool iszero(hip_bfloat162 a)
+    {
+        return !(a.data & 0x7fff);
+    }
+}
+
+#endif // __cplusplus < 201103L || !defined(__HIPCC__)
+
+#endif // _hip_bfloat162_H_
diff --git a/apex/contrib/csrc/layer_norm/ln.h b/apex/contrib/csrc/layer_norm/ln.h
index 07392a192..415a9acd4 100644
--- a/apex/contrib/csrc/layer_norm/ln.h
+++ b/apex/contrib/csrc/layer_norm/ln.h
@@ -1,8 +1,13 @@
 #pragma once
 
+#include <functional>
 #include <unordered_map>
 #include <cuda_fp16.h>
+#ifdef USE_ROCM
+#include <hip/hip_bfloat16.h>
+#else
 #include <cuda_bf16.h>
+#endif
 
 namespace layer_norm {
 
@@ -121,7 +126,11 @@ extern BwdRegistry BWD_FUNCS;
 
 using fp32 = float;
 using fp16 = half;
+#ifdef USE_ROCM
+using bf16 = hip_bfloat16;
+#else
 using bf16 = nv_bfloat16;
+#endif
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
diff --git a/apex/contrib/csrc/layer_norm/ln_api.cpp b/apex/contrib/csrc/layer_norm/ln_api.cpp
index 30e4a5fec..90c2245b5 100644
--- a/apex/contrib/csrc/layer_norm/ln_api.cpp
+++ b/apex/contrib/csrc/layer_norm/ln_api.cpp
@@ -89,9 +89,9 @@ std::vector<at::Tensor> ln_fwd(const at::Tensor &x,      // BxSxhidden_size
 
     TORCH_CHECK(beta.scalar_type() == wtype);
 
-    TORCH_CHECK(x.is_cuda())
-    TORCH_CHECK(gamma.is_cuda())
-    TORCH_CHECK(beta.is_cuda())
+    //TORCH_CHECK(x.is_cuda())
+    //TORCH_CHECK(gamma.is_cuda())
+    //TORCH_CHECK(beta.is_cuda())
 
     TORCH_CHECK(x.is_contiguous());
     auto sizes = x.sizes();
@@ -170,11 +170,11 @@ std::vector<at::Tensor> ln_bwd(const at::Tensor &dz,     // BxSxhidden_size
     TORCH_CHECK(mu.dtype() == ctype);
     TORCH_CHECK(rsigma.dtype() == ctype);
 
-    TORCH_CHECK(x.is_cuda());
-    TORCH_CHECK(dz.is_cuda());
-    TORCH_CHECK(mu.is_cuda());
-    TORCH_CHECK(rsigma.is_cuda());
-    TORCH_CHECK(gamma.is_cuda());
+    //TORCH_CHECK(x.is_cuda());
+    //TORCH_CHECK(dz.is_cuda());
+    //TORCH_CHECK(mu.is_cuda());
+    //TORCH_CHECK(rsigma.is_cuda());
+    //TORCH_CHECK(gamma.is_cuda());
 
     TORCH_CHECK(x.is_contiguous());
     TORCH_CHECK(dz.is_contiguous());
diff --git a/apex/contrib/csrc/layer_norm/ln_bwd_kernels.cuh b/apex/contrib/csrc/layer_norm/ln_bwd_kernels.cuh
index 8595f5ed4..9a35ce4e0 100644
--- a/apex/contrib/csrc/layer_norm/ln_bwd_kernels.cuh
+++ b/apex/contrib/csrc/layer_norm/ln_bwd_kernels.cuh
@@ -18,6 +18,7 @@ void ln_bwd_kernel(layer_norm::BwdParams params) {
     enum { CTAS_PER_ROW = Ktraits::CTAS_PER_ROW };
 
     using compute_t = typename Ktraits::compute_t;
+    using input_t = typename Ktraits::input_t;
     using index_t = typename Ktraits::index_t;
     using Ivec = typename Ktraits::Ivec;
     using Ovec = typename Ktraits::Ovec;
@@ -40,7 +41,7 @@ void ln_bwd_kernel(layer_norm::BwdParams params) {
     const index_t r = bidm * Ktraits::ROWS_PER_CTA + warp_m;
     const index_t c = bidn * THREADS_PER_ROW + warp_n * THREADS_PER_WARP + lane;
 
-    static_assert(COLS == THREADS_PER_ROW * LDGS * NUM_ELTS * CTAS_PER_ROW);
+    static_assert(COLS == THREADS_PER_ROW * LDGS * NUM_ELTS * CTAS_PER_ROW, "LDGS needs to be a whole number. See ln_kernel_traits.h and update kernel parameters in REGISTER_BWD_LAUNCHER.");
 
     Cvec dzy_sum[LDGS];
     Cvec dz_sum[LDGS];
@@ -119,7 +120,7 @@ void ln_bwd_kernel(layer_norm::BwdParams params) {
                 compute_t dy_tmp = dy[it * NUM_ELTS + jt];
                 compute_t y_tmp = y[it * NUM_ELTS + jt];
                 compute_t dx_tmp = rs_r * (dy_tmp - (mdyy_local * y_tmp + mdy_local));
-                dx[it].data.elt[jt] = dx_tmp;
+                dx[it].data.elt[jt] = input_t(dx_tmp);
             }
             dx[it].store_to(params.dx, idx);
             idx += Ktraits::VEC_COLS_PER_LDG;
@@ -142,7 +143,7 @@ void ln_bwd_kernel(layer_norm::BwdParams params) {
 
         // Assumption: blockSize divides hidden size.
         enum { NUM_RES = COLS / Ktraits::THREADS_PER_CTA };
-        static_assert(NUM_RES * Ktraits::THREADS_PER_CTA == COLS, "");
+        static_assert(NUM_RES * Ktraits::THREADS_PER_CTA == COLS, "NUM_RES needs to be a whole number. See ln_kernel_traits.h and update kernel parameters in REGISTER_BWD_LAUNCHER.");
 
         idx = warp_m * Ktraits::VEC_COLS + tid_r;
         #pragma unroll
diff --git a/apex/contrib/csrc/layer_norm/ln_bwd_semi_cuda_kernel.cu b/apex/contrib/csrc/layer_norm/ln_bwd_semi_cuda_kernel.cu
index 3893d4e0c..abd78a57f 100644
--- a/apex/contrib/csrc/layer_norm/ln_bwd_semi_cuda_kernel.cu
+++ b/apex/contrib/csrc/layer_norm/ln_bwd_semi_cuda_kernel.cu
@@ -35,8 +35,13 @@ void launch_(LaunchParams<BwdParams> &launch_params, const bool configure_params
 
     if( configure_params ) {
         int ctas_per_sm;
-        cudaError status_ = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+#ifdef USE_ROCM
+        hipError_t status_ = hipOccupancyMaxActiveBlocksPerMultiprocessor(
             &ctas_per_sm, kernel, Kernel_traits::THREADS_PER_CTA, Kernel_traits::SMEM_BYTES);
+#else
+        cudaError_t status_ = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+            &ctas_per_sm, kernel, Kernel_traits::THREADS_PER_CTA, Kernel_traits::SMEM_BYTES);
+#endif
         launch_params.params.ctas_per_col = launch_params.props->multiProcessorCount * ctas_per_sm / Kernel_traits::CTAS_PER_ROW;
         launch_params.barrier_size = 0;
         launch_params.workspace_bytes = 0;
@@ -52,7 +57,12 @@ void launch_(LaunchParams<BwdParams> &launch_params, const bool configure_params
     }
 
     if( Kernel_traits::SMEM_BYTES >= 48 * 1024 ) {
-        CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, Kernel_traits::SMEM_BYTES));
+        // hipify missing cudaFuncSetAttribute, cudaFuncAttributeMaxDynamicSharedMemorySize
+#ifdef USE_ROCM
+        CHECK_CUDA(hipFuncSetAttribute((const void *)kernel, hipFuncAttributeMaxDynamicSharedMemorySize, Kernel_traits::SMEM_BYTES));
+#else
+        CHECK_CUDA(cudaFuncSetAttribute((const void *)kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, Kernel_traits::SMEM_BYTES));
+#endif
     }
     auto stream = launch_params.stream;
     auto ctas_per_col = launch_params.params.ctas_per_col;
@@ -60,10 +70,14 @@ void launch_(LaunchParams<BwdParams> &launch_params, const bool configure_params
     if( Kernel_traits::CTAS_PER_ROW == 1 ) {
         kernel<<<ctas_per_col, Kernel_traits::THREADS_PER_CTA, Kernel_traits::SMEM_BYTES, stream>>>(launch_params.params);
     } else {
-        dim3 grid(Kernel_traits::CTAS_PER_ROW * ctas_per_col);
+	    dim3 grid(Kernel_traits::CTAS_PER_ROW * ctas_per_col);
         dim3 block(Kernel_traits::THREADS_PER_CTA);
         void *params_ = (void *)&launch_params.params;
+#ifdef USE_ROCM
+        hipLaunchCooperativeKernel((void *)kernel, grid, block, (void **)&params_, Kernel_traits::SMEM_BYTES, stream);
+#else
         cudaLaunchCooperativeKernel((void *)kernel, grid, block, (void **)&params_, Kernel_traits::SMEM_BYTES, stream);
+#endif
     }
 
     using Kernel_traits_f = layer_norm::Kernel_traits_finalize<HIDDEN_SIZE,
@@ -79,162 +93,167 @@ void launch_(LaunchParams<BwdParams> &launch_params, const bool configure_params
     kernel_f<<<Kernel_traits_f::CTAS, Kernel_traits_f::THREADS_PER_CTA, 0, stream>>>(launch_params.params);
 }
 
+#ifdef USE_ROCM
+constexpr bool is_rocm = true;
+#else
+constexpr bool is_rocm = false;
+#endif
+
 // Create backward launch function and register. Macro signature:
 //  HIDDEN_SIZE, WTYPE, ITYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL
 
-REGISTER_BWD_LAUNCHER(  768, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER(  768, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER(  768, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER(  768, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER(  768, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4);
-
-REGISTER_BWD_LAUNCHER( 1024, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER( 1024, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER( 1024, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER( 1024, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4);
-REGISTER_BWD_LAUNCHER( 1024, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4);
-
-REGISTER_BWD_LAUNCHER( 1536, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 1536, fp16, fp16, fp16, fp32, 1, 1, 4,  8, 4);
-REGISTER_BWD_LAUNCHER( 1536, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 1536, bf16, bf16, bf16, fp32, 1, 1, 4,  8, 4);
-REGISTER_BWD_LAUNCHER( 1536, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4);
-
-REGISTER_BWD_LAUNCHER( 2048, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 2048, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 2048, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 2048, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 2048, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4);
-
-REGISTER_BWD_LAUNCHER( 2304, fp32, fp32, fp32, fp32, 1, 1, 4,  8, 4);
-REGISTER_BWD_LAUNCHER( 2304, fp16, fp16, fp16, fp32, 1, 1, 4,  4, 4);
-REGISTER_BWD_LAUNCHER( 2304, fp16, fp32, fp16, fp32, 1, 1, 4,  8, 4);
-REGISTER_BWD_LAUNCHER( 2304, bf16, bf16, bf16, fp32, 1, 1, 4,  4, 4);
-REGISTER_BWD_LAUNCHER( 2304, bf16, fp32, bf16, fp32, 1, 1, 4,  8, 4);
-
-REGISTER_BWD_LAUNCHER( 3072, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 3072, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 3072, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 3072, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 3072, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4);
-
-REGISTER_BWD_LAUNCHER( 3840, fp32, fp32, fp32, fp32, 1, 1, 4, 8, 4);
-REGISTER_BWD_LAUNCHER( 3840, fp16, fp16, fp16, fp32, 1, 1, 4, 4, 4);
-REGISTER_BWD_LAUNCHER( 3840, fp16, fp32, fp16, fp32, 1, 1, 4, 8, 4);
-REGISTER_BWD_LAUNCHER( 3840, bf16, bf16, bf16, fp32, 1, 1, 4, 4, 4);
-REGISTER_BWD_LAUNCHER( 3840, bf16, fp32, bf16, fp32, 1, 1, 4, 8, 4);
-
-REGISTER_BWD_LAUNCHER( 4096, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 4096, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 4096, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 4096, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 4096, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4);
+REGISTER_BWD_LAUNCHER(  768, fp32, fp32, fp32, fp32, 1, 4, 3, 8, 8);
+REGISTER_BWD_LAUNCHER(  768, fp16, fp16, fp16, fp32, 1, 4, 3, is_rocm ? 8 : 16, 4);
+REGISTER_BWD_LAUNCHER(  768, fp16, fp32, fp16, fp32, 1, 4, 3, 8, 8);
+REGISTER_BWD_LAUNCHER(  768, bf16, bf16, bf16, fp32, 1, 4, 3, is_rocm ? 8 : 16, 4);
+REGISTER_BWD_LAUNCHER(  768, bf16, fp32, bf16, fp32, 1, 4, 3, 8, 8);
+
+REGISTER_BWD_LAUNCHER( 1024, fp32, fp32, fp32, fp32, 1, 4, 4, 8, 4);
+REGISTER_BWD_LAUNCHER( 1024, fp16, fp16, fp16, fp32, 1, 4, 4, 4, 4);
+REGISTER_BWD_LAUNCHER( 1024, fp16, fp32, fp16, fp32, 1, 4, 4, 8, 4);
+REGISTER_BWD_LAUNCHER( 1024, bf16, bf16, bf16, fp32, 1, 4, 4, 4, 4);
+REGISTER_BWD_LAUNCHER( 1024, bf16, fp32, bf16, fp32, 1, 4, 4, 8, 4);
+
+REGISTER_BWD_LAUNCHER( 1536, fp32, fp32, fp32, fp32, 1, 4, 3, is_rocm ? 8 : 16, 4);
+REGISTER_BWD_LAUNCHER( 1536, fp16, fp16, fp16, fp32, 1, 4, 3, is_rocm ? 4 :  8, 4);
+REGISTER_BWD_LAUNCHER( 1536, fp16, fp32, fp16, fp32, 1, 4, 3, is_rocm ? 8 : 16, 4);
+REGISTER_BWD_LAUNCHER( 1536, bf16, bf16, bf16, fp32, 1, 4, 3, is_rocm ? 4 :  8, 4);
+REGISTER_BWD_LAUNCHER( 1536, bf16, fp32, bf16, fp32, 1, 4, 3, is_rocm ? 8 : 16, 4);
+
+REGISTER_BWD_LAUNCHER( 2048, fp32, fp32, fp32, fp32, 1, 4, 4, 8, 4);
+REGISTER_BWD_LAUNCHER( 2048, fp16, fp16, fp16, fp32, 1, 4, 4, 4, 4);
+REGISTER_BWD_LAUNCHER( 2048, fp16, fp32, fp16, fp32, 1, 4, 4, 8, 4);
+REGISTER_BWD_LAUNCHER( 2048, bf16, bf16, bf16, fp32, 1, 4, 4, 4, 4);
+REGISTER_BWD_LAUNCHER( 2048, bf16, fp32, bf16, fp32, 1, 4, 4, 8, 4);
+
+REGISTER_BWD_LAUNCHER( 2304, fp32, fp32, fp32, fp32, 1, 1, 4, is_rocm ? 4 : 8, 4);
+REGISTER_BWD_LAUNCHER( 2304, fp16, fp16, fp16, fp32, 1, 1, 4, is_rocm ? 2 : 4, 4);
+REGISTER_BWD_LAUNCHER( 2304, fp16, fp32, fp16, fp32, 1, 1, 4, is_rocm ? 4 : 8, 4);
+REGISTER_BWD_LAUNCHER( 2304, bf16, bf16, bf16, fp32, 1, 1, 4, is_rocm ? 2 : 4, 4);
+REGISTER_BWD_LAUNCHER( 2304, bf16, fp32, bf16, fp32, 1, 1, 4, is_rocm ? 4 : 8, 4);
+
+REGISTER_BWD_LAUNCHER( 3072, fp32, fp32, fp32, fp32, 1, 1, 4, 8, 4);
+REGISTER_BWD_LAUNCHER( 3072, fp16, fp16, fp16, fp32, 1, 1, 4, is_rocm ? 4 : 16, 4);
+REGISTER_BWD_LAUNCHER( 3072, fp16, fp32, fp16, fp32, 1, 1, 4, 8, 4);
+REGISTER_BWD_LAUNCHER( 3072, bf16, bf16, bf16, fp32, 1, 1, 4, is_rocm ? 4 : 16, 4);
+REGISTER_BWD_LAUNCHER( 3072, bf16, fp32, bf16, fp32, 1, 1, 4, 8, 4);
+
+REGISTER_BWD_LAUNCHER( 3840, fp32, fp32, fp32, fp32, 1, 1, 4, is_rocm ? 4 : 8, 4);
+REGISTER_BWD_LAUNCHER( 3840, fp16, fp16, fp16, fp32, 1, 1, 4, is_rocm ? 2 : 4, 4);
+REGISTER_BWD_LAUNCHER( 3840, fp16, fp32, fp16, fp32, 1, 1, 4, is_rocm ? 4 : 8, 4);
+REGISTER_BWD_LAUNCHER( 3840, bf16, bf16, bf16, fp32, 1, 1, 4, is_rocm ? 2 : 4, 4);
+REGISTER_BWD_LAUNCHER( 3840, bf16, fp32, bf16, fp32, 1, 1, 4, is_rocm ? 4 : 8, 4);
+
+REGISTER_BWD_LAUNCHER( 4096, fp32, fp32, fp32, fp32, 1, 1, 8, 8, 4);
+REGISTER_BWD_LAUNCHER( 4096, fp16, fp16, fp16, fp32, 1, 1, 8, 4, 4);
+REGISTER_BWD_LAUNCHER( 4096, fp16, fp32, fp16, fp32, 1, 1, 8, 8, 4);
+REGISTER_BWD_LAUNCHER( 4096, bf16, bf16, bf16, fp32, 1, 1, 8, 4, 4);
+REGISTER_BWD_LAUNCHER( 4096, bf16, fp32, bf16, fp32, 1, 1, 8, 8, 4);
 
 REGISTER_BWD_LAUNCHER( 5120, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 5120, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4);
+REGISTER_BWD_LAUNCHER( 5120, fp16, fp16, fp16, fp32, 1, 1, 4, is_rocm ? 8 : 16, 4);
 REGISTER_BWD_LAUNCHER( 5120, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 5120, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4);
+REGISTER_BWD_LAUNCHER( 5120, bf16, bf16, bf16, fp32, 1, 1, 4, is_rocm ? 8 : 16, 4);
 REGISTER_BWD_LAUNCHER( 5120, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4);
 
-REGISTER_BWD_LAUNCHER( 6144, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER( 6144, fp16, fp16, fp16, fp32, 1, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER( 6144, fp16, fp32, fp16, fp32, 1, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER( 6144, bf16, bf16, bf16, fp32, 1, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER( 6144, bf16, fp32, bf16, fp32, 1, 1, 8, 16, 4);
-
-REGISTER_BWD_LAUNCHER( 8192, fp32, fp32, fp32, fp32, 2, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 8192, fp16, fp16, fp16, fp32, 2, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 8192, fp16, fp32, fp16, fp32, 2, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 8192, bf16, bf16, bf16, fp32, 2, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER( 8192, bf16, fp32, bf16, fp32, 2, 1, 4, 16, 4);
-
-REGISTER_BWD_LAUNCHER(10240, fp32, fp32, fp32, fp32, 2, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER(10240, fp16, fp16, fp16, fp32, 2, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER(10240, fp16, fp32, fp16, fp32, 2, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER(10240, bf16, bf16, bf16, fp32, 2, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER(10240, bf16, fp32, bf16, fp32, 2, 1, 4, 16, 4);
-
-REGISTER_BWD_LAUNCHER(12288, fp32, fp32, fp32, fp32, 4, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER(12288, fp16, fp16, fp16, fp32, 4, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER(12288, fp16, fp32, fp16, fp32, 4, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER(12288, bf16, bf16, bf16, fp32, 4, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER(12288, bf16, fp32, bf16, fp32, 4, 1, 4, 16, 4);
-
-REGISTER_BWD_LAUNCHER(12800, fp32, fp32, fp32, fp32, 5, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER(12800, fp16, fp16, fp16, fp32, 5, 1, 4,  8, 4);
-REGISTER_BWD_LAUNCHER(12800, fp16, fp32, fp16, fp32, 5, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER(12800, bf16, bf16, bf16, fp32, 5, 1, 4,  8, 4);
-REGISTER_BWD_LAUNCHER(12800, bf16, fp32, bf16, fp32, 5, 1, 4, 16, 4);
-
-REGISTER_BWD_LAUNCHER(14336, fp32, fp32, fp32, fp32, 4, 1, 4,  8, 4);
-REGISTER_BWD_LAUNCHER(14336, fp16, fp16, fp16, fp32, 4, 1, 4,  8, 4);
-REGISTER_BWD_LAUNCHER(14336, fp16, fp32, fp16, fp32, 4, 1, 4,  8, 4);
-REGISTER_BWD_LAUNCHER(14336, bf16, bf16, bf16, fp32, 4, 1, 4,  8, 4);
-REGISTER_BWD_LAUNCHER(14336, bf16, fp32, bf16, fp32, 4, 1, 4,  8, 4);
-
-REGISTER_BWD_LAUNCHER(15360, fp32, fp32, fp32, fp32, 4, 1, 4,  8, 4);
-REGISTER_BWD_LAUNCHER(15360, fp16, fp16, fp16, fp32, 4, 1, 4,  4, 4);
-REGISTER_BWD_LAUNCHER(15360, fp16, fp32, fp16, fp32, 4, 1, 4,  8, 4);
-REGISTER_BWD_LAUNCHER(15360, bf16, bf16, bf16, fp32, 4, 1, 4,  4, 4);
-REGISTER_BWD_LAUNCHER(15360, bf16, fp32, bf16, fp32, 4, 1, 4,  8, 4);
-
-REGISTER_BWD_LAUNCHER(16384, fp32, fp32, fp32, fp32, 4, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER(16384, fp16, fp16, fp16, fp32, 4, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER(16384, fp16, fp32, fp16, fp32, 4, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER(16384, bf16, bf16, bf16, fp32, 4, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER(16384, bf16, fp32, bf16, fp32, 4, 1, 4, 16, 4);
-
-REGISTER_BWD_LAUNCHER(18432, fp32, fp32, fp32, fp32, 4, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER(18432, fp16, fp16, fp16, fp32, 4, 1, 4,  8, 4);
-REGISTER_BWD_LAUNCHER(18432, fp16, fp32, fp16, fp32, 4, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER(18432, bf16, bf16, bf16, fp32, 4, 1, 4,  8, 4);
-REGISTER_BWD_LAUNCHER(18432, bf16, fp32, bf16, fp32, 4, 1, 4, 16, 4);
-
-REGISTER_BWD_LAUNCHER(20480, fp32, fp32, fp32, fp32, 4, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER(20480, fp16, fp16, fp16, fp32, 4, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER(20480, fp16, fp32, fp16, fp32, 4, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER(20480, bf16, bf16, bf16, fp32, 4, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER(20480, bf16, fp32, bf16, fp32, 4, 1, 4, 16, 4);
-
-REGISTER_BWD_LAUNCHER(24576, fp32, fp32, fp32, fp32, 4, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER(24576, fp16, fp16, fp16, fp32, 4, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER(24576, fp16, fp32, fp16, fp32, 4, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER(24576, bf16, bf16, bf16, fp32, 4, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER(24576, bf16, fp32, bf16, fp32, 4, 1, 8, 16, 4);
-
-REGISTER_BWD_LAUNCHER(25600, fp32, fp32, fp32, fp32, 5, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER(25600, fp16, fp16, fp16, fp32, 5, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER(25600, fp16, fp32, fp16, fp32, 5, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER(25600, bf16, bf16, bf16, fp32, 5, 1, 4, 16, 4);
-REGISTER_BWD_LAUNCHER(25600, bf16, fp32, bf16, fp32, 5, 1, 4, 16, 4);
-
-REGISTER_BWD_LAUNCHER(30720, fp32, fp32, fp32, fp32, 4, 1, 8, 8, 4);
-REGISTER_BWD_LAUNCHER(30720, fp16, fp16, fp16, fp32, 4, 1, 8, 4, 4);
-REGISTER_BWD_LAUNCHER(30720, fp16, fp32, fp16, fp32, 4, 1, 8, 8, 4);
-REGISTER_BWD_LAUNCHER(30720, bf16, bf16, bf16, fp32, 4, 1, 8, 4, 4);
-REGISTER_BWD_LAUNCHER(30720, bf16, fp32, bf16, fp32, 4, 1, 8, 8, 4);
-
-REGISTER_BWD_LAUNCHER(32768, fp32, fp32, fp32, fp32, 4, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER(32768, fp16, fp16, fp16, fp32, 4, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER(32768, fp16, fp32, fp16, fp32, 4, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER(32768, bf16, bf16, bf16, fp32, 4, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER(32768, bf16, fp32, bf16, fp32, 4, 1, 8, 16, 4);
-
-REGISTER_BWD_LAUNCHER(40960, fp32, fp32, fp32, fp32, 4, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER(40960, fp16, fp16, fp16, fp32, 4, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER(40960, fp16, fp32, fp16, fp32, 4, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER(40960, bf16, bf16, bf16, fp32, 4, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER(40960, bf16, fp32, bf16, fp32, 4, 1, 8, 16, 4);
-
-REGISTER_BWD_LAUNCHER(49152, fp32, fp32, fp32, fp32, 8, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER(49152, fp16, fp16, fp16, fp32, 8, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER(49152, fp16, fp32, fp16, fp32, 8, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER(49152, bf16, bf16, bf16, fp32, 8, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER(49152, bf16, fp32, bf16, fp32, 8, 1, 8, 16, 4);
-
-REGISTER_BWD_LAUNCHER(65536, fp32, fp32, fp32, fp32, 8, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER(65536, fp16, fp16, fp16, fp32, 8, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER(65536, fp16, fp32, fp16, fp32, 8, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER(65536, bf16, bf16, bf16, fp32, 8, 1, 8, 16, 4);
-REGISTER_BWD_LAUNCHER(65536, bf16, fp32, bf16, fp32, 8, 1, 8, 16, 4);
-
+REGISTER_BWD_LAUNCHER( 6144, fp32, fp32, fp32, fp32, 1, 1, 8, 8, 4);
+REGISTER_BWD_LAUNCHER( 6144, fp16, fp16, fp16, fp32, 1, 1, 8, is_rocm ? 4 : 16, 4);
+REGISTER_BWD_LAUNCHER( 6144, fp16, fp32, fp16, fp32, 1, 1, 8, 8, 4);
+REGISTER_BWD_LAUNCHER( 6144, bf16, bf16, bf16, fp32, 1, 1, 8, is_rocm ? 4 : 16, 4);
+REGISTER_BWD_LAUNCHER( 6144, bf16, fp32, bf16, fp32, 1, 1, 8, 8, 4);
+
+REGISTER_BWD_LAUNCHER( 8192, fp32, fp32, fp32, fp32, 1, 1, 8, 8, 4);
+REGISTER_BWD_LAUNCHER( 8192, fp16, fp16, fp16, fp32, 1, 1, 8, 4, 4);
+REGISTER_BWD_LAUNCHER( 8192, fp16, fp32, fp16, fp32, 1, 1, 8, 8, 4);
+REGISTER_BWD_LAUNCHER( 8192, bf16, bf16, bf16, fp32, 1, 1, 8, 4, 4);
+REGISTER_BWD_LAUNCHER( 8192, bf16, fp32, bf16, fp32, 1, 1, 8, 8, 4);
+
+REGISTER_BWD_LAUNCHER(10240, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4);
+REGISTER_BWD_LAUNCHER(10240, fp16, fp16, fp16, fp32, 1, 1, 4, is_rocm ? 8 : 16, 4);
+REGISTER_BWD_LAUNCHER(10240, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4);
+REGISTER_BWD_LAUNCHER(10240, bf16, bf16, bf16, fp32, 1, 1, 4, is_rocm ? 8 : 16, 4);
+REGISTER_BWD_LAUNCHER(10240, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4);
+
+REGISTER_BWD_LAUNCHER(12288, fp32, fp32, fp32, fp32, 1, 1, 16, is_rocm ? 8 : 16, 4);
+REGISTER_BWD_LAUNCHER(12288, fp16, fp16, fp16, fp32, 1, 1, 16, is_rocm ? 4 : 16, 4);
+REGISTER_BWD_LAUNCHER(12288, fp16, fp32, fp16, fp32, 1, 1, 16, is_rocm ? 8 : 16, 4);
+REGISTER_BWD_LAUNCHER(12288, bf16, bf16, bf16, fp32, 1, 1, 16, is_rocm ? 4 : 16, 4);
+REGISTER_BWD_LAUNCHER(12288, bf16, fp32, bf16, fp32, 1, 1, 16, 8, 4);
+
+REGISTER_BWD_LAUNCHER(12800, fp32, fp32, fp32, fp32, 1, 1, 4, is_rocm ? 8 : 16, 4);
+REGISTER_BWD_LAUNCHER(12800, fp16, fp16, fp16, fp32, 1, 1, 4, is_rocm ? 4 :  8, 4);
+REGISTER_BWD_LAUNCHER(12800, fp16, fp32, fp16, fp32, 1, 1, 4, is_rocm ? 8 : 16, 4);
+REGISTER_BWD_LAUNCHER(12800, bf16, bf16, bf16, fp32, 1, 1, 4, is_rocm ? 4 :  8, 4);
+REGISTER_BWD_LAUNCHER(12800, bf16, fp32, bf16, fp32, 1, 1, 4, is_rocm ? 8 : 16, 4);
+
+REGISTER_BWD_LAUNCHER(14336, fp32, fp32, fp32, fp32, 1, 1, 4,  8, 4);
+REGISTER_BWD_LAUNCHER(14336, fp16, fp16, fp16, fp32, 1, 1, 4,  is_rocm ? 4 : 8, 4);
+REGISTER_BWD_LAUNCHER(14336, fp16, fp32, fp16, fp32, 1, 1, 4,  8, 4);
+REGISTER_BWD_LAUNCHER(14336, bf16, bf16, bf16, fp32, 1, 1, 4,  is_rocm ? 4 : 8, 4);
+REGISTER_BWD_LAUNCHER(14336, bf16, fp32, bf16, fp32, 1, 1, 4,  8, 4);
+
+REGISTER_BWD_LAUNCHER(15360, fp32, fp32, fp32, fp32, 1, 1, 4,  is_rocm ? 4 : 8, 4);
+REGISTER_BWD_LAUNCHER(15360, fp16, fp16, fp16, fp32, 1, 1, 4,  is_rocm ? 2 : 4, 4);
+REGISTER_BWD_LAUNCHER(15360, fp16, fp32, fp16, fp32, 1, 1, 4,  is_rocm ? 4 : 8, 4);
+REGISTER_BWD_LAUNCHER(15360, bf16, bf16, bf16, fp32, 1, 1, 4,  is_rocm ? 2 : 4, 4);
+REGISTER_BWD_LAUNCHER(15360, bf16, fp32, bf16, fp32, 1, 1, 4,  is_rocm ? 4 : 8, 4);
+
+REGISTER_BWD_LAUNCHER(16384, fp32, fp32, fp32, fp32, 1, 1, 16, 8, 4);
+REGISTER_BWD_LAUNCHER(16384, fp16, fp16, fp16, fp32, 1, 1, 16, 4, 4);
+REGISTER_BWD_LAUNCHER(16384, fp16, fp32, fp16, fp32, 1, 1, 16, 8, 4);
+REGISTER_BWD_LAUNCHER(16384, bf16, bf16, bf16, fp32, 1, 1, 16, 4, 4);
+REGISTER_BWD_LAUNCHER(16384, bf16, fp32, bf16, fp32, 1, 1, 16, 8, 4);
+
+REGISTER_BWD_LAUNCHER(18432, fp32, fp32, fp32, fp32, 1, 1, 4, is_rocm ? 8 : 16, 4);
+REGISTER_BWD_LAUNCHER(18432, fp16, fp16, fp16, fp32, 1, 1, 4, is_rocm ? 4 :  8, 4);
+REGISTER_BWD_LAUNCHER(18432, fp16, fp32, fp16, fp32, 1, 1, 4, is_rocm ? 8 : 16, 4);
+REGISTER_BWD_LAUNCHER(18432, bf16, bf16, bf16, fp32, 1, 1, 4, is_rocm ? 4 :  8, 4);
+REGISTER_BWD_LAUNCHER(18432, bf16, fp32, bf16, fp32, 1, 1, 4, is_rocm ? 8 : 16, 4);
+
+REGISTER_BWD_LAUNCHER(20480, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4);
+REGISTER_BWD_LAUNCHER(20480, fp16, fp16, fp16, fp32, 1, 1, 4, is_rocm ? 8 : 16, 4);
+REGISTER_BWD_LAUNCHER(20480, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4);
+REGISTER_BWD_LAUNCHER(20480, bf16, bf16, bf16, fp32, 1, 1, 4, is_rocm ? 8 : 16, 4);
+REGISTER_BWD_LAUNCHER(20480, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4);
+
+REGISTER_BWD_LAUNCHER(24576, fp32, fp32, fp32, fp32, 1, 1, 8, is_rocm ? 8 : 16, 4);
+REGISTER_BWD_LAUNCHER(24576, fp16, fp16, fp16, fp32, 1, 1, 8, is_rocm ? 4 : 16, 4);
+REGISTER_BWD_LAUNCHER(24576, fp16, fp32, fp16, fp32, 1, 1, 8, is_rocm ? 8 : 16, 4);
+REGISTER_BWD_LAUNCHER(24576, bf16, bf16, bf16, fp32, 1, 1, 8, is_rocm ? 4 : 16, 4);
+REGISTER_BWD_LAUNCHER(24576, bf16, fp32, bf16, fp32, 1, 1, 8, 8, 4);
+
+REGISTER_BWD_LAUNCHER(25600, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4);
+REGISTER_BWD_LAUNCHER(25600, fp16, fp16, fp16, fp32, 1, 1, 4, is_rocm ? 8 : 16, 4);
+REGISTER_BWD_LAUNCHER(25600, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4);
+REGISTER_BWD_LAUNCHER(25600, bf16, bf16, bf16, fp32, 1, 1, 4, is_rocm ? 8 : 16, 4);
+REGISTER_BWD_LAUNCHER(25600, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4);
+
+REGISTER_BWD_LAUNCHER(30720, fp32, fp32, fp32, fp32, 1, 1, 8, is_rocm ? 4 : 8, 4);
+REGISTER_BWD_LAUNCHER(30720, fp16, fp16, fp16, fp32, 1, 1, 8, is_rocm ? 2 : 4, 4);
+REGISTER_BWD_LAUNCHER(30720, fp16, fp32, fp16, fp32, 1, 1, 8, is_rocm ? 4 : 8, 4);
+REGISTER_BWD_LAUNCHER(30720, bf16, bf16, bf16, fp32, 1, 1, 8, is_rocm ? 2 : 4, 4);
+REGISTER_BWD_LAUNCHER(30720, bf16, fp32, bf16, fp32, 1, 1, 8, is_rocm ? 4 : 8, 4);
+
+REGISTER_BWD_LAUNCHER(32768, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4);
+REGISTER_BWD_LAUNCHER(32768, fp16, fp16, fp16, fp32, 1, 1, 8, 16, 4);
+REGISTER_BWD_LAUNCHER(32768, fp16, fp32, fp16, fp32, 1, 1, 8, 16, 4);
+REGISTER_BWD_LAUNCHER(32768, bf16, bf16, bf16, fp32, 1, 1, 8, 16, 4);
+REGISTER_BWD_LAUNCHER(32768, bf16, fp32, bf16, fp32, 1, 1, 8, 16, 4);
+
+REGISTER_BWD_LAUNCHER(40960, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4);
+REGISTER_BWD_LAUNCHER(40960, fp16, fp16, fp16, fp32, 1, 1, 8, is_rocm ? 8 : 16, 4);
+REGISTER_BWD_LAUNCHER(40960, fp16, fp32, fp16, fp32, 1, 1, 8, 16, 4);
+REGISTER_BWD_LAUNCHER(40960, bf16, bf16, bf16, fp32, 1, 1, 8, is_rocm ? 8 : 16, 4);
+REGISTER_BWD_LAUNCHER(40960, bf16, fp32, bf16, fp32, 1, 1, 8, 16, 4);
+
+REGISTER_BWD_LAUNCHER(49152, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4);
+REGISTER_BWD_LAUNCHER(49152, fp16, fp16, fp16, fp32, 1, 1, 8, is_rocm ? 8 : 16, 4);
+REGISTER_BWD_LAUNCHER(49152, fp16, fp32, fp16, fp32, 1, 1, 8, 16, 4);
+REGISTER_BWD_LAUNCHER(49152, bf16, bf16, bf16, fp32, 1, 1, 8, is_rocm ? 8 : 16, 4);
+REGISTER_BWD_LAUNCHER(49152, bf16, fp32, bf16, fp32, 1, 1, 8, 16, 4);
+
+REGISTER_BWD_LAUNCHER(65536, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4);
+REGISTER_BWD_LAUNCHER(65536, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4);
+REGISTER_BWD_LAUNCHER(65536, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4);
+REGISTER_BWD_LAUNCHER(65536, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4);
+REGISTER_BWD_LAUNCHER(65536, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4);
\ No newline at end of file
diff --git a/apex/contrib/csrc/layer_norm/ln_fwd_cuda_kernel.cu b/apex/contrib/csrc/layer_norm/ln_fwd_cuda_kernel.cu
index 62ff78ee3..6efb8e17d 100644
--- a/apex/contrib/csrc/layer_norm/ln_fwd_cuda_kernel.cu
+++ b/apex/contrib/csrc/layer_norm/ln_fwd_cuda_kernel.cu
@@ -34,7 +34,7 @@ void launch_(LaunchParams<FwdParams> &launch_params, const bool configure_params
 
     if( configure_params ) {
         int ctas_per_sm;
-        cudaError status_ = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        cudaError_t status_ = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
             &ctas_per_sm, kernel, Kernel_traits::THREADS_PER_CTA, Kernel_traits::SMEM_BYTES_FWD);
         launch_params.params.ctas_per_col = launch_params.props->multiProcessorCount * ctas_per_sm / Kernel_traits::CTAS_PER_ROW;
         launch_params.barrier_size = 0;
@@ -51,7 +51,12 @@ void launch_(LaunchParams<FwdParams> &launch_params, const bool configure_params
     }
 
     if( Kernel_traits::SMEM_BYTES_FWD >= 48 * 1024 ) {
-        CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, Kernel_traits::SMEM_BYTES_FWD));
+        // hipify missing cudaFuncSetAttribute, cudaFuncAttributeMaxDynamicSharedMemorySize
+#ifdef USE_ROCM
+        CHECK_CUDA(hipFuncSetAttribute((const void *)kernel, hipFuncAttributeMaxDynamicSharedMemorySize, Kernel_traits::SMEM_BYTES_FWD));
+#else
+        CHECK_CUDA(cudaFuncSetAttribute((const void *)kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, Kernel_traits::SMEM_BYTES_FWD));
+#endif
     }
     auto stream = launch_params.stream;
     auto ctas_per_col = launch_params.params.ctas_per_col;
@@ -59,19 +64,35 @@ void launch_(LaunchParams<FwdParams> &launch_params, const bool configure_params
     if( Kernel_traits::CTAS_PER_ROW == 1 ) {
         kernel<<<ctas_per_col, Kernel_traits::THREADS_PER_CTA, Kernel_traits::SMEM_BYTES_FWD, stream>>>(launch_params.params);
     } else {
-        dim3 grid(Kernel_traits::CTAS_PER_ROW * ctas_per_col);
+	    dim3 grid(Kernel_traits::CTAS_PER_ROW * ctas_per_col);
         dim3 block(Kernel_traits::THREADS_PER_CTA);
         void *params_ = (void *)&launch_params.params;
+#ifdef USE_ROCM
+        hipLaunchCooperativeKernel((void *)kernel, grid, block, (void **)&params_, Kernel_traits::SMEM_BYTES_FWD, stream);
+#else
         cudaLaunchCooperativeKernel((void *)kernel, grid, block, (void **)&params_, Kernel_traits::SMEM_BYTES_FWD, stream);
+#endif
     }
 
 }
 
+#ifdef USE_ROCM
+constexpr bool is_rocm = true;
+#else
+constexpr bool is_rocm = false;
+#endif
+
+//  HIDDEN_SIZE, WTYPE, ITYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG
+REGISTER_FWD_LAUNCHER(  512, fp32, fp32, fp32, fp32, 1, 4, 2, 4);
+REGISTER_FWD_LAUNCHER(  512, fp16, fp16, fp16, fp32, 1, 4, 2, is_rocm ? 4 : 4);
+REGISTER_FWD_LAUNCHER(  512, fp16, fp32, fp16, fp32, 1, 4, 2, 4);
+REGISTER_FWD_LAUNCHER(  512, bf16, bf16, bf16, fp32, 1, 4, 2, is_rocm ? 4 : 4);
+REGISTER_FWD_LAUNCHER(  512, bf16, fp32, bf16, fp32, 1, 4, 2, 4);
 
 REGISTER_FWD_LAUNCHER(  768, fp32, fp32, fp32, fp32, 1, 4, 1, 16);
-REGISTER_FWD_LAUNCHER(  768, fp16, fp16, fp16, fp32, 1, 4, 1, 16);
+REGISTER_FWD_LAUNCHER(  768, fp16, fp16, fp16, fp32, 1, 4, 1, is_rocm ? 8 : 16);
 REGISTER_FWD_LAUNCHER(  768, fp16, fp32, fp16, fp32, 1, 4, 1, 16);
-REGISTER_FWD_LAUNCHER(  768, bf16, bf16, bf16, fp32, 1, 4, 1, 16);
+REGISTER_FWD_LAUNCHER(  768, bf16, bf16, bf16, fp32, 1, 4, 1, is_rocm ? 8 : 16);
 REGISTER_FWD_LAUNCHER(  768, bf16, fp32, bf16, fp32, 1, 4, 1, 16);
 
 REGISTER_FWD_LAUNCHER( 1024, fp32, fp32, fp32, fp32, 1, 4, 1, 16);
@@ -93,21 +114,21 @@ REGISTER_FWD_LAUNCHER( 2048, bf16, bf16, bf16, fp32, 1, 4, 1, 16);
 REGISTER_FWD_LAUNCHER( 2048, bf16, fp32, bf16, fp32, 1, 4, 1, 16);
 
 REGISTER_FWD_LAUNCHER( 2304, fp32, fp32, fp32, fp32, 1, 4, 1, 16);
-REGISTER_FWD_LAUNCHER( 2304, fp16, fp16, fp16, fp32, 1, 4, 1, 16);
+REGISTER_FWD_LAUNCHER( 2304, fp16, fp16, fp16, fp32, 1, 4, 1, is_rocm ? 8 : 16);
 REGISTER_FWD_LAUNCHER( 2304, fp16, fp32, fp16, fp32, 1, 4, 1, 16);
-REGISTER_FWD_LAUNCHER( 2304, bf16, bf16, bf16, fp32, 1, 4, 1, 16);
+REGISTER_FWD_LAUNCHER( 2304, bf16, bf16, bf16, fp32, 1, 4, 1, is_rocm ? 8 : 16);
 REGISTER_FWD_LAUNCHER( 2304, bf16, fp32, bf16, fp32, 1, 4, 1, 16);
 
 REGISTER_FWD_LAUNCHER( 3072, fp32, fp32, fp32, fp32, 1, 1, 4, 16);
-REGISTER_FWD_LAUNCHER( 3072, fp16, fp16, fp16, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER( 3072, fp16, fp16, fp16, fp32, 1, 1, 4, is_rocm ? 8 : 16);
 REGISTER_FWD_LAUNCHER( 3072, fp16, fp32, fp16, fp32, 1, 1, 4, 16);
-REGISTER_FWD_LAUNCHER( 3072, bf16, bf16, bf16, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER( 3072, bf16, bf16, bf16, fp32, 1, 1, 4, is_rocm ? 8 : 16);
 REGISTER_FWD_LAUNCHER( 3072, bf16, fp32, bf16, fp32, 1, 1, 4, 16);
 
 REGISTER_FWD_LAUNCHER( 3840, fp32, fp32, fp32, fp32, 1, 1, 4,  4);
-REGISTER_FWD_LAUNCHER( 3840, fp16, fp16, fp16, fp32, 1, 1, 4,  4);
+REGISTER_FWD_LAUNCHER( 3840, fp16, fp16, fp16, fp32, 1, 1, 4,  is_rocm ? 2 : 4);
 REGISTER_FWD_LAUNCHER( 3840, fp16, fp32, fp16, fp32, 1, 1, 4,  4);
-REGISTER_FWD_LAUNCHER( 3840, bf16, bf16, bf16, fp32, 1, 1, 4,  4);
+REGISTER_FWD_LAUNCHER( 3840, bf16, bf16, bf16, fp32, 1, 1, 4,  is_rocm ? 2 : 4);
 REGISTER_FWD_LAUNCHER( 3840, bf16, fp32, bf16, fp32, 1, 1, 4,  4);
 
 REGISTER_FWD_LAUNCHER( 4096, fp32, fp32, fp32, fp32, 1, 1, 4, 16);
@@ -117,9 +138,9 @@ REGISTER_FWD_LAUNCHER( 4096, bf16, bf16, bf16, fp32, 1, 1, 4, 16);
 REGISTER_FWD_LAUNCHER( 4096, bf16, fp32, bf16, fp32, 1, 1, 4, 16);
 
 REGISTER_FWD_LAUNCHER( 5120, fp32, fp32, fp32, fp32, 1, 1, 4, 16);
-REGISTER_FWD_LAUNCHER( 5120, fp16, fp16, fp16, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER( 5120, fp16, fp16, fp16, fp32, 1, 1, 4, is_rocm ? 8 : 16);
 REGISTER_FWD_LAUNCHER( 5120, fp16, fp32, fp16, fp32, 1, 1, 4, 16);
-REGISTER_FWD_LAUNCHER( 5120, bf16, bf16, bf16, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER( 5120, bf16, bf16, bf16, fp32, 1, 1, 4, is_rocm ? 8 : 16);
 REGISTER_FWD_LAUNCHER( 5120, bf16, fp32, bf16, fp32, 1, 1, 4, 16);
 
 REGISTER_FWD_LAUNCHER( 6144, fp32, fp32, fp32, fp32, 1, 1, 4, 16);
@@ -134,92 +155,181 @@ REGISTER_FWD_LAUNCHER( 8192, fp16, fp32, fp16, fp32, 1, 1, 4, 16);
 REGISTER_FWD_LAUNCHER( 8192, bf16, bf16, bf16, fp32, 1, 1, 4, 16);
 REGISTER_FWD_LAUNCHER( 8192, bf16, fp32, bf16, fp32, 1, 1, 4, 16);
 
-REGISTER_FWD_LAUNCHER(10240, fp32, fp32, fp32, fp32, 2, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(10240, fp32, fp32, fp32, fp32, 1, 1, 4, 16);
 REGISTER_FWD_LAUNCHER(10240, fp16, fp16, fp16, fp32, 1, 1, 4, 16);
 REGISTER_FWD_LAUNCHER(10240, fp16, fp32, fp16, fp32, 1, 1, 4, 16);
 REGISTER_FWD_LAUNCHER(10240, bf16, bf16, bf16, fp32, 1, 1, 4, 16);
 REGISTER_FWD_LAUNCHER(10240, bf16, fp32, bf16, fp32, 1, 1, 4, 16);
 
-REGISTER_FWD_LAUNCHER(12288, fp32, fp32, fp32, fp32, 2, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(12288, fp16, fp16, fp16, fp32, 2, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(12288, fp16, fp32, fp16, fp32, 2, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(12288, bf16, bf16, bf16, fp32, 2, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(12288, bf16, fp32, bf16, fp32, 2, 1, 4, 16);
-
-REGISTER_FWD_LAUNCHER(12800, fp32, fp32, fp32, fp32, 2, 1, 4,  4);
-REGISTER_FWD_LAUNCHER(12800, fp16, fp16, fp16, fp32, 2, 1, 4,  4);
-REGISTER_FWD_LAUNCHER(12800, fp16, fp32, fp16, fp32, 2, 1, 4,  4);
-REGISTER_FWD_LAUNCHER(12800, bf16, bf16, bf16, fp32, 2, 1, 4,  4);
-REGISTER_FWD_LAUNCHER(12800, bf16, fp32, bf16, fp32, 2, 1, 4,  4);
-
-REGISTER_FWD_LAUNCHER(14336, fp32, fp32, fp32, fp32, 2, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(14336, fp16, fp16, fp16, fp32, 2, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(14336, fp16, fp32, fp16, fp32, 2, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(14336, bf16, bf16, bf16, fp32, 2, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(14336, bf16, fp32, bf16, fp32, 2, 1, 4,  8);
-
-REGISTER_FWD_LAUNCHER(15360, fp32, fp32, fp32, fp32, 2, 1, 4,  8);
-REGISTER_FWD_LAUNCHER(15360, fp16, fp16, fp16, fp32, 2, 1, 4,  8);
-REGISTER_FWD_LAUNCHER(15360, fp16, fp32, fp16, fp32, 2, 1, 4,  8);
-REGISTER_FWD_LAUNCHER(15360, bf16, bf16, bf16, fp32, 2, 1, 4,  8);
-REGISTER_FWD_LAUNCHER(15360, bf16, fp32, bf16, fp32, 2, 1, 4,  8);
-
-REGISTER_FWD_LAUNCHER(16384, fp32, fp32, fp32, fp32, 2, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(16384, fp16, fp16, fp16, fp32, 2, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(16384, bf16, bf16, bf16, fp32, 2, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(16384, bf16, fp32, bf16, fp32, 2, 1, 4, 16);
-
-REGISTER_FWD_LAUNCHER(18432, fp32, fp32, fp32, fp32, 4, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(18432, fp16, fp16, fp16, fp32, 2, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(18432, fp16, fp32, fp16, fp32, 2, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(18432, bf16, bf16, bf16, fp32, 2, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(18432, bf16, fp32, bf16, fp32, 4, 1, 4, 16);
-
-REGISTER_FWD_LAUNCHER(20480, fp32, fp32, fp32, fp32, 2, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(20480, fp16, fp16, fp16, fp32, 2, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(20480, fp16, fp32, fp16, fp32, 2, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(20480, bf16, bf16, bf16, fp32, 2, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(20480, bf16, fp32, bf16, fp32, 2, 1, 4, 16);
-
-REGISTER_FWD_LAUNCHER(24576, fp32, fp32, fp32, fp32, 4, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(24576, fp16, fp16, fp16, fp32, 2, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(24576, fp16, fp32, fp16, fp32, 2, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(24576, bf16, bf16, bf16, fp32, 2, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(24576, bf16, fp32, bf16, fp32, 2, 1, 4, 16);
-
-REGISTER_FWD_LAUNCHER(25600, fp32, fp32, fp32, fp32, 4, 1, 4,  4);
-REGISTER_FWD_LAUNCHER(25600, fp16, fp16, fp16, fp32, 2, 1, 4,  8);
-REGISTER_FWD_LAUNCHER(25600, fp16, fp32, fp16, fp32, 4, 1, 4,  4);
-REGISTER_FWD_LAUNCHER(25600, bf16, bf16, bf16, fp32, 2, 1, 4,  8);
-REGISTER_FWD_LAUNCHER(25600, bf16, fp32, bf16, fp32, 4, 1, 4,  4);
-
-REGISTER_FWD_LAUNCHER(30720, fp32, fp32, fp32, fp32, 4, 1, 4,  4);
-REGISTER_FWD_LAUNCHER(30720, fp16, fp16, fp16, fp32, 4, 1, 4,  4);
-REGISTER_FWD_LAUNCHER(30720, fp16, fp32, fp16, fp32, 4, 1, 4,  4);
-REGISTER_FWD_LAUNCHER(30720, bf16, bf16, bf16, fp32, 4, 1, 4,  4);
-REGISTER_FWD_LAUNCHER(30720, bf16, fp32, bf16, fp32, 4, 1, 4,  4);
-
-REGISTER_FWD_LAUNCHER(32768, fp32, fp32, fp32, fp32, 4, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(32768, fp16, fp16, fp16, fp32, 4, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(32768, fp16, fp32, fp16, fp32, 4, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(32768, bf16, bf16, bf16, fp32, 4, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(32768, bf16, fp32, bf16, fp32, 4, 1, 4, 16);
-
-REGISTER_FWD_LAUNCHER(40960, fp32, fp32, fp32, fp32, 4, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(40960, fp16, fp16, fp16, fp32, 4, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(40960, fp16, fp32, fp16, fp32, 4, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(40960, bf16, bf16, bf16, fp32, 4, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(40960, bf16, fp32, bf16, fp32, 4, 1, 4, 16);
-
-REGISTER_FWD_LAUNCHER(49152, fp32, fp32, fp32, fp32, 8, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(49152, fp16, fp16, fp16, fp32, 4, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(49152, fp16, fp32, fp16, fp32, 4, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(49152, bf16, bf16, bf16, fp32, 4, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(49152, bf16, fp32, bf16, fp32, 4, 1, 4, 16);
-
-REGISTER_FWD_LAUNCHER(65536, fp32, fp32, fp32, fp32, 8, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(65536, fp16, fp16, fp16, fp32, 8, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(65536, fp16, fp32, fp16, fp32, 8, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(65536, bf16, bf16, bf16, fp32, 8, 1, 4, 16);
-REGISTER_FWD_LAUNCHER(65536, bf16, fp32, bf16, fp32, 8, 1, 4, 16);
-
+REGISTER_FWD_LAUNCHER(12288, fp32, fp32, fp32, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(12288, fp16, fp16, fp16, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(12288, fp16, fp32, fp16, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(12288, bf16, bf16, bf16, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(12288, bf16, fp32, bf16, fp32, 1, 1, 4, 16);
+
+REGISTER_FWD_LAUNCHER(12800, fp32, fp32, fp32, fp32, 1, 1, 4,  4);
+REGISTER_FWD_LAUNCHER(12800, fp16, fp16, fp16, fp32, 1, 1, 4,  is_rocm ? 2 : 4);
+REGISTER_FWD_LAUNCHER(12800, fp16, fp32, fp16, fp32, 1, 1, 4,  4);
+REGISTER_FWD_LAUNCHER(12800, bf16, bf16, bf16, fp32, 1, 1, 4,  is_rocm ? 2 : 4);
+REGISTER_FWD_LAUNCHER(12800, bf16, fp32, bf16, fp32, 1, 1, 4,  4);
+
+REGISTER_FWD_LAUNCHER(14336, fp32, fp32, fp32, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(14336, fp16, fp16, fp16, fp32, 1, 1, 4, is_rocm ? 8 : 16);
+REGISTER_FWD_LAUNCHER(14336, fp16, fp32, fp16, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(14336, bf16, bf16, bf16, fp32, 1, 1, 4, is_rocm ? 8 : 16);
+REGISTER_FWD_LAUNCHER(14336, bf16, fp32, bf16, fp32, 1, 1, 4,  8);
+
+REGISTER_FWD_LAUNCHER(15360, fp32, fp32, fp32, fp32, 1, 1, 4,  8);
+REGISTER_FWD_LAUNCHER(15360, fp16, fp16, fp16, fp32, 1, 1, 4,  is_rocm ? 4 : 8);
+REGISTER_FWD_LAUNCHER(15360, fp16, fp32, fp16, fp32, 1, 1, 4,  8);
+REGISTER_FWD_LAUNCHER(15360, bf16, bf16, bf16, fp32, 1, 1, 4,  is_rocm ? 4 : 8);
+REGISTER_FWD_LAUNCHER(15360, bf16, fp32, bf16, fp32, 1, 1, 4,  8);
+
+REGISTER_FWD_LAUNCHER(16384, fp32, fp32, fp32, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(16384, fp16, fp16, fp16, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(16384, fp16, fp32, fp16, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(16384, bf16, bf16, bf16, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(16384, bf16, fp32, bf16, fp32, 1, 1, 4, 16);
+
+REGISTER_FWD_LAUNCHER(18432, fp32, fp32, fp32, fp32, 1, 1, 4, is_rocm ? 8 : 16);
+REGISTER_FWD_LAUNCHER(18432, fp16, fp16, fp16, fp32, 1, 1, 4, is_rocm ? 8 : 16);
+REGISTER_FWD_LAUNCHER(18432, fp16, fp32, fp16, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(18432, bf16, bf16, bf16, fp32, 1, 1, 4, is_rocm ? 8 : 16);
+REGISTER_FWD_LAUNCHER(18432, bf16, fp32, bf16, fp32, 1, 1, 4, is_rocm ? 8 : 16);
+
+REGISTER_FWD_LAUNCHER(20480, fp32, fp32, fp32, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(20480, fp16, fp16, fp16, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(20480, fp16, fp32, fp16, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(20480, bf16, bf16, bf16, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(20480, bf16, fp32, bf16, fp32, 1, 1, 4, 16);
+
+REGISTER_FWD_LAUNCHER(24576, fp32, fp32, fp32, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(24576, fp16, fp16, fp16, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(24576, fp16, fp32, fp16, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(24576, bf16, bf16, bf16, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(24576, bf16, fp32, bf16, fp32, 1, 1, 4, 16);
+
+REGISTER_FWD_LAUNCHER(25600, fp32, fp32, fp32, fp32, 1, 1, 4,  4);
+REGISTER_FWD_LAUNCHER(25600, fp16, fp16, fp16, fp32, 1, 1, 4,  is_rocm ? 4 : 8);
+REGISTER_FWD_LAUNCHER(25600, fp16, fp32, fp16, fp32, 1, 1, 4,  4);
+REGISTER_FWD_LAUNCHER(25600, bf16, bf16, bf16, fp32, 1, 1, 4,  is_rocm ? 4 : 8);
+REGISTER_FWD_LAUNCHER(25600, bf16, fp32, bf16, fp32, 1, 1, 4,  4);
+
+REGISTER_FWD_LAUNCHER(30720, fp32, fp32, fp32, fp32, 1, 1, 4,  4);
+REGISTER_FWD_LAUNCHER(30720, fp16, fp16, fp16, fp32, 1, 1, 4,  4);
+REGISTER_FWD_LAUNCHER(30720, fp16, fp32, fp16, fp32, 1, 1, 4,  4);
+REGISTER_FWD_LAUNCHER(30720, bf16, bf16, bf16, fp32, 1, 1, 4,  4);
+REGISTER_FWD_LAUNCHER(30720, bf16, fp32, bf16, fp32, 1, 1, 4,  4);
+
+REGISTER_FWD_LAUNCHER(32768, fp32, fp32, fp32, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(32768, fp16, fp16, fp16, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(32768, fp16, fp32, fp16, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(32768, bf16, bf16, bf16, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(32768, bf16, fp32, bf16, fp32, 1, 1, 4, 16);
+
+REGISTER_FWD_LAUNCHER(40960, fp32, fp32, fp32, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(40960, fp16, fp16, fp16, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(40960, fp16, fp32, fp16, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(40960, bf16, bf16, bf16, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(40960, bf16, fp32, bf16, fp32, 1, 1, 4, 16);
+
+REGISTER_FWD_LAUNCHER(49152, fp32, fp32, fp32, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(49152, fp16, fp16, fp16, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(49152, fp16, fp32, fp16, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(49152, bf16, bf16, bf16, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(49152, bf16, fp32, bf16, fp32, 1, 1, 4, 16);
+
+REGISTER_FWD_LAUNCHER(65536, fp32, fp32, fp32, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(65536, fp16, fp16, fp16, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(65536, fp16, fp32, fp16, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(65536, bf16, bf16, bf16, fp32, 1, 1, 4, 16);
+REGISTER_FWD_LAUNCHER(65536, bf16, fp32, bf16, fp32, 1, 1, 4, 16);
+
+// REGISTER_FWD_LAUNCHER(10240, fp32, fp32, fp32, fp32, 2, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(10240, fp16, fp16, fp16, fp32, 1, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(10240, fp16, fp32, fp16, fp32, 1, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(10240, bf16, bf16, bf16, fp32, 1, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(10240, bf16, fp32, bf16, fp32, 1, 1, 4, 16);
+
+// REGISTER_FWD_LAUNCHER(12288, fp32, fp32, fp32, fp32, 2, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(12288, fp16, fp16, fp16, fp32, 2, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(12288, fp16, fp32, fp16, fp32, 2, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(12288, bf16, bf16, bf16, fp32, 2, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(12288, bf16, fp32, bf16, fp32, 2, 1, 4, 16);
+
+// REGISTER_FWD_LAUNCHER(12800, fp32, fp32, fp32, fp32, 2, 1, 4,  4);
+// REGISTER_FWD_LAUNCHER(12800, fp16, fp16, fp16, fp32, 2, 1, 4,  is_rocm ? 2 : 4);
+// REGISTER_FWD_LAUNCHER(12800, fp16, fp32, fp16, fp32, 2, 1, 4,  4);
+// REGISTER_FWD_LAUNCHER(12800, bf16, bf16, bf16, fp32, 2, 1, 4,  is_rocm ? 2 : 4);
+// REGISTER_FWD_LAUNCHER(12800, bf16, fp32, bf16, fp32, 2, 1, 4,  4);
+
+// REGISTER_FWD_LAUNCHER(14336, fp32, fp32, fp32, fp32, 2, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(14336, fp16, fp16, fp16, fp32, 2, 1, 4, is_rocm ? 8 : 16);
+// REGISTER_FWD_LAUNCHER(14336, fp16, fp32, fp16, fp32, 2, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(14336, bf16, bf16, bf16, fp32, 2, 1, 4, is_rocm ? 8 : 16);
+// REGISTER_FWD_LAUNCHER(14336, bf16, fp32, bf16, fp32, 2, 1, 4,  8);
+
+// REGISTER_FWD_LAUNCHER(15360, fp32, fp32, fp32, fp32, 2, 1, 4,  8);
+// REGISTER_FWD_LAUNCHER(15360, fp16, fp16, fp16, fp32, 2, 1, 4,  is_rocm ? 4 : 8);
+// REGISTER_FWD_LAUNCHER(15360, fp16, fp32, fp16, fp32, 2, 1, 4,  8);
+// REGISTER_FWD_LAUNCHER(15360, bf16, bf16, bf16, fp32, 2, 1, 4,  is_rocm ? 4 : 8);
+// REGISTER_FWD_LAUNCHER(15360, bf16, fp32, bf16, fp32, 2, 1, 4,  8);
+
+// REGISTER_FWD_LAUNCHER(16384, fp32, fp32, fp32, fp32, 2, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(16384, fp16, fp16, fp16, fp32, 2, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(16384, bf16, bf16, bf16, fp32, 2, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(16384, bf16, fp32, bf16, fp32, 2, 1, 4, 16);
+
+// REGISTER_FWD_LAUNCHER(18432, fp32, fp32, fp32, fp32, 4, 1, 4, is_rocm ? 8 : 16);
+// REGISTER_FWD_LAUNCHER(18432, fp16, fp16, fp16, fp32, 2, 1, 4, is_rocm ? 8 : 16);
+// REGISTER_FWD_LAUNCHER(18432, fp16, fp32, fp16, fp32, 2, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(18432, bf16, bf16, bf16, fp32, 2, 1, 4, is_rocm ? 8 : 16);
+// REGISTER_FWD_LAUNCHER(18432, bf16, fp32, bf16, fp32, 4, 1, 4, is_rocm ? 8 : 16);
+
+// REGISTER_FWD_LAUNCHER(20480, fp32, fp32, fp32, fp32, 2, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(20480, fp16, fp16, fp16, fp32, 2, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(20480, fp16, fp32, fp16, fp32, 2, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(20480, bf16, bf16, bf16, fp32, 2, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(20480, bf16, fp32, bf16, fp32, 2, 1, 4, 16);
+
+// REGISTER_FWD_LAUNCHER(24576, fp32, fp32, fp32, fp32, 4, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(24576, fp16, fp16, fp16, fp32, 2, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(24576, fp16, fp32, fp16, fp32, 2, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(24576, bf16, bf16, bf16, fp32, 2, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(24576, bf16, fp32, bf16, fp32, 2, 1, 4, 16);
+
+// REGISTER_FWD_LAUNCHER(25600, fp32, fp32, fp32, fp32, 4, 1, 4,  4);
+// REGISTER_FWD_LAUNCHER(25600, fp16, fp16, fp16, fp32, 2, 1, 4,  is_rocm ? 4 : 8);
+// REGISTER_FWD_LAUNCHER(25600, fp16, fp32, fp16, fp32, 4, 1, 4,  4);
+// REGISTER_FWD_LAUNCHER(25600, bf16, bf16, bf16, fp32, 2, 1, 4,  is_rocm ? 4 : 8);
+// REGISTER_FWD_LAUNCHER(25600, bf16, fp32, bf16, fp32, 4, 1, 4,  4);
+
+// REGISTER_FWD_LAUNCHER(30720, fp32, fp32, fp32, fp32, 4, 1, 4,  4);
+// REGISTER_FWD_LAUNCHER(30720, fp16, fp16, fp16, fp32, 4, 1, 4,  4);
+// REGISTER_FWD_LAUNCHER(30720, fp16, fp32, fp16, fp32, 4, 1, 4,  4);
+// REGISTER_FWD_LAUNCHER(30720, bf16, bf16, bf16, fp32, 4, 1, 4,  4);
+// REGISTER_FWD_LAUNCHER(30720, bf16, fp32, bf16, fp32, 4, 1, 4,  4);
+
+// REGISTER_FWD_LAUNCHER(32768, fp32, fp32, fp32, fp32, 4, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(32768, fp16, fp16, fp16, fp32, 4, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(32768, fp16, fp32, fp16, fp32, 4, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(32768, bf16, bf16, bf16, fp32, 4, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(32768, bf16, fp32, bf16, fp32, 4, 1, 4, 16);
+
+// REGISTER_FWD_LAUNCHER(40960, fp32, fp32, fp32, fp32, 4, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(40960, fp16, fp16, fp16, fp32, 4, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(40960, fp16, fp32, fp16, fp32, 4, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(40960, bf16, bf16, bf16, fp32, 4, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(40960, bf16, fp32, bf16, fp32, 4, 1, 4, 16);
+
+// REGISTER_FWD_LAUNCHER(49152, fp32, fp32, fp32, fp32, 8, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(49152, fp16, fp16, fp16, fp32, 4, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(49152, fp16, fp32, fp16, fp32, 4, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(49152, bf16, bf16, bf16, fp32, 4, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(49152, bf16, fp32, bf16, fp32, 4, 1, 4, 16);
+
+// REGISTER_FWD_LAUNCHER(65536, fp32, fp32, fp32, fp32, 8, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(65536, fp16, fp16, fp16, fp32, 8, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(65536, fp16, fp32, fp16, fp32, 8, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(65536, bf16, bf16, bf16, fp32, 8, 1, 4, 16);
+// REGISTER_FWD_LAUNCHER(65536, bf16, fp32, bf16, fp32, 8, 1, 4, 16);
\ No newline at end of file
diff --git a/apex/contrib/csrc/layer_norm/ln_kernel_traits.h b/apex/contrib/csrc/layer_norm/ln_kernel_traits.h
index ed745c5ee..bdd8d32ca 100644
--- a/apex/contrib/csrc/layer_norm/ln_kernel_traits.h
+++ b/apex/contrib/csrc/layer_norm/ln_kernel_traits.h
@@ -22,7 +22,11 @@ struct Kernel_traits_base {
 
     enum { HIDDEN_SIZE = HIDDEN_SIZE_ };
     enum { THREADS_PER_CTA = THREADS_PER_CTA_ };
+#ifdef USE_ROCM
+    enum { THREADS_PER_WARP = 64 };
+#else
     enum { THREADS_PER_WARP = 32 };
+#endif
 
 };
 
@@ -47,7 +51,7 @@ template<
 >
 struct Kernel_traits_finalize : public Base {
     enum { ROWS_PER_CTA = Base::THREADS_PER_CTA / Base::THREADS_PER_WARP };
-    static_assert((int) ROWS_PER_CTA <= (int) Base::THREADS_PER_WARP);
+    static_assert((int) ROWS_PER_CTA <= (int) Base::THREADS_PER_WARP, "ROWS_PER_CTA (WARP_M) needs to be less than warpsize.");
     // Bytes per global load from the input. 
     enum { BYTES_PER_LDG = BYTES_PER_LDG_ };
     // Number of elements fetched by a global load.
@@ -58,7 +62,7 @@ struct Kernel_traits_finalize : public Base {
     static_assert(Base::THREADS_PER_CTA == ROWS_PER_CTA * Base::THREADS_PER_WARP, "We assume one warp per row!");
     // The total number of BYTES_PER_LDG-wide words in a hidden vector.
     enum { COLS = HIDDEN_SIZE_ * sizeof(compute_t_) / BYTES_PER_LDG };
-    static_assert(COLS * BYTES_PER_LDG == HIDDEN_SIZE_ * sizeof(compute_t_));
+    static_assert(COLS * BYTES_PER_LDG == HIDDEN_SIZE_ * sizeof(compute_t_), "HIDDEN_SIZE needs to be a multiple of elements per ldg (BYTES_PER_LDG / sizeof(ctype)) in REGISTER_BWD_LAUNCHER.");
 
     // Shared memory size to transpose the CTA result.
     enum { SMEM_BYTES_TRANSPOSE = Base::THREADS_PER_CTA * BYTES_PER_LDG };
@@ -71,7 +75,7 @@ struct Kernel_traits_finalize : public Base {
     using Reducer = layer_norm::Reducer<compute_t_, 1, 1, 1>;
 
     // Condition for the whole CTA to participate in syncthreads.
-    static_assert(COLS % Base::THREADS_PER_WARP == 0);
+    static_assert(COLS % Base::THREADS_PER_WARP == 0, "HIDDEN_SIZE needs to be a multiple of BYTES_PER_LDG * warpSize in REGISTER_BWD_LAUNCHER");
     enum { CTAS = COLS / Base::THREADS_PER_WARP };
 }; 
 
@@ -84,10 +88,10 @@ template<
     typename output_t_,
     typename compute_t_,
     typename index_t_,
-    uint32_t HIDDEN_SIZE_, 
-    uint32_t CTAS_PER_ROW_, 
-    uint32_t WARPS_M_, 
-    uint32_t WARPS_N_, 
+    uint32_t HIDDEN_SIZE_,
+    uint32_t CTAS_PER_ROW_,
+    uint32_t WARPS_M_,
+    uint32_t WARPS_N_,
     uint32_t BYTES_PER_LDG_ = 16,
     typename Base = Kernel_traits_base<
         HIDDEN_SIZE_,
@@ -123,7 +127,7 @@ struct Kernel_traits : public Base {
     enum { BYTES_PER_ROW_PER_CTA = THREADS_PER_ROW * BYTES_PER_LDG };
     // Multi-row per CTA not supported for multi-CTA => no smem for WGRAD needed
     enum { SMEM_BYTES_WGRAD = CTAS_PER_ROW > 1 ? 0 : ROWS_PER_CTA * COLS * sizeof(compute_t) };
-    static_assert(WARPS_M == 1 || CTAS_PER_ROW == 1);
+    static_assert(WARPS_M == 1 || CTAS_PER_ROW == 1, "Multiple rows per CTA not supported for Multi-CTA.");
 
     using reduce_t = typename layer_norm::TypeToVec2<compute_t>::Type;
     using Reducer = layer_norm::Reducer<reduce_t, CTAS_PER_ROW, WARPS_M, WARPS_N>; 
@@ -135,18 +139,19 @@ struct Kernel_traits : public Base {
     using Ovec = layer_norm::Vec<output_t, NUM_ELTS>;
     using Wvec = layer_norm::Vec<weight_t, NUM_ELTS>;
     using Cvec = layer_norm::Vec<compute_t, NUM_ELTS>;
+    //using Tvec = layer_norm::Vec<int64, 1>;
     enum { ELTS_PER_LDG = BYTES_PER_LDG / sizeof(input_t) };
 
     // Assume that each thread can handle the same number of elements in the output and weights as in the input.
-    static_assert(sizeof(input_t) >= sizeof(output_t));
-    static_assert(sizeof(input_t) >= sizeof(weight_t));
+    static_assert(sizeof(input_t) >= sizeof(output_t), "input_t size should be >= output_t size.");
+    static_assert(sizeof(input_t) >= sizeof(weight_t), "input_t size should be >= weight_t size.");
     // The number of columns fetched per load from input: one per thread.
     enum { VEC_COLS_PER_LDG =  CTAS_PER_ROW * THREADS_PER_ROW };
     // The total number of vectorized loads/stores per hidden vector.
     enum { VEC_COLS = COLS / ELTS_PER_LDG };
     // The number of loads per thread for the input.
     enum { LDGS = VEC_COLS / VEC_COLS_PER_LDG };
-    static_assert(LDGS * VEC_COLS_PER_LDG  == VEC_COLS);
+    static_assert(LDGS * VEC_COLS_PER_LDG  == VEC_COLS, "LDGS needs to be a whole number. Check ln_kernel_trails.h for LDGS definition and update kernel params in REGISTER_BWD_LAUNCHER.");
     //static_assert(LDGS * BYTES_PER_ROW_PER_CTA * CTAS_PER_ROW == BYTES_PER_ROW, "");
 
     using Stats = layer_norm::Stats<compute_t, CTAS_PER_ROW, WARPS_M, WARPS_N>;
diff --git a/apex/contrib/csrc/layer_norm/ln_utils.cuh b/apex/contrib/csrc/layer_norm/ln_utils.cuh
index e18d36de7..609f2693b 100644
--- a/apex/contrib/csrc/layer_norm/ln_utils.cuh
+++ b/apex/contrib/csrc/layer_norm/ln_utils.cuh
@@ -2,14 +2,23 @@
 
 #include <cassert>
 
+#ifdef USE_ROCM
+#include <hip/hip_bfloat16.h>
+#include "hip_bfloat162.h"
+#else
 #include <cuda_bf16.h>
+#endif
 #include <cuda_fp16.h>
 
 #include "ln.h"
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
+#ifdef USE_ROCM
+constexpr uint32_t THREADS_PER_WARP = 64;
+#else
 constexpr uint32_t THREADS_PER_WARP = 32;
+#endif
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -88,7 +97,11 @@ struct Sum {
 
 template<typename T>
 inline __device__ T warp_shuffle_xor(const T & x, uint32_t idx){
+#ifdef USE_ROCM
+    return __shfl_xor(x, idx);
+#else
     return __shfl_xor_sync(uint32_t(-1), x, idx);
+#endif
 }
 
 template<>
@@ -98,7 +111,11 @@ inline __device__ float2 warp_shuffle_xor<float2>(const float2 & x, uint32_t idx
 
 template<typename T>
 inline __device__ T warp_shuffle_down(const T & x, uint32_t idx){
+#ifdef USE_ROCM
+    return __shfl_down(x, idx);
+#else
     return __shfl_down_sync(uint32_t(-1), x, idx);
+#endif
 }
 
 template<>
@@ -134,43 +151,43 @@ struct BytesToType {};
 template<>
 struct BytesToType<64> {
     using Type = uint16;
-    static_assert(sizeof(Type) == 64);
+    static_assert(sizeof(Type) == 64, "is sizeof(uint16) = 64?");
 };
 
 template<>
 struct BytesToType<32> {
     using Type = uint8;
-    static_assert(sizeof(Type) == 32);
+    static_assert(sizeof(Type) == 32, "is sizeof(uint8) = 32?");
 };
 
 template<>
 struct BytesToType<16> {
     using Type = uint4;
-    static_assert(sizeof(Type) == 16);
+    static_assert(sizeof(Type) == 16, "is sizeof(uint4) = 16?");
 };
 
 template<>
 struct BytesToType<8> {
     using Type = uint64_t;
-    static_assert(sizeof(Type) == 8);
+    static_assert(sizeof(Type) == 8, "is sizeof(uint64_t) = 8?");
 };
 
 template<>
 struct BytesToType<4> {
     using Type = uint32_t;
-    static_assert(sizeof(Type) == 4);
+    static_assert(sizeof(Type) == 4, "is sizeof(uint32_t) = 4?");
 };
 
 template<>
 struct BytesToType<2> {
     using Type = uint16_t;
-    static_assert(sizeof(Type) == 2);
+    static_assert(sizeof(Type) == 2, "is sizeof(uint16_t) = 2?");
 };
 
 template<>
 struct BytesToType<1> {
     using Type = uint8_t;
-    static_assert(sizeof(Type) == 1);
+    static_assert(sizeof(Type) == 1, "is sizeof(uint8_t) = 8?");
 };
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -188,10 +205,17 @@ struct TypeToVec2<half> {
     using Type = half2;
 };
 
+#ifdef USE_ROCM
+template<>
+struct TypeToVec2<hip_bfloat16> {
+    using Type = hip_bfloat162;
+};
+#else
 template<>
 struct TypeToVec2<nv_bfloat16> {
     using Type = nv_bfloat162;
 };
+#endif
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -241,6 +265,15 @@ struct Converter<float2, half2>{
     }
 };
 
+#ifdef USE_ROCM
+template<>
+struct Converter<float2, hip_bfloat162>{
+    static inline __device__ hip_bfloat162 convert(const float2 &x) {
+        hip_bfloat162 raw;
+        return raw;
+    }
+};
+#else
 template<>
 struct Converter<float2, nv_bfloat162>{
     static inline __device__ nv_bfloat162 convert(const float2 &x) {
@@ -258,6 +291,7 @@ struct Converter<float2, nv_bfloat162>{
 #endif
     }
 };
+#endif
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -331,10 +365,19 @@ struct InterCTASync {
     }
 
     inline __device__ void spin_wait_(int *barrier, int step, int expected) {
+#ifdef USE_ROCM
+        atomicAdd(barrier, step);
+        for( int found = -1; found != expected; ) {
+            found = *barrier;
+        }
+#else
+        // reduction operation on global and shared memory
+        // reduction with operand b and value in a, store result at location a, overwriting original value
         asm volatile("red.release.gpu.global.add.s32 [%0], %1;" ::"l"(barrier), "r"(step));
         for( int found = -1; found != expected; ) {
             asm volatile("ld.global.acquire.gpu.b32 %0, [%1];" : "=r"(found) : "l"(barrier));
         }
+#endif
     }
 
     inline __device__ void sync(){
@@ -399,7 +442,11 @@ struct Reducer : public Reducer<T, 1, WARPS_M, WARPS_N> {
             workspace[bidn_] = data;
         }
         inter_cta_.sync();
-        static_assert(CTAS_PER_ROW <= 32);
+#ifdef USE_ROCM
+        static_assert(CTAS_PER_ROW <= warpSize, "CTAS_PER_ROW <= warpSize.");
+#else
+        static_assert(CTAS_PER_ROW <= 32, "CTAS_PER_ROW <= 32.");
+#endif
         T total = Zeros<T>::get();
         if(this->lane_ < CTAS_PER_ROW){
             total = workspace[this->lane_];
@@ -425,7 +472,11 @@ struct Reducer<T, 1, WARPS_M, 1> {
     enum { SMEM_BYTES = 0 };
     enum { WORKSPACE_BYTES_PER_GROUP = 0 };
 
+#ifdef USE_ROCM
+    enum { THREADS_PER_WARP = 64 };
+#else
     enum { THREADS_PER_WARP = 32 };
+#endif
 
     template<typename Params>
     inline __device__ Reducer(Params & params, uint32_t bidm, uint32_t bidn, uint32_t warp_m, uint32_t warp_n, uint32_t lane, void * smem) 
@@ -473,7 +524,11 @@ struct Reducer<T, 1, WARPS_M, WARPS_N> : public Reducer<T, 1, WARPS_M, 1> {
     enum { SMEM_BYTES = Base::SMEM_BYTES + WARPS_M * WARPS_N * sizeof(T) * 2 };
     enum { WORKSPACE_BYTES_PER_GROUP = 0 };
 
+#ifdef USE_ROCM
+    enum { THREADS_PER_WARP = 64 };
+#else
     enum { THREADS_PER_WARP = 32 };
+#endif
 
     template<typename Params>
     inline __device__ Reducer(Params & params, uint32_t bidm, uint32_t bidn, uint32_t warp_m, uint32_t warp_n, uint32_t lane, void * smem) 
@@ -553,8 +608,13 @@ inline __device__ void warp_chan_upd_dynamic(T &m_a, T &m2_a, T &n_a, int num_ac
         m2_a = m2_ab;
     }
     // Intra-warp broadcast (only lane 0 has valid stats).
+#ifdef USE_ROCM
+    m_a = __shfl(m_a, 0);
+    m2_a = __shfl(m2_a, 0);
+#else
     m_a = __shfl_sync(uint32_t(-1), m_a, 0);
     m2_a = __shfl_sync(uint32_t(-1), m2_a, 0);
+#endif
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -602,7 +662,11 @@ struct Stats {
         T m2 = Zeros<T>::get();
 
         // Assume CTA group size in N less than 32, such that we can finalize with a single warp.
-        static_assert(CTAS_PER_ROW <= 32);
+#ifdef USE_ROCM
+        static_assert(CTAS_PER_ROW <= warpSize, "CTAS_PER_ROW <= warpSize.");
+#else
+        static_assert(CTAS_PER_ROW <= 32, "CTAS_PER_ROW <= 32.");
+#endif
 
         // Every warp does the final reduction locally. 
         if( lane_ < CTAS_PER_ROW ) {
@@ -667,7 +731,11 @@ struct Stats<T, 1, WARPS_M, WARPS_N> {
         T m2 = Zeros<T>::get();
 
         // Assume that there are less than 32 warps, such that we can finalize with a single warp
-        static_assert(WARPS_N <= 32);
+#ifdef USE_ROCM
+        static_assert(WARPS_N <= warpSize, "CTAS_PER_ROW <= warpSize.");
+#else
+        static_assert(WARPS_N <= 32, "CTAS_PER_ROW <= 32.");
+#endif
         if(lane < WARPS_N){
             stats_t result = smem[lane];
             n = N * THREADS_PER_WARP;
diff --git a/apex/contrib/layer_norm/layer_norm.py b/apex/contrib/layer_norm/layer_norm.py
index b084b1ace..8af044579 100644
--- a/apex/contrib/layer_norm/layer_norm.py
+++ b/apex/contrib/layer_norm/layer_norm.py
@@ -1,48 +1,69 @@
+import importlib
+import numbers
+
 import torch
 from torch.nn import init
+from torch.nn import functional as F
 
 from apex._autocast_utils import _cast_if_autocast_enabled
 import fast_layer_norm
+import fused_layer_norm_cuda
+
+#global fused_layer_norm_cuda
+#fused_layer_norm_cuda = None
 
 
 class FastLayerNormFN(torch.autograd.Function):
     @staticmethod
-    def forward(ctx, x, gamma, beta, epsilon):
+    def forward(ctx, x, gamma, beta, normalized_shape, epsilon):
         x = x.contiguous()
-        gamma = gamma.contiguous()
-        beta = beta.contiguous()
+        gamma_ = gamma.contiguous()
+        beta_ = beta.contiguous()
         hidden_size = gamma.numel()
         xmat = x.view((-1, hidden_size))
         ymat, mu, rsigma = fast_layer_norm.ln_fwd(xmat, gamma, beta, epsilon)
-        ctx.save_for_backward(x, gamma, mu, rsigma)
+        ctx.normalized_shape = normalized_shape
+        ctx.eps = epsilon
+        ctx.save_for_backward(x, gamma_, beta_, mu, rsigma)
         return ymat.view(x.shape)
 
     @staticmethod
     def backward(ctx, dy):
         # assert dy.is_contiguous()
-        dy = dy.contiguous()  # this happens!
-        x, gamma, mu, rsigma = ctx.saved_tensors
-
+        dy_ = dy.contiguous()  # this happens!
+        x, gamma, beta, mu, rsigma= ctx.saved_tensors
+        #x, gamma, mu, rsigma= ctx.saved_tensors
         hidden_size = gamma.numel()
         xmat = x.view((-1, hidden_size))
-        dymat = dy.view(xmat.shape)
-        dxmat, dgamma, dbeta, _, _ = fast_layer_norm.ln_bwd(dymat, xmat, mu, rsigma, gamma)
-        dx = dxmat.view(x.shape)
-        return dx, dgamma, dbeta, None
+        dymat = dy_.view(xmat.shape)
+        dxmat = dgamma = dbeta = None
+        if hidden_size > 12288:
+            #dxmat = fused_layer_norm_cuda.backward(dy_, mu, rsigma, xmat, ctx.normalized_shape, ctx.eps)
+            dxmat, dgamma, dbeta = fused_layer_norm_cuda.backward_affine(
+                dymat, mu, rsigma, xmat, ctx.normalized_shape, gamma, beta, ctx.eps
+            )
+            dx = dxmat.view(x.shape)
+        else:
+            dxmat, dgamma, dbeta, _, _ = fast_layer_norm.ln_bwd(dymat, xmat, mu, rsigma, gamma)
+            dx = dxmat.view(x.shape)
+        return dx, dgamma, dbeta, None, None
 
 
-def _fast_layer_norm(x, weight, bias, epsilon):
-    args = _cast_if_autocast_enabled(x, weight, bias, epsilon)
+def _fast_layer_norm(x, weight, bias, normalized_shape, epsilon):
+    args = _cast_if_autocast_enabled(x, weight, bias, normalized_shape, epsilon)
     with torch.cuda.amp.autocast(enabled=False):
         return FastLayerNormFN.apply(*args)
 
 
 class FastLayerNorm(torch.nn.Module):
-    def __init__(self, hidden_size, eps=1e-5):
+    def __init__(self, normalized_shape, eps=1e-5):
         super().__init__()
         self.epsilon = eps
-        self.weight = torch.nn.Parameter(torch.empty(hidden_size))
-        self.bias = torch.nn.Parameter(torch.empty(hidden_size))
+        if isinstance(normalized_shape, numbers.Integral):
+            normalized_shape = (normalized_shape,)
+        self.normalized_shape = torch.Size(normalized_shape)
+        self.weight = torch.nn.Parameter(torch.Tensor(*normalized_shape))
+        self.bias = torch.nn.Parameter(torch.Tensor(*normalized_shape))
         self.reset_parameters()
 
     def reset_parameters(self):
@@ -50,4 +71,7 @@ def reset_parameters(self):
         init.zeros_(self.bias)
 
     def forward(self, x):
-        return _fast_layer_norm(x, self.weight, self.bias, self.epsilon)
+        if not x.is_cuda:
+            return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.epsilon)
+        else:
+            return _fast_layer_norm(x, self.weight, self.bias, self.normalized_shape, self.epsilon)
diff --git a/apex/contrib/test/layer_norm/test_fast_layer_norm.py b/apex/contrib/test/layer_norm/test_fast_layer_norm.py
index 089ec35d2..11a309bf9 100644
--- a/apex/contrib/test/layer_norm/test_fast_layer_norm.py
+++ b/apex/contrib/test/layer_norm/test_fast_layer_norm.py
@@ -6,6 +6,7 @@
 import torch
 
 import fast_layer_norm as fln
+import apex
 from apex.contrib.layer_norm.layer_norm import FastLayerNorm
 
 
@@ -191,6 +192,89 @@ def check_err(x, relerr):
         for x, re in zip([z, mu, rs, dx, dg, db], [re_z, re_mu, re_rs, re_dx, re_dg, re_db])
     ]
 
+def test_withfused(S, B, hidden_size, itype=fp32, wtype=fp32, ctype=fp32):
+
+    seed = 1243
+    time = 0
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    fwd_thresholds_fp16 = dict(rtol=1e-3, atol=1e-3)
+    bwd_thresholds_fp16 = dict(rtol=1e-3, atol=1e-3)
+    fwd_thresholds_fp32 = dict(rtol=1e-6, atol=1e-6)
+    bwd_thresholds_fp32 = dict(rtol=1e-6, atol=1e-6)
+    fwd_thresholds_bf16 = dict(rtol=1e-4, atol=1e-4)
+    bwd_thresholds_bf16 = dict(rtol=1e-4, atol=1e-4)
+
+    otype = wtype
+    print("========================================================")
+    print(f"S={S} B={B} Hidden={hidden_size} {itype} {wtype}")
+    print("--------------------------------------------------------")
+
+    x = torch.randn(S * B, hidden_size, dtype=itype, device=device)
+    gamma = torch.randn(hidden_size, dtype=wtype, device=device) * 0.2
+    beta = torch.randn(hidden_size, dtype=wtype, device=device) * 0.2
+    epsilon = 1e-5
+
+    x.requires_grad = True
+    gamma.requires_grad = True
+    beta.requires_grad = True
+
+    mu_ref = x.mean(1, dtype=ctype, keepdim=True)
+    v = torch.square(x - mu_ref).mean(1, dtype=ctype, keepdim=True)
+    rs_ref = torch.rsqrt(v + epsilon)
+    y_ref = rs_ref * (x.to(ctype) - mu_ref)
+    z_ref = (gamma.unsqueeze(0) * (y_ref).to(otype) + beta.unsqueeze(0)).to(otype)
+
+    mu_ref = mu_ref.flatten()
+    rs_ref = rs_ref.flatten()
+
+    dz = torch.randn_like(z_ref)
+
+    # z_ref.backward(dz)
+    # dx_ref = x.grad
+    # dgamma_ref = gamma.grad
+    # dbeta_ref = beta.grad
+    dx_ref, dg_ref, db_ref = backward_(dz, x, mu_ref, rs_ref, gamma)
+
+    torch.cuda.manual_seed(seed)
+    x = torch.randn(S * B, hidden_size, dtype=itype, device=device)
+    x_cpu = x.detach().requires_grad_(True)
+    gamma_cpu = torch.randn(hidden_size, dtype=wtype, device=device) * 0.2
+    beta_cpu = torch.randn(hidden_size, dtype=wtype, device=device) * 0.2
+    x_cuda = x.to(device="cuda", dtype=itype).detach().requires_grad_(True)
+    gamma_cuda = gamma_cpu.to(device="cuda", dtype=wtype).detach().requires_grad_(True)
+    beta_cuda = beta_cpu.to(device="cuda", dtype=wtype).detach().requires_grad_(True)
+    x_cpu_ = x_cpu.contiguous()
+    x_cuda_ = x_cuda.contiguous()
+    
+    module_cpu_  = apex.normalization.MixedFusedLayerNorm(normalized_shape=[hidden_size]).to(device="cuda", dtype=itype)
+    module_cuda_ = FastLayerNorm(normalized_shape=[hidden_size]).to(device="cuda", dtype=itype)
+    
+    out_cpu_ = module_cpu_(x_cpu_)
+    gO = torch.rand_like(out_cpu_)
+    out_cpu_.backward(gO)
+
+    # x_ = x_.to(device="cuda", dtype=itype)
+    out_cuda_ = module_cuda_(x_cuda_)
+    gO = gO.to(device="cuda", dtype=itype)
+    out_cuda_.backward(gO)
+    
+    if itype == fp16:
+        print(itype, "out test : ", 
+                torch.testing.assert_close(out_cpu_.to(device="cuda", dtype=itype), out_cuda_, **fwd_thresholds_fp16))
+        print(itype, "grad test : ", 
+                torch.testing.assert_close(x_cpu_.grad.to(device="cuda", dtype=itype), x_cuda_.grad, **bwd_thresholds_fp16))
+    elif itype == bf16:
+        print(itype, "out test : ", 
+                torch.testing.assert_close(out_cpu_.to(device="cuda", dtype=itype), out_cuda_, **fwd_thresholds_bf16))
+        print(itype, "grad test : ", 
+                torch.testing.assert_close(x_cpu_.grad.to(device="cuda", dtype=itype), x_cuda_.grad, **bwd_thresholds_bf16))
+    else:
+        print(itype, "out test : ", 
+                torch.testing.assert_close(out_cpu_.to(device="cuda", dtype=itype), out_cuda_, **fwd_thresholds_fp32))
+        print(itype, "grad test : ", 
+                torch.testing.assert_close(x_cpu_.grad.to(device="cuda", dtype=itype), x_cuda_.grad, **bwd_thresholds_fp32))
+
 
 class TestFastLayerNorm(unittest.TestCase):
     def assertAll(self, l):
@@ -232,11 +316,18 @@ def test_all_configs(self):
 
         for h in hidden_sizes:
             with self.subTest(f"hidden_size={h}"):
-                self.assertAll(test_(256, 2, h, fp32, fp32))
-                self.assertAll(test_(256, 2, h, fp16, fp16))
-                self.assertAll(test_(256, 2, h, fp32, fp16))
-                self.assertAll(test_(256, 2, h, bf16, bf16))
-                self.assertAll(test_(256, 2, h, fp32, bf16))
+                if h > 12288:
+                    self.assertAll(test_withfused(256, 2, h, fp32, fp32))
+                    self.assertAll(test_withfused(256, 2, h, fp16, fp16))
+                    self.assertAll(test_withfused(256, 2, h, fp32, fp16))
+                    # self.assertAll(test_withfused(256, 2, h, bf16, bf16))
+                    # self.assertAll(test_withfused(256, 2, h, fp32, bf16))
+                else:
+                    self.assertAll(test_(256, 2, h, fp32, fp32))
+                    self.assertAll(test_(256, 2, h, fp16, fp16))
+                    self.assertAll(test_(256, 2, h, fp32, fp16))
+                    # self.assertAll(test_(256, 2, h, bf16, bf16))
+                    # self.assertAll(test_(256, 2, h, fp32, bf16))
 
     def test_run_benchmark(self):
         for (S, B, hidden_size, runs) in (
diff --git a/setup.py b/setup.py
index 9a958760f..f1c34b304 100644
--- a/setup.py
+++ b/setup.py
@@ -45,7 +45,6 @@ def get_cuda_bare_metal_version(cuda_dir):
 
     return raw_output, bare_metal_major, bare_metal_minor
 
-
 def check_cuda_torch_binary_vs_bare_metal(cuda_dir):
     raw_output, bare_metal_major, bare_metal_minor = get_cuda_bare_metal_version(cuda_dir)
     torch_binary_major = torch.version.cuda.split(".")[0]
@@ -428,25 +427,33 @@ def check_if_rocm_pytorch():
 if os.path.exists(os.path.join(torch_dir, "include", "ATen", "CUDAGeneratorImpl.h")):
     generator_flag = ["-DOLD_GENERATOR_PATH"]
 
-if "--fast_layer_norm" in sys.argv:
-    sys.argv.remove("--fast_layer_norm")
-    raise_if_cuda_home_none("--fast_layer_norm")
+if "--fast_layer_norm" in sys.argv or "--cuda_ext" in sys.argv:
+    if "--fast_layer_norm" in sys.argv:
+        sys.argv.remove("--fast_layer_norm")
+
+    if not IS_ROCM_PYTORCH:
+        raise_if_cuda_home_none("--fast_layer_norm")
     # Check, if CUDA11 is installed for compute capability 8.0
     cc_flag = []
-    _, bare_metal_major, _ = get_cuda_bare_metal_version(CUDA_HOME)
-    if int(bare_metal_major) >= 11:
-        cc_flag.append("-gencode")
-        cc_flag.append("arch=compute_80,code=sm_80")
-
-    if CUDA_HOME is None and not IS_ROCM_PYTORCH:
-        raise RuntimeError("--fast_layer_norm was requested, but nvcc was not found.  Are you sure your environment has nvcc available?  If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")
-    else:
-        # Check, if CUDA11 is installed for compute capability 8.0
-        cc_flag = []
+    if not IS_ROCM_PYTORCH:
         _, bare_metal_major, _ = get_cuda_bare_metal_version(CUDA_HOME)
         if int(bare_metal_major) >= 11:
-            cc_flag.append('-gencode')
-            cc_flag.append('arch=compute_80,code=sm_80')
+            cc_flag.append("-gencode")
+            cc_flag.append("arch=compute_80,code=sm_80")
+
+    nvcc_args_fast_layer_norm = ['-maxrregcount=50', '-O3', '--use_fast_math'] + version_dependent_macros
+    hipcc_args_fast_layer_norm = ['-O3', '-U__HIP_NO_HALF_OPERATORS__', '-U__HIP_NO_HALF_CONVERSIONS__'] + version_dependent_macros
+    
+    ext_modules.append(
+        CUDAExtension(name='fast_layer_norm',
+                      sources=['apex/contrib/csrc/layer_norm/ln_api.cpp',
+                               'apex/contrib/csrc/layer_norm/ln_bwd_semi_cuda_kernel.cu',
+                               'apex/contrib/csrc/layer_norm/ln_fwd_cuda_kernel.cu',
+                              ],
+                      include_dirs=[os.path.join(this_dir, 'csrc'),
+                                    os.path.join(this_dir, 'apex/contrib/csrc/layer_norm')],
+                      extra_compile_args={'cxx': ['-O3'] + version_dependent_macros,
+                                          'nvcc': nvcc_args_fast_layer_norm if not IS_ROCM_PYTORCH else hipcc_args_fast_layer_norm}))
 
 if "--fmha" in sys.argv:
     sys.argv.remove("--fmha")