From 359f3eb0e4e06fa05e40ac5df7f019462270f7d4 Mon Sep 17 00:00:00 2001
From: sstamenk <strahinja.stamenkovic@amd.com>
Date: Thu, 30 Jan 2025 23:13:03 +0100
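Subject: [PATCH] Add gfx1101 support

Register gfx1101 next to gfx1100/gfx1102 in the ROCm architecture lists:
the FP16-capable device set in auto_mixed_precision.cc, the NHWC layout
check in generic_layout_optimizer.cc, the atomic-add preprocessor guards
in gpu_device_functions.h, the "sramecc-" feature mapping in
gpu_backend_lib.cc, and gfx11_rx7900() / kSupportedGfxVersions in
device_description.h. Add gfx1101 to the GPU_DEVICE_TARGETS defaults of
the ROCm Dockerfiles and setup.rocm.sh.

While here, order the architecture lists numerically, replace tab
indentation with spaces in generic_layout_optimizer.cc, and add
gfx1200/gfx1201 to Dockerfile.rocm.ub24 so that it matches the other
Dockerfiles.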
---
 .../optimizers/auto_mixed_precision.cc        |  5 +++--
 .../optimizers/generic_layout_optimizer.cc    | 16 +++++++-------
 tensorflow/core/util/gpu_device_functions.h   |  6 +++---
 tensorflow/tools/ci_build/Dockerfile.rocm     |  2 +-
 .../Dockerfile.rocm.manylinux2014             |  2 +-
 .../Dockerfile.rocm.manylinux_2_28            |  2 +-
 .../Dockerfile.rocm.ub20                      |  2 +-
 .../Dockerfile.rocm.ub22                      |  2 +-
 .../Dockerfile.rocm.ub24                      |  2 +-
 .../tf_sig_build_dockerfiles/setup.rocm.sh    |  2 +-
 .../gpu/llvm_gpu_backend/gpu_backend_lib.cc   |  3 ++-
 .../xla/stream_executor/device_description.h  | 21 ++++++++++---------
 12 files changed, 35 insertions(+), 30 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc b/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc
index 089b68d42f924b..7b75f3cbe9584b 100644
--- a/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc
+++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc
@@ -116,8 +116,9 @@ bool HasFastFP16Support(const DeviceProperties& props) {
 #elif TENSORFLOW_USE_ROCM
   absl::flat_hash_set<std::string> FP16SupportedDevices = {
       {"gfx906"}, {"gfx908"}, {"gfx90a"}, {"gfx910"}, {"gfx940"}, {"gfx941"},
-      {"gfx942"}, {"gfx1010"}, {"gfx1012"}, {"gfx1030"}, {"gfx1100"},
-      {"gfx1200"},{"gfx1201"}, {"gfx1102"}
+      {"gfx942"}, {"gfx1010"}, {"gfx1012"}, {"gfx1030"},
+      {"gfx1100"}, {"gfx1101"}, {"gfx1102"},
+      {"gfx1200"}, {"gfx1201"}
   };
   std::string gcnArchName = props.environment().at("architecture");
   std::vector<std::string> gpu_arch = absl::StrSplit(gcnArchName, ":");
diff --git a/tensorflow/core/grappler/optimizers/generic_layout_optimizer.cc b/tensorflow/core/grappler/optimizers/generic_layout_optimizer.cc
index 075f8e9e437596..a724a8085b4eb4 100644
--- a/tensorflow/core/grappler/optimizers/generic_layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/generic_layout_optimizer.cc
@@ -70,13 +70,15 @@ inline GpuStats GetNumGPUs(const Cluster& cluster) {
     bool is_enabled = se::gpu::UseNhwcLayoutForRocm();
     if ((compute_capability_it->second == "gfx908" ||
          compute_capability_it->second == "gfx90a" ||
-	 compute_capability_it->second == "gfx940" ||
-	 compute_capability_it->second == "gfx941" ||
-	 compute_capability_it->second == "gfx942" ||
-   compute_capability_it->second == "gfx1200" ||
-   compute_capability_it->second == "gfx1201" ||
-   compute_capability_it->second == "gfx1102") && is_enabled) {
-       gpu_stats.num_voltas++;
+         compute_capability_it->second == "gfx940" ||
+         compute_capability_it->second == "gfx941" ||
+         compute_capability_it->second == "gfx942" ||
+         compute_capability_it->second == "gfx1101" ||
+         compute_capability_it->second == "gfx1102" ||
+         compute_capability_it->second == "gfx1200" ||
+         compute_capability_it->second == "gfx1201") &&
+        is_enabled) {
+      gpu_stats.num_voltas++;
     }
 #endif
     if (compute_capability_it == device.second.environment().end()) {
diff --git a/tensorflow/core/util/gpu_device_functions.h b/tensorflow/core/util/gpu_device_functions.h
index 7d35df67d6b986..39c9d9d0bfb432 100644
--- a/tensorflow/core/util/gpu_device_functions.h
+++ b/tensorflow/core/util/gpu_device_functions.h
@@ -743,7 +743,7 @@ __device__ inline double GpuAtomicAdd(double* ptr, double value) {
 }
 #endif
 
-#if __gfx908__ || __gfx90a__ || __gfx940__ || __gfx941__ || __gfx942__ || __gfx1200__ || __gfx1201__ || __gfx1102__
+#if __gfx908__ || __gfx90a__ || __gfx940__ || __gfx941__ || __gfx942__ || __gfx1101__ || __gfx1102__ || __gfx1200__ || __gfx1201__
 
 #define ADDRSP1 __attribute__((address_space(1)))
 __device__ float
@@ -963,7 +963,7 @@ __device__ inline int64_t GpuAtomicMin(int64_t* ptr, int64_t value) {
 }
 #endif
 
-#if __gfx908__ || __gfx90a__ || __gfx940__ || __gfx941__ || __gfx942__ || __gfx1200__ || __gfx1201__ || __gfx1102__
+#if __gfx908__ || __gfx90a__ || __gfx940__ || __gfx941__ || __gfx942__ || __gfx1101__ || __gfx1102__ || __gfx1200__ || __gfx1201__
 // Low level instructions don't return. For now, assume that return value
 // is always unused.
 __device__ float GpuAtomicAdd(float* dst, float val) {
@@ -978,7 +978,7 @@ __device__ inline T GpuAtomicAddShared(T* ptr, T value) {
   return GpuAtomicAdd(ptr, value);
 }
 
-#if __gfx908__ || __gfx90a__ || __gfx940__ || __gfx941__ || __gfx942__ || __gfx1200__ || __gfx1201__ || __gfx1102__
+#if __gfx908__ || __gfx90a__ || __gfx940__ || __gfx941__ || __gfx942__ || __gfx1101__ || __gfx1102__ || __gfx1200__ || __gfx1201__
 __device__ float GpuAtomicAddShared(float* dst, float val) {
   atomicAdd(dst, val);
   return val;
diff --git a/tensorflow/tools/ci_build/Dockerfile.rocm b/tensorflow/tools/ci_build/Dockerfile.rocm
index 9e416399c8f206..e27fdf75b6f978 100644
--- a/tensorflow/tools/ci_build/Dockerfile.rocm
+++ b/tensorflow/tools/ci_build/Dockerfile.rocm
@@ -7,7 +7,7 @@ ARG ROCM_DEB_REPO=https://repo.radeon.com/rocm/apt/6.2/
 ARG ROCM_BUILD_NAME=ubuntu
 ARG ROCM_BUILD_NUM=main
 ARG ROCM_PATH=/opt/rocm/
-ARG GPU_DEVICE_TARGETS="gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx1100 gfx1200 gfx1201 gfx1102"
+ARG GPU_DEVICE_TARGETS="gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx1100 gfx1101 gfx1102 gfx1200 gfx1201"
 
 ARG DEBIAN_FRONTEND=noninteractive
 ENV TF_NEED_ROCM 1
diff --git a/tensorflow/tools/tf_sig_build_dockerfiles/Dockerfile.rocm.manylinux2014 b/tensorflow/tools/tf_sig_build_dockerfiles/Dockerfile.rocm.manylinux2014
index 08db953ce95727..bb754568d66c02 100644
--- a/tensorflow/tools/tf_sig_build_dockerfiles/Dockerfile.rocm.manylinux2014
+++ b/tensorflow/tools/tf_sig_build_dockerfiles/Dockerfile.rocm.manylinux2014
@@ -8,7 +8,7 @@ COPY setup.packages.rocm.cs7.sh setup.packages.rocm.cs7.sh
 COPY builder.packages.rocm.cs7.txt builder.packages.rocm.cs7.txt
 RUN /setup.packages.rocm.cs7.sh /builder.packages.rocm.cs7.txt
 
-ARG GPU_DEVICE_TARGETS="gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx1100 gfx1200 gfx1201 gfx1102"
+ARG GPU_DEVICE_TARGETS="gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx1100 gfx1101 gfx1102 gfx1200 gfx1201"
 ENV GPU_DEVICE_TARGETS=${GPU_DEVICE_TARGETS}
 
 # Install ROCM
diff --git a/tensorflow/tools/tf_sig_build_dockerfiles/Dockerfile.rocm.manylinux_2_28 b/tensorflow/tools/tf_sig_build_dockerfiles/Dockerfile.rocm.manylinux_2_28
index 5abf3782f27f64..2ce772270d375f 100644
--- a/tensorflow/tools/tf_sig_build_dockerfiles/Dockerfile.rocm.manylinux_2_28
+++ b/tensorflow/tools/tf_sig_build_dockerfiles/Dockerfile.rocm.manylinux_2_28
@@ -17,7 +17,7 @@ COPY setup.packages.rocm.el8.sh setup.packages.rocm.el8.sh
 COPY builder.packages.rocm.el8.txt builder.packages.rocm.el8.txt
 RUN /setup.packages.rocm.el8.sh /builder.packages.rocm.el8.txt
 
-ARG GPU_DEVICE_TARGETS="gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx1100 gfx1200 gfx1201 gfx1102"
+ARG GPU_DEVICE_TARGETS="gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx1100 gfx1101 gfx1102 gfx1200 gfx1201"
 ENV GPU_DEVICE_TARGETS=${GPU_DEVICE_TARGETS}
 
 # Install ROCM
diff --git a/tensorflow/tools/tf_sig_build_dockerfiles/Dockerfile.rocm.ub20 b/tensorflow/tools/tf_sig_build_dockerfiles/Dockerfile.rocm.ub20
index c99e425228d36b..6dddb73d1b49a6 100644
--- a/tensorflow/tools/tf_sig_build_dockerfiles/Dockerfile.rocm.ub20
+++ b/tensorflow/tools/tf_sig_build_dockerfiles/Dockerfile.rocm.ub20
@@ -2,7 +2,7 @@
 FROM ubuntu:20.04
 ################################################################################
 
-ARG GPU_DEVICE_TARGETS="gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx1100 gfx1200 gfx1201 gfx1102"
+ARG GPU_DEVICE_TARGETS="gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx1100 gfx1101 gfx1102 gfx1200 gfx1201"
 ENV GPU_DEVICE_TARGETS=${GPU_DEVICE_TARGETS}
 
 # Install build dependencies
diff --git a/tensorflow/tools/tf_sig_build_dockerfiles/Dockerfile.rocm.ub22 b/tensorflow/tools/tf_sig_build_dockerfiles/Dockerfile.rocm.ub22
index 1249aef65406c7..a3a8edddd0f909 100644
--- a/tensorflow/tools/tf_sig_build_dockerfiles/Dockerfile.rocm.ub22
+++ b/tensorflow/tools/tf_sig_build_dockerfiles/Dockerfile.rocm.ub22
@@ -2,7 +2,7 @@
 FROM ubuntu:22.04
 ################################################################################
 
-ARG GPU_DEVICE_TARGETS="gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx1100 gfx1200 gfx1201 gfx1102"
+ARG GPU_DEVICE_TARGETS="gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx1100 gfx1101 gfx1102 gfx1200 gfx1201"
 ENV GPU_DEVICE_TARGETS=${GPU_DEVICE_TARGETS}
 
 # Install build dependencies
diff --git a/tensorflow/tools/tf_sig_build_dockerfiles/Dockerfile.rocm.ub24 b/tensorflow/tools/tf_sig_build_dockerfiles/Dockerfile.rocm.ub24
index 0285ba7a56601c..45c6bbe00030fe 100644
--- a/tensorflow/tools/tf_sig_build_dockerfiles/Dockerfile.rocm.ub24
+++ b/tensorflow/tools/tf_sig_build_dockerfiles/Dockerfile.rocm.ub24
@@ -2,7 +2,7 @@
 FROM ubuntu:24.04
 ################################################################################
 
-ARG GPU_DEVICE_TARGETS="gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx1100 gfx1101 gfx1102"
+ARG GPU_DEVICE_TARGETS="gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx1100 gfx1101 gfx1102 gfx1200 gfx1201"
 ENV GPU_DEVICE_TARGETS=${GPU_DEVICE_TARGETS}
 
 # Install build dependencies
diff --git a/tensorflow/tools/tf_sig_build_dockerfiles/setup.rocm.sh b/tensorflow/tools/tf_sig_build_dockerfiles/setup.rocm.sh
index 266d0399e003c3..668ce5f5ccb777 100755
--- a/tensorflow/tools/tf_sig_build_dockerfiles/setup.rocm.sh
+++ b/tensorflow/tools/tf_sig_build_dockerfiles/setup.rocm.sh
@@ -145,7 +145,7 @@ then
   echo "build:rocm_base --copt=-fclang-abi-compat=17" >> /etc/bazel.bazelrc
 fi
 
-GPU_DEVICE_TARGETS=${GPU_DEVICE_TARGETS:-"gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx1100 gfx1200 gfx1201"}
+GPU_DEVICE_TARGETS=${GPU_DEVICE_TARGETS:-"gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx1100 gfx1101 gfx1102 gfx1200 gfx1201"}
 
 echo $ROCM_VERSION
 echo $ROCM_REPO
diff --git a/third_party/xla/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc b/third_party/xla/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
index 0480d6fb77745b..432bd35a731dee 100644
--- a/third_party/xla/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
+++ b/third_party/xla/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
@@ -974,7 +974,8 @@ std::string MapGCNArchNameTokenToFeatureStr(const std::string& token,
     return "+sramecc";
   } else if (token == "sramecc-") {
     if (gfx == "gfx90a" || gfx == "gfx940" || gfx == "gfx941" ||
-        gfx == "gfx942" ||  gfx == "gfx1200" || gfx == "gfx1201" || gfx == "gfx1102")
+        gfx == "gfx942" || gfx == "gfx1101" || gfx == "gfx1102" ||
+        gfx == "gfx1200" || gfx == "gfx1201")
       return "";
     return "-sramecc";
   } else if (token == "xnack+") {
diff --git a/third_party/xla/xla/stream_executor/device_description.h b/third_party/xla/xla/stream_executor/device_description.h
index 85282acb8cedc8..c36ed5e990ab43 100644
--- a/third_party/xla/xla/stream_executor/device_description.h
+++ b/third_party/xla/xla/stream_executor/device_description.h
@@ -188,8 +188,9 @@ class RocmComputeCapability {
 
   bool gfx10_rx69xx() const { return gfx_version() == "gfx1030"; }
 
-  bool gfx11_rx7900() const { return (gfx_version() == "gfx1100") ||
-                                      (gfx_version() == "gfx1102"); }
+  bool gfx11_rx7900() const { return (gfx_version() == "gfx1100" ||
+                                      gfx_version() == "gfx1101" ||
+                                      gfx_version() == "gfx1102"); }
 
   bool gfx12_rx8900() const { return ((gfx_version() == "gfx1200") || 
                                       (gfx_version() == "gfx1201")); }
@@ -237,14 +238,14 @@ class RocmComputeCapability {
   std::string gcn_arch_name_ = "gfx000";  // default to invalid arch.
 
   static constexpr absl::string_view kSupportedGfxVersions[]{
-      "gfx900",                       // MI25
-      "gfx906",                       // MI50 / MI60
-      "gfx908",                       // MI100
-      "gfx90a",                       // MI200
-      "gfx940",  "gfx941", "gfx942",  // MI300
-      "gfx1030",                      // RX68xx / RX69xx
-      "gfx1100", "gfx1102",            // RX7900
-      "gfx1200", "gfx1201",            // RX8900
+      "gfx900",                         // MI25
+      "gfx906",                         // MI50 / MI60
+      "gfx908",                         // MI100
+      "gfx90a",                         // MI200
+      "gfx940",  "gfx941", "gfx942",    // MI300
+      "gfx1030",                        // RX68xx / RX69xx
+      "gfx1100", "gfx1101", "gfx1102",  // RX7900
+      "gfx1200", "gfx1201",             // RX8900
   };
 };