diff --git a/src/enqueue.cc b/src/enqueue.cc
index 28419c959..6329e3554 100644
--- a/src/enqueue.cc
+++ b/src/enqueue.cc
@@ -69,8 +69,10 @@ int ncclGetKernelIndex(struct ncclComm* comm) {
 #endif
   hipDeviceProp_t devProp;
   CUDACHECK(hipGetDeviceProperties(&devProp, comm->cudaDev));
+  // Use UNROLL=2 for gfx908 and for gfx94x with more than 96 CUs;
+  // otherwise, use UNROLL=4
   if(IsArchMatch(devProp.gcnArchName, "gfx908") || (IsArchMatch(devProp.gcnArchName, "gfx94")
-    && devProp.multiProcessorCount > 80))
+    && devProp.multiProcessorCount > 96))
     return start_idx;
   else
     return start_idx + 1;
diff --git a/src/graph/connect.cc b/src/graph/connect.cc
index 328407246..7520faa36 100644
--- a/src/graph/connect.cc
+++ b/src/graph/connect.cc
@@ -626,7 +626,9 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
   // Only use full MAXCHANNELS for gfx94x
   int maxChannels = IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") ?
-    ((comm->topo->nodes[GPU].nodes[0].gpu.cu == 80 || comm->topo->nodes[GPU].nodes[0].gpu.cu == 20 || comm->topo->nodes[GPU].nodes[0].gpu.cu == 38)
+    ((comm->topo->nodes[GPU].nodes[0].gpu.cu == 38 ||
+      comm->topo->nodes[GPU].nodes[0].gpu.cu == 80 || comm->topo->nodes[GPU].nodes[0].gpu.cu == 20 ||
+      comm->topo->nodes[GPU].nodes[0].gpu.cu == 96 || comm->topo->nodes[GPU].nodes[0].gpu.cu == 24)
     ? comm->topo->nodes[GPU].nodes[0].gpu.cu : MAXCHANNELS) : 2*CHANNEL_LIMIT;
   if (graphs[NCCL_ALGO_RING]->nIntraChannels > 0 || comm->nNodes > 1) {
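
For context, the first hunk raises the CU threshold at which gfx94x parts select the UNROLL=2 kernel table instead of the UNROLL=4 one. Below is a minimal, self-contained sketch of that selection logic; pickKernelIndex and its parameters are hypothetical stand-ins for the real ncclGetKernelIndex(), and IsArchMatch is approximated here by a plain prefix compare, which is an assumption about what RCCL's helper does.

#include <cstdio>
#include <cstring>

// Approximation (assumption) of RCCL's IsArchMatch(): true when the reported
// gcnArchName begins with the given prefix, e.g. "gfx942:sramecc+:xnack-"
// matches both "gfx94" and "gfx942".
static bool isArchMatch(const char* archName, const char* prefix) {
  return std::strncmp(archName, prefix, std::strlen(prefix)) == 0;
}

// Hypothetical mirror of the patched condition: gfx908, and gfx94x parts with
// more than 96 CUs, use the UNROLL=2 kernels at startIdx; everything else
// falls through to the UNROLL=4 kernels at startIdx + 1.
static int pickKernelIndex(const char* gcnArchName, int cuCount, int startIdx) {
  if (isArchMatch(gcnArchName, "gfx908") ||
      (isArchMatch(gcnArchName, "gfx94") && cuCount > 96))
    return startIdx;
  return startIdx + 1;
}

int main() {
  // A 304-CU gfx942 part picks UNROLL=2; a 24-CU gfx942 partition now
  // picks UNROLL=4 (it would have picked UNROLL=2 under the old > 80 test
  // only if it had more than 80 CUs).
  std::printf("%d\n", pickKernelIndex("gfx942:sramecc+:xnack-", 304, 0)); // 0
  std::printf("%d\n", pickKernelIndex("gfx942:sramecc+:xnack-", 24, 0));  // 1
  return 0;
}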
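The second hunk widens the whitelist of gfx94x CU counts for which the channel ceiling is pinned to the CU count itself, adding the 96- and 24-CU configurations. A compact restatement of the resulting decision is sketched below; computeMaxChannels is a hypothetical helper, and the two constants are illustrative assumptions standing in for RCCL's real MAXCHANNELS and CHANNEL_LIMIT, which are defined elsewhere in the tree.

#include <cstdio>

// Illustrative placeholders only; the real MAXCHANNELS and CHANNEL_LIMIT
// values come from RCCL's headers, not from this sketch.
static const int kMaxChannels  = 112;
static const int kChannelLimit = 16;

// Hypothetical helper restating the patched ternary in ncclTopoPostset():
// on gfx94x, the known part sizes (20, 24, 38, 80, 96 CUs) cap channels at
// the CU count, unknown sizes get the full kMaxChannels, and non-gfx94x
// parts keep the 2*kChannelLimit ceiling.
static int computeMaxChannels(bool isGfx94, int cuCount) {
  if (!isGfx94) return 2 * kChannelLimit;
  switch (cuCount) {
    case 20: case 24: case 38: case 80: case 96:
      return cuCount;
    default:
      return kMaxChannels;
  }
}

int main() {
  std::printf("%d\n", computeMaxChannels(true, 96));   // 96: capped at CU count
  std::printf("%d\n", computeMaxChannels(true, 304));  // 112: full kMaxChannels
  std::printf("%d\n", computeMaxChannels(false, 120)); // 32: 2*kChannelLimit
  return 0;
}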