diff --git a/src/enqueue.cc b/src/enqueue.cc
index 28419c959..6329e3554 100644
--- a/src/enqueue.cc
+++ b/src/enqueue.cc
@@ -69,8 +69,10 @@ int ncclGetKernelIndex(struct ncclComm* comm) {
 #endif
   hipDeviceProp_t devProp;
   CUDACHECK(hipGetDeviceProperties(&devProp, comm->cudaDev));
+  // Use UNROLL=2 for gfx908 and for gfx94x with more than 96 CUs;
+  // otherwise, use UNROLL=4
   if(IsArchMatch(devProp.gcnArchName, "gfx908") || (IsArchMatch(devProp.gcnArchName, "gfx94")
-    && devProp.multiProcessorCount > 80))
+    && devProp.multiProcessorCount > 96))
     return start_idx;
   else
     return start_idx + 1;
diff --git a/src/graph/connect.cc b/src/graph/connect.cc
index 328407246..7520faa36 100644
--- a/src/graph/connect.cc
+++ b/src/graph/connect.cc
@@ -626,7 +626,9 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
   // Only use full MAXCHANNELS for gfx94x
   int maxChannels = IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") ?
-    ((comm->topo->nodes[GPU].nodes[0].gpu.cu == 80 || comm->topo->nodes[GPU].nodes[0].gpu.cu == 20 || comm->topo->nodes[GPU].nodes[0].gpu.cu == 38)
+    ((comm->topo->nodes[GPU].nodes[0].gpu.cu == 38 ||
+      comm->topo->nodes[GPU].nodes[0].gpu.cu == 80 || comm->topo->nodes[GPU].nodes[0].gpu.cu == 20 ||
+      comm->topo->nodes[GPU].nodes[0].gpu.cu == 96 || comm->topo->nodes[GPU].nodes[0].gpu.cu == 24)
     ? comm->topo->nodes[GPU].nodes[0].gpu.cu : MAXCHANNELS) : 2*CHANNEL_LIMIT;
   if (graphs[NCCL_ALGO_RING]->nIntraChannels > 0 || comm->nNodes > 1) {
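
For context, the first hunk raises the CU threshold at which gfx94x parts select the UNROLL=2 kernel table instead of the UNROLL=4 one. Below is a minimal, self-contained sketch of that selection logic; pickKernelIndex and its parameters are hypothetical stand-ins for the real ncclGetKernelIndex(), and IsArchMatch is approximated here by a plain prefix compare, which is an assumption about what RCCL's helper does.

#include <cstdio>
#include <cstring>

// Approximation (assumption) of RCCL's IsArchMatch(): true when the reported
// gcnArchName begins with the given prefix, e.g. "gfx942:sramecc+:xnack-"
// matches both "gfx94" and "gfx942".
static bool isArchMatch(const char* archName, const char* prefix) {
  return std::strncmp(archName, prefix, std::strlen(prefix)) == 0;
}

// Hypothetical mirror of the patched condition: gfx908, and gfx94x parts with
// more than 96 CUs, use the UNROLL=2 kernels at startIdx; everything else
// falls through to the UNROLL=4 kernels at startIdx + 1.
static int pickKernelIndex(const char* gcnArchName, int cuCount, int startIdx) {
  if (isArchMatch(gcnArchName, "gfx908") ||
      (isArchMatch(gcnArchName, "gfx94") && cuCount > 96))
    return startIdx;
  return startIdx + 1;
}

int main() {
  // A 304-CU gfx942 part picks UNROLL=2; a 24-CU gfx942 partition now
  // picks UNROLL=4 (it would have picked UNROLL=2 under the old > 80 test
  // only if it had more than 80 CUs).
  std::printf("%d\n", pickKernelIndex("gfx942:sramecc+:xnack-", 304, 0)); // 0
  std::printf("%d\n", pickKernelIndex("gfx942:sramecc+:xnack-", 24, 0));  // 1
  return 0;
}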
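The second hunk widens the whitelist of gfx94x CU counts for which the channel ceiling is pinned to the CU count itself, adding the 96- and 24-CU configurations. A compact restatement of the resulting decision is sketched below; computeMaxChannels is a hypothetical helper, and the two constants are illustrative assumptions standing in for RCCL's real MAXCHANNELS and CHANNEL_LIMIT, which are defined elsewhere in the tree.

#include <cstdio>

// Illustrative placeholders only; the real MAXCHANNELS and CHANNEL_LIMIT
// values come from RCCL's headers, not from this sketch.
static const int kMaxChannels  = 112;
static const int kChannelLimit = 16;

// Hypothetical helper restating the patched ternary in ncclTopoPostset():
// on gfx94x, the known part sizes (20, 24, 38, 80, 96 CUs) cap channels at
// the CU count, unknown sizes get the full kMaxChannels, and non-gfx94x
// parts keep the 2*kChannelLimit ceiling.
static int computeMaxChannels(bool isGfx94, int cuCount) {
  if (!isGfx94) return 2 * kChannelLimit;
  switch (cuCount) {
    case 20: case 24: case 38: case 80: case 96:
      return cuCount;
    default:
      return kMaxChannels;
  }
}

int main() {
  std::printf("%d\n", computeMaxChannels(true, 96));   // 96: capped at CU count
  std::printf("%d\n", computeMaxChannels(true, 304));  // 112: full kMaxChannels
  std::printf("%d\n", computeMaxChannels(false, 120)); // 32: 2*kChannelLimit
  return 0;
}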