diff --git a/test/AllReduceTests.cpp b/test/AllReduceTests.cpp index 5d7d8cdfc..9eb8cdc5a 100644 --- a/test/AllReduceTests.cpp +++ b/test/AllReduceTests.cpp @@ -193,7 +193,7 @@ namespace RcclUnitTesting for (int isMultiProcess : testBed.ev.GetIsMultiProcessList()) { int const numProcesses = isMultiProcess ? totalRanks : 1; - testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks)); + testBed.InitComms(testBed.GetDeviceIdsList(numProcesses, totalRanks)); for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx) { diff --git a/test/AllToAllVTests.cpp b/test/AllToAllVTests.cpp index b8d1afd96..3941a5413 100644 --- a/test/AllToAllVTests.cpp +++ b/test/AllToAllVTests.cpp @@ -73,7 +73,7 @@ namespace RcclUnitTesting for (int isMultiProcess : testBed.ev.GetIsMultiProcessList()) { int const numProcesses = isMultiProcess ? totalRanks : 1; - testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks)); + testBed.InitComms(testBed.GetDeviceIdsList(numProcesses, totalRanks)); // Prepare AllToAllV options std::vector numInputElements; @@ -130,7 +130,7 @@ namespace RcclUnitTesting for (int isMultiProcess : testBed.ev.GetIsMultiProcessList()) { int const numProcesses = isMultiProcess ? totalRanks : 1; - testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks)); + testBed.InitComms(testBed.GetDeviceIdsList(numProcesses, totalRanks)); // Prepare AllToAllV options std::vector numInputElements; diff --git a/test/GroupCallTests.cpp b/test/GroupCallTests.cpp index 6bc01be05..5ba5787ad 100644 --- a/test/GroupCallTests.cpp +++ b/test/GroupCallTests.cpp @@ -28,7 +28,7 @@ namespace RcclUnitTesting { // Test either single process all GPUs, or 1 process per GPU int const numProcesses = isMultiProcess ? 
totalRanks : 1; - testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup); + testBed.InitComms(testBed.GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup); if (testBed.ev.showNames) INFO("%s %d-ranks GroupCall Identical\n", isMultiProcess ? "MP" : "SP", totalRanks); @@ -84,7 +84,7 @@ namespace RcclUnitTesting { // Test either single process all GPUs, or 1 process per GPU int const numProcesses = isMultiProcess ? totalRanks : 1; - testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup); + testBed.InitComms(testBed.GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup); if (testBed.ev.showNames) INFO("%s %d-ranks GroupCall Different\n", isMultiProcess ? "MP" : "SP", totalRanks); @@ -139,7 +139,7 @@ namespace RcclUnitTesting { // Test either single process all GPUs, or 1 process per GPU int const numProcesses = isMultiProcess ? totalRanks : 1; - testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup); + testBed.InitComms(testBed.GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup); if (testBed.ev.showNames) INFO("%s %d-ranks GroupCall MixedDayaType\n", isMultiProcess ? "MP" : "SP", totalRanks); @@ -194,7 +194,7 @@ namespace RcclUnitTesting INFO("%s %d-ranks Multistream %d-Group Calls across %d streams\n", isMultiProcess ? "MP" : "SP", totalRanks, numCollPerGroup, numStreamsPerGroup); - testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), + testBed.InitComms(testBed.GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup, numStreamsPerGroup); // Set up each collective in group in different stream (modulo numStreamsPerGroup) @@ -244,7 +244,7 @@ namespace RcclUnitTesting int const numProcesses = isMultiProcess ? 
totalRanks : 1; // Initialize comms by specifying the # of group calls - testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), numCollsPerGroup, numStreamsPerGroup, numGroupCalls, useBlocking); + testBed.InitComms(testBed.GetDeviceIdsList(numProcesses, totalRanks), numCollsPerGroup, numStreamsPerGroup, numGroupCalls, useBlocking); if (testBed.ev.showNames) INFO("%s %d-ranks GroupCall MultiGroupCall\n", isMultiProcess ? "MP" : "SP", totalRanks); diff --git a/test/NonBlockingTests.cpp b/test/NonBlockingTests.cpp index 5b505c186..3006b0e2c 100644 --- a/test/NonBlockingTests.cpp +++ b/test/NonBlockingTests.cpp @@ -34,7 +34,7 @@ namespace RcclUnitTesting { int const numProcesses = isMultiProcess ? totalRanks : 1; // Initialize communicators in non-blocking mode - testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), 1, 1, 1, useBlocking); + testBed.InitComms(testBed.GetDeviceIdsList(numProcesses, totalRanks), 1, 1, 1, useBlocking); // Loop over various collective functions for (auto funcType : funcTypes) diff --git a/test/SendRecvTests.cpp b/test/SendRecvTests.cpp index 4f38470cc..70e709604 100644 --- a/test/SendRecvTests.cpp +++ b/test/SendRecvTests.cpp @@ -28,7 +28,7 @@ namespace RcclUnitTesting int ranksPerGpu = rpg == 0 ? 1 : testBed.ev.maxRanksPerGpu; int totalRanks = numGpus * ranksPerGpu; int const numProcesses = isMultiProcess ? numGpus : 1; - testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu), 1); + testBed.InitComms(testBed.GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu), 1); for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx) for (int numIdx = 0; numIdx < numElements.size() && isCorrect; ++numIdx) @@ -106,7 +106,7 @@ namespace RcclUnitTesting int ranksPerGpu = rpg == 0 ? 1 : testBed.ev.maxRanksPerGpu; int totalRanks = numGpus * ranksPerGpu; int const numProcesses = isMultiProcess ? 
numGpus : 1; - testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu), 1); + testBed.InitComms(testBed.GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu), 1); for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx) for (int numIdx = 0; numIdx < numElements.size() && isCorrect; ++numIdx) diff --git a/test/common/EnvVars.cpp b/test/common/EnvVars.cpp index d7fd50189..64fc9b5b0 100644 --- a/test/common/EnvVars.cpp +++ b/test/common/EnvVars.cpp @@ -9,6 +9,8 @@ #include #include #include +#include +#include namespace RcclUnitTesting { @@ -88,6 +90,205 @@ namespace RcclUnitTesting return TEST_SUCCESS; }
+
+  // Runs `cmd` through popen() and returns everything the command wrote to stdout.
+  // Throws std::runtime_error when the pipe cannot be opened.
+  std::string execCommand(const char* cmd) {
+    std::array<char, 128> buffer;
+    std::string result;
+    std::unique_ptr<FILE, decltype(&pclose)> pipe(popen(cmd, "r"), pclose);
+    if (!pipe) {
+      throw std::runtime_error("popen() failed!");
+    }
+    while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) {
+      result += buffer.data();
+    }
+    return result;
+  }
+
+
+  // Writes exactly numBytes from src to fd; returns false on error or when no progress is made.
+  static bool writeAll(int fd, const void* src, size_t numBytes)
+  {
+    const char* cursor = static_cast<const char*>(src);
+    while (numBytes > 0) {
+      ssize_t const written = write(fd, cursor, numBytes);
+      if (written <= 0) return false;
+      cursor   += written;
+      numBytes -= static_cast<size_t>(written);
+    }
+    return true;
+  }
+
+
+  // Reads exactly numBytes from fd into dst; returns false on error or premature end-of-file.
+  static bool readAll(int fd, void* dst, size_t numBytes)
+  {
+    char* cursor = static_cast<char*>(dst);
+    while (numBytes > 0) {
+      ssize_t const received = read(fd, cursor, numBytes);
+      if (received <= 0) return false;
+      cursor   += received;
+      numBytes -= static_cast<size_t>(received);
+    }
+    return true;
+  }
+
+
+  // Parent side of the fork/pipe handshake: reads numBytes from readFd into dst, closes readFd
+  // and reaps the child. Returns true only if the whole payload arrived and the child exited
+  // normally with EXIT_SUCCESS.
+  static bool receiveFromChild(pid_t pid, int readFd, void* dst, size_t numBytes)
+  {
+    bool const gotPayload = readAll(readFd, dst, numBytes);
+    close(readFd);
+
+    // Reap the child even when the read failed so that no zombie is left behind
+    int status = 0;
+    if (waitpid(pid, &status, 0) != pid) return false;
+    return gotPayload && WIFEXITED(status) && WEXITSTATUS(status) == EXIT_SUCCESS;
+  }
+
+
+  // Parses `rocm-smi --showuniqueid` output. Every line holding both "GPU[<index>" and
+  // "Unique ID: <id>" adds <index> to the group of <id>. Returns the indices flattened group
+  // by group, groups with more GPUs first; equally sized groups are ordered by their first GPU
+  // index so the result does not depend on hash-map iteration order.
+  // Throws std::invalid_argument / std::out_of_range if a GPU index cannot be parsed.
+  static std::vector<int> parseGpuPriorityOrder(const std::string& log)
+  {
+    std::unordered_map<std::string, std::vector<int>> uniqueIdToGpuIndexes;
+    std::string::size_type pos = 0;
+
+    while ((pos = log.find("GPU[", pos)) != std::string::npos) {
+      std::string::size_type const lineEnd = std::min(log.find('\n', pos), log.size());
+      int const gpuIndex = std::stoi(log.substr(pos + 4, lineEnd - (pos + 4)));
+
+      // Only accept an ID located on this very line; otherwise the ID of a later GPU would be
+      // attributed to this one
+      std::string::size_type const idPos = log.find("Unique ID:", pos);
+      if (idPos != std::string::npos && idPos + 11 < lineEnd) {
+        std::string::size_type const idStart = idPos + 11; // Skip "Unique ID:" plus one separator
+        std::string::size_type const idEnd   = std::min(log.find_first_of(" \n", idStart), lineEnd);
+        uniqueIdToGpuIndexes[log.substr(idStart, idEnd - idStart)].push_back(gpuIndex);
+      }
+      pos = lineEnd;
+    }
+
+    // Sort the unique IDs by descending number of associated GPUs
+    std::vector<std::pair<std::string, std::vector<int>>> sortedIds(uniqueIdToGpuIndexes.begin(), uniqueIdToGpuIndexes.end());
+    std::sort(sortedIds.begin(), sortedIds.end(), [](const auto& a, const auto& b) {
+      if (a.second.size() != b.second.size()) return a.second.size() > b.second.size();
+      return a.second.front() < b.second.front();
+    });
+
+    std::vector<int> order;
+    for (const auto& entry : sortedIds) {
+      order.insert(order.end(), entry.second.begin(), entry.second.end());
+    }
+    return order;
+  }
+
+
+  // Fills *gpuPriorityOrder, whose size on entry is the number of GPUs to order, with the GPU
+  // indices computed by parseGpuPriorityOrder(). rocm-smi is queried in a forked child that
+  // sends the result back through a pipe.
+  // Returns TEST_SUCCESS on success; on TEST_FAIL *gpuPriorityOrder is left unmodified.
+  int getDevicePriority (std::vector<int> *gpuPriorityOrder){
+    if (gpuPriorityOrder == nullptr) return TEST_FAIL;
+    size_t const numGpus = gpuPriorityOrder->size();
+    if (numGpus == 0) return TEST_SUCCESS; // Nothing to order
+    std::vector<int> received(numGpus);
+
+    // Prepare child->parent pipe
+    int pipefd[2];
+    if (pipe(pipefd) == -1) {
+      ERROR("Unable to create parent->child pipe for getting the device priority vector.\n");
+      return TEST_FAIL;
+    }
+    pid_t const pid = fork();
+    if (pid < 0) {
+      ERROR("Unable to fork child process for getting the device priority vector.\n");
+      close(pipefd[0]);
+      close(pipefd[1]);
+      return TEST_FAIL;
+    }
+    if (0 == pid) {
+      // Child: must never return into the caller's code, so every path ends in _exit()
+      close(pipefd[0]);
+      int exitCode = EXIT_FAILURE;
+      try {
+        std::vector<int> const order = parseGpuPriorityOrder(execCommand("rocm-smi --showuniqueid"));
+        // Refuse to send when fewer than numGpus entries were parsed (the buffer would be
+        // overread). NOTE(review): surplus entries are dropped as before - confirm intended
+        if (order.size() >= numGpus && writeAll(pipefd[1], order.data(), numGpus * sizeof(int))) {
+          exitCode = EXIT_SUCCESS;
+        }
+      } catch (const std::exception& e) {
+        std::cerr << "Error: " << e.what() << std::endl;
+      }
+      close(pipefd[1]);
+      _exit(exitCode);
+    }
+
+    // Parent: close the unused write end first so that a child dying early produces
+    // end-of-file instead of blocking the read forever
+    close(pipefd[1]);
+    if (!receiveFromChild(pid, pipefd[0], received.data(), numGpus * sizeof(int))) return TEST_FAIL;
+    gpuPriorityOrder->swap(received);
+    return TEST_SUCCESS;
+  }
+
+
+  // Sets *cpxMode to true when the output of `rocm-smi --showcomputepartition` contains "CPX",
+  // false otherwise. rocm-smi is queried in a forked child that sends the flag back through a pipe.
+  // Returns TEST_SUCCESS on success; on TEST_FAIL *cpxMode is left unmodified.
+  int getDeviceMode (bool *cpxMode){
+    if (cpxMode == nullptr) return TEST_FAIL;
+
+    // Prepare child->parent pipe
+    int pipefd[2];
+    if (pipe(pipefd) == -1)
+    {
+      ERROR("Unable to create parent->child pipe for getting the device mode\n");
+      return TEST_FAIL;
+    }
+    pid_t const pid = fork();
+    if (pid < 0)
+    {
+      ERROR("Unable to fork child process for getting the device mode\n");
+      close(pipefd[0]);
+      close(pipefd[1]);
+      return TEST_FAIL;
+    }
+    if (0 == pid)
+    {
+      // Child: must never return into the caller's code, so every path ends in _exit()
+      close(pipefd[0]);
+      int exitCode = EXIT_FAILURE;
+      try {
+        std::string const log = execCommand("rocm-smi --showcomputepartition");
+        bool const isCpxMode = log.find("CPX") != std::string::npos;
+        if (writeAll(pipefd[1], &isCpxMode, sizeof(isCpxMode))) {
+          exitCode = EXIT_SUCCESS;
+        }
+      } catch (const std::exception& e) {
+        std::cerr << "Error: " << e.what() << std::endl;
+      }
+      close(pipefd[1]);
+      _exit(exitCode);
+    }
+
+    // Parent: close the unused write end first so that a child dying early produces
+    // end-of-file instead of blocking the read forever
+    close(pipefd[1]);
+    bool received = false;
+    if (!receiveFromChild(pid, pipefd[0], &received, sizeof(received))) return TEST_FAIL;
+    *cpxMode = received;
+    return TEST_SUCCESS;
+  }
+
+
 EnvVars::EnvVars() { // Collect number of GPUs available @@ -115,6 +316,18 @@ namespace
RcclUnitTesting // Total number of reduction ops int numOps = ncclNumOps; + gpuPriorityOrder.resize(numDetectedGpus); + for(int i=0;i redOpStrings = GetEnvVarsList("UT_REDOPS"); for (auto s : redOpStrings) { diff --git a/test/common/EnvVars.hpp b/test/common/EnvVars.hpp index c20e82cf4..d3df0fa59 100644 --- a/test/common/EnvVars.hpp +++ b/test/common/EnvVars.hpp @@ -18,20 +18,21 @@ namespace RcclUnitTesting class EnvVars { public: - bool showNames; // List test case names during run [UT_SHOW_NAMES] - int minGpus; // Set the minimum number of GPUs to use [UT_MIN_GPUS] - int maxGpus; // Set the maximum number of GPUs to use [UT_MAX_GPUS] - bool onlyPow2Gpus; // Only allow power-of-2 # of GPUs [UT_POW2_GPUS] - int processMask; // Filter single/multi process [UT_PROCESS_MASK] - bool verbose; // Show verbose TestBed output for debug [UT_VERBOSE] - int printValues; // Print out input/output/expected arrays [UT_PRINT_VALUES] - int maxRanksPerGpu; // Number of ranks using the same GPU [UT_MAX_RANKS_PER_GPU] - bool showTiming; // Show timing per case at end [UT_SHOW_TIMING] - bool useInteractive; // Run in interactive mode [UT_INTERACTIVE] - int timeoutUs; // Set timeout for child in microseconds [UT_TIMEOUT_US] - bool useMultithreading; // Multi-thread single-process ranks [UT_MULTITHREAD] - bool isGfx94; // Detects if architecture is gfx94 - bool isGfx12; // Detects if architecture is gfx12 + bool showNames; // List test case names during run [UT_SHOW_NAMES] + int minGpus; // Set the minimum number of GPUs to use [UT_MIN_GPUS] + int maxGpus; // Set the maximum number of GPUs to use [UT_MAX_GPUS] + bool onlyPow2Gpus; // Only allow power-of-2 # of GPUs [UT_POW2_GPUS] + int processMask; // Filter single/multi process [UT_PROCESS_MASK] + bool verbose; // Show verbose TestBed output for debug [UT_VERBOSE] + int printValues; // Print out input/output/expected arrays [UT_PRINT_VALUES] + int maxRanksPerGpu; // Number of ranks using the same GPU [UT_MAX_RANKS_PER_GPU] + bool 
showTiming; // Show timing per case at end [UT_SHOW_TIMING] + bool useInteractive; // Run in interactive mode [UT_INTERACTIVE] + int timeoutUs; // Set timeout for child in microseconds [UT_TIMEOUT_US] + bool useMultithreading; // Multi-thread single-process ranks [UT_MULTITHREAD] + bool isGfx94; // Detects if architecture is gfx94 + bool isGfx12; // Detects if architecture is gfx12 + std::vector gpuPriorityOrder; // Orders the gpus based on the associativity of them with OAM with higher gpus linked. // Constructor that parses and collects environment variables EnvVars(); diff --git a/test/common/TestBed.cpp b/test/common/TestBed.cpp index 1421ff5c9..445f762d6 100644 --- a/test/common/TestBed.cpp +++ b/test/common/TestBed.cpp @@ -193,7 +193,7 @@ namespace RcclUnitTesting void TestBed::InitComms(int const numGpus, int const numCollectivesInGroup, int const numStreamsPerGroup, int const numGroupCalls, bool const useBlocking) { - InitComms(TestBed::GetDeviceIdsList(1, numGpus), TestBed::GetNumCollsPerGroup(numCollectivesInGroup, numGroupCalls), TestBed::GetNumStreamsPerGroup(numStreamsPerGroup, numGroupCalls), numGroupCalls, useBlocking); + InitComms(GetDeviceIdsList(1, numGpus), TestBed::GetNumCollsPerGroup(numCollectivesInGroup, numGroupCalls), TestBed::GetNumStreamsPerGroup(numStreamsPerGroup, numGroupCalls), numGroupCalls, useBlocking); } void TestBed::SetCollectiveArgs(ncclFunc_t const funcType, @@ -576,7 +576,7 @@ namespace RcclUnitTesting int k=0; for (int i = 0; i < numProcesses; i++) for (int j = 0; j < ntasks * ranksPerGpu; j++) { - result[i].push_back(k%numGpus); + result[i].push_back(ev.gpuPriorityOrder[k%numGpus]); k++; } return result; @@ -668,7 +668,7 @@ namespace RcclUnitTesting if(enableSweep == false && (numGpus < 8 || numRanks < 8)) { continue; } - this->InitComms(TestBed::GetDeviceIdsList(numChildren, numGpus, ranksPerGpu)); + this->InitComms(this->GetDeviceIdsList(numChildren, numGpus, ranksPerGpu)); if (testing::Test::HasFailure()) { isCorrect = 
false; diff --git a/test/common/TestBed.hpp b/test/common/TestBed.hpp index d74d10c04..45e6f655d 100644 --- a/test/common/TestBed.hpp +++ b/test/common/TestBed.hpp @@ -134,10 +134,10 @@ namespace RcclUnitTesting int const numGroupCalls); // Helper function that splits up GPUs to the given number of processes - static std::vector> GetDeviceIdsList(int const numProcesses, + std::vector> GetDeviceIdsList(int const numProcesses, int const numGpus, int const ranksPerGpu); - static std::vector> GetDeviceIdsList(int const numProcesses, + std::vector> GetDeviceIdsList(int const numProcesses, int const numGpus); // Generate a test case name