From 44064a612ca475233b89c6202f742ffed5c2fff7 Mon Sep 17 00:00:00 2001
From: saurabhAMD
Date: Wed, 12 Jun 2024 12:04:58 -0500
Subject: [PATCH] enable UT to test with channels greater than 64

[ROCm/rccl commit: 392a73fdef94c9983d9cbb89a1bc4a54dda74703]
---
Review notes (this section is ignored by git am):
 * getArchInfo(): the parent now closes the pipe write end before read(),
   the child always exits instead of returning into the test binary,
   fork() and HIP return codes are checked, and the arch name is compared
   in place instead of going through strtok()/strcpy().
 * Line breaks and indentation were reconstructed from a whitespace-collapsed
   copy of this patch; use `git apply --ignore-whitespace` if context lines
   do not match. The EnvVars.cpp index hashes predate the getArchInfo()
   revision.

 projects/rccl/test/AllReduceTests.cpp | 24 +++++++++++++++
 projects/rccl/test/AllToAllTests.cpp  | 24 +++++++++++++++
 projects/rccl/test/common/EnvVars.cpp | 52 +++++++++++++++++++++++++++
 projects/rccl/test/common/EnvVars.hpp |  1 +
 4 files changed, 101 insertions(+)

diff --git a/projects/rccl/test/AllReduceTests.cpp b/projects/rccl/test/AllReduceTests.cpp
index d0ef4c6188..23f885ca58 100644
--- a/projects/rccl/test/AllReduceTests.cpp
+++ b/projects/rccl/test/AllReduceTests.cpp
@@ -102,6 +102,30 @@ namespace RcclUnitTesting
     testBed.Finalize();
   }
 
+  TEST(AllReduce, Channels)
+  {
+    TestBed testBed;
+    if(testBed.ev.isGfx94) {
+      // Configuration
+      std::vector const funcTypes = {ncclCollAllReduce};
+      std::vector const dataTypes = {ncclBfloat16, ncclHalf};
+      std::vector const redOps = {ncclSum};
+      std::vector const roots = {0};
+      std::vector const numElements = {64 * 1024 * 1024, 1024};
+      std::vector const inPlaceList = {false};
+      std::vector const managedMemList = {false};
+      std::vector const useHipGraphList = {false, true};
+      std::vector const channelList = {"56", "84", "112"};
+      for (auto channel : channelList) {
+        setenv("NCCL_MIN_NCHANNELS", channel, 1);
+        testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements,
+                               inPlaceList, managedMemList, useHipGraphList);
+        testBed.Finalize();
+        unsetenv("NCCL_MIN_NCHANNELS");
+      }
+    }
+  }
+
   TEST(AllReduce, ManagedMemGraph)
   {
     TestBed testBed;
diff --git a/projects/rccl/test/AllToAllTests.cpp b/projects/rccl/test/AllToAllTests.cpp
index 9ceaa94c3c..07e88e751d 100644
--- a/projects/rccl/test/AllToAllTests.cpp
+++ b/projects/rccl/test/AllToAllTests.cpp
@@ -85,4 +85,28 @@ namespace RcclUnitTesting
                            inPlaceList, managedMemList, useHipGraphList);
     testBed.Finalize();
   }
+
+  TEST(AllToAll, Channels)
+  {
+    TestBed testBed;
+    if(testBed.ev.isGfx94) {
+      // Configuration
+      std::vector const funcTypes = {ncclCollAllToAll};
+      std::vector const dataTypes = {ncclBfloat16, ncclHalf};
+      std::vector const redOps = {ncclSum};
+      std::vector const roots = {0};
+      std::vector const numElements = {64 * 1024 * 1024, 1024};
+      std::vector const inPlaceList = {false};
+      std::vector const managedMemList = {false};
+      std::vector const useHipGraphList = {false, true};
+      std::vector const channelList = {"56", "84", "112"};
+      for (auto channel : channelList) {
+        setenv("NCCL_MIN_NCHANNELS", channel, 1);
+        testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements,
+                               inPlaceList, managedMemList, useHipGraphList);
+        testBed.Finalize();
+        unsetenv("NCCL_MIN_NCHANNELS");
+      }
+    }
+  }
 }
diff --git a/projects/rccl/test/common/EnvVars.cpp b/projects/rccl/test/common/EnvVars.cpp
index db161b8160..d8cc5e8bef 100644
--- a/projects/rccl/test/common/EnvVars.cpp
+++ b/projects/rccl/test/common/EnvVars.cpp
@@ -15,6 +15,56 @@ namespace RcclUnitTesting
   int const UT_SINGLE_PROCESS = (1<<0);
   int const UT_MULTI_PROCESS  = (1<<1);
 
+  // Reports through *isRightArch whether at least one GPU was found and every
+  // GPU's gcnArchName starts with "gfx94". Returns TEST_SUCCESS when the probe
+  // completed, TEST_FAIL when it could not be run or read back.
+  int getArchInfo(bool *isRightArch)
+  {
+    // Pipe over which the child reports its result back to the parent
+    int pipefd[2];
+    if (pipe(pipefd) == -1) {
+      ERROR("Unable to create child->parent pipe for getting architecture info\n");
+      return TEST_FAIL;
+    }
+    pid_t pid = fork();
+    if (pid == -1) {
+      ERROR("Unable to fork child process for getting architecture info\n");
+      close(pipefd[0]);
+      close(pipefd[1]);
+      return TEST_FAIL;
+    }
+    if (0 == pid) {
+      // Child: HIP calls stay in this short-lived process, never in the parent
+      close(pipefd[0]);
+      bool isGfx94 = false;
+      int dev = 0;
+      if (hipGetDeviceCount(&dev) != hipSuccess) dev = 0;
+      for (int deviceId = 0; deviceId < dev; deviceId++) {
+        hipDeviceProp_t devProp;
+        // Only the first five characters decide the match, so compare in place
+        isGfx94 = (hipGetDeviceProperties(&devProp, deviceId) == hipSuccess) &&
+                  (std::strncmp("gfx94", devProp.gcnArchName, 5) == 0);
+        // Every device has to match; stop at the first one that does not
+        if (!isGfx94) break;
+      }
+      bool const sent = (write(pipefd[1], &isGfx94, sizeof(isGfx94)) == sizeof(isGfx94));
+      close(pipefd[1]);
+      // Always exit: returning would let this child keep running the test binary
+      exit(sent ? EXIT_SUCCESS : EXIT_FAILURE);
+    }
+
+    // Parent: drop the write end first so read() sees EOF if the child dies early
+    close(pipefd[1]);
+    bool const received =
+      (read(pipefd[0], isRightArch, sizeof(*isRightArch)) == sizeof(*isRightArch));
+    close(pipefd[0]);
+    int status = 0;
+    waitpid(pid, &status, 0);  // reap the child on every path
+    if (!received) {
+      ERROR("Unable to read architecture info from child process\n");
+      return TEST_FAIL;
+    }
+    assert(!status);
+    return TEST_SUCCESS;
+  }
+
   int getDeviceCount(int *devices)
   {
     // Prepare parent->child pipe
@@ -52,6 +102,8 @@ namespace RcclUnitTesting
     // NOTE: Cannot use HIP call prior to launching unless it is inside another child process
     numDetectedGpus = 0;
     getDeviceCount(&numDetectedGpus);
+    isGfx94 = false;
+    getArchInfo(&isGfx94);
 
     showNames = GetEnvVar("UT_SHOW_NAMES" , 1);
     minGpus = GetEnvVar("UT_MIN_GPUS" , 2);
diff --git a/projects/rccl/test/common/EnvVars.hpp b/projects/rccl/test/common/EnvVars.hpp
index 1d21ab210c..bf54611fc0 100644
--- a/projects/rccl/test/common/EnvVars.hpp
+++ b/projects/rccl/test/common/EnvVars.hpp
@@ -29,6 +29,7 @@ namespace RcclUnitTesting
     bool showTiming; // Show timing per case at end [UT_SHOW_TIMING]
     bool useInteractive; // Run in interactive mode [UT_INTERACTIVE]
     int timeoutUs; // Set timeout for child in microseconds [UT_TIMEOUT_US]
+    bool isGfx94; // True only if GPUs were detected and all of them are gfx94
 
     // Constructor that parses and collects environment variables
     EnvVars();