diff --git a/projects/rccl/tools/TransferBench/EnvVars.hpp b/projects/rccl/tools/TransferBench/EnvVars.hpp index 2e430c9f65..1020fae2d2 100644 --- a/projects/rccl/tools/TransferBench/EnvVars.hpp +++ b/projects/rccl/tools/TransferBench/EnvVars.hpp @@ -40,7 +40,7 @@ public: useHipCall = GetEnvVar("USE_HIP_CALL" , 0); useMemset = GetEnvVar("USE_MEMSET" , 0); - useSingleSync = GetEnvVar("USE_SINGLE_SYNC" , 0); + useSingleSync = GetEnvVar("USE_SINGLE_SYNC" , 1); useInteractive = GetEnvVar("USE_INTERACTIVE" , 0); combineTiming = GetEnvVar("COMBINE_TIMING" , 0); showAddr = GetEnvVar("SHOW_ADDR" , 0); diff --git a/projects/rccl/tools/TransferBench/TransferBench.cpp b/projects/rccl/tools/TransferBench/TransferBench.cpp index ba7bfae5dc..8c07732df8 100644 --- a/projects/rccl/tools/TransferBench/TransferBench.cpp +++ b/projects/rccl/tools/TransferBench/TransferBench.cpp @@ -50,24 +50,8 @@ int main(int argc, char **argv) exit(0); } - // Check that Link configuration file can be opened - FILE* fp = fopen(argv[1], "r"); - if (!fp) - { - printf("[ERROR] Unable to open link configuration file: [%s]\n", argv[1]); - exit(1); - } - - // Check for NUMA library support - if (numa_available() == -1) - { - printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n"); - exit(1); - } - // Collect environment variables / display current run configuration EnvVars ev; - ev.DisplayEnvVars(); // Determine number of bytes to run per Link // If a non-zero number of bytes is specified, use it @@ -92,6 +76,30 @@ int main(int argc, char **argv) for (auto N : valuesOfN) maxN = std::max(maxN, N); + // Execute only peer to peer benchmark mode, similar to rocm-bandwidth-test + if (!strcmp(argv[1], "p2p")) + { + // Execute peer to peer benchmark mode + RunPeerToPeerBenchmarks(ev, numBytesPerLink / sizeof(float)); + exit(0); + } + + // Check that Link configuration file can be opened + FILE* fp = fopen(argv[1], "r"); + if (!fp) + { + printf("[ERROR] Unable to open link configuration file: [%s]\n", argv[1]); + exit(1); + } + + // Check for NUMA library support + if (numa_available() == -1) + { + printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n"); + exit(1); + } + ev.DisplayEnvVars(); + int const initOffset = ev.byteOffset / sizeof(float); std::stack threads; @@ -381,6 +389,7 @@ void DisplayUsage(char const* cmdName) printf("Usage: %s configFile \n", cmdName); printf(" configFile: File containing Links to execute (see below for format)\n"); + printf(" Specifying \"p2p\" as the configFile will execute a peer to peer benchmark\n"); printf(" N : (Optional) Number of bytes to transfer per link.\n"); printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n", DEFAULT_BYTES_PER_LINK); printf(" If 0 is specified, a range of Ns will be benchmarked\n"); @@ -880,8 +889,8 @@ void CheckPages(char* array, size_t numBytes, int targetId) if (mistakeCount > 0) { printf("[ERROR] %lu out of %lu pages for memory allocation were not on NUMA node %d\n", mistakeCount, numPages, targetId); - // NOTE: Some older versions of HIP do not properly respect NUMA policy so avoid failing for now - // exit(1); + printf("[ERROR] Ensure up-to-date ROCm is installed\n"); + exit(1); } } @@ -1086,3 +1095,173 @@ void RunLink(EnvVars const& ev, size_t const N, int const iteration, Link& link) link.totalTime += (std::chrono::duration_cast>(cpuDelta).count() * 1000.0); } } + +void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N) +{ + // Collect the number of available CPUs/GPUs on this machine + int numGpus; + HIP_CALL(hipGetDeviceCount(&numGpus)); + int const numCpus = numa_num_configured_nodes(); + int const numDevices = numCpus + numGpus; + + // Enable peer to peer for each GPU + for (int i = 0; i < numGpus; i++) + for (int j = 0; j < numGpus; j++) + if (i != j) EnablePeerAccess(i, j); + + printf("Performing copies in each direction of %lu bytes\n", N * sizeof(float)); + printf("Using %d threads per NUMA node for CPU copies\n", ev.numCpuPerLink); + + // Perform unidirectional / bidirectional + for (int isBidirectional = 0; isBidirectional <= 1; isBidirectional++) + { + // Print header + printf("%sdirectional copy peak bandwidth GB/s\n", isBidirectional ? "Bi" : "Uni"); + printf("%10s", "D/D"); + for (int i = 0; i < numCpus; i++) + printf("%7s %02d", "CPU", i); + for (int i = 0; i < numGpus; i++) + printf("%7s %02d", "GPU", i); + printf("\n"); + + // Loop over all possible src/dst pairs + for (int src = 0; src < numDevices; src++) + { + MemType const& srcMemType = (src < numCpus ? MEM_CPU : MEM_GPU); + int srcIndex = (srcMemType == MEM_CPU ? src : src - numCpus); + printf("%7s %02d", (srcMemType == MEM_CPU) ? "CPU" : "GPU", srcIndex); + + for (int dst = 0; dst < numDevices; dst++) + { + MemType const& dstMemType = (dst < numCpus ? MEM_CPU : MEM_GPU); + int dstIndex = (dstMemType == MEM_CPU ? dst : dst - numCpus); + + double bandwidth = GetPeakBandwidth(ev, N, isBidirectional, srcMemType, srcIndex, dstMemType, dstIndex); + if (bandwidth == 0) + printf("%10s", "N/A"); + else + printf("%10.2f", bandwidth); + fflush(stdout); + } + printf("\n"); + } + printf("\n"); + } +} + +double GetPeakBandwidth(EnvVars const& ev, size_t N, int isBidirectional, + MemType srcMemType, int srcIndex, + MemType dstMemType, int dstIndex) +{ + Link links[2]; + int const initOffset = ev.byteOffset / sizeof(float); + + // Skip bidirectional on same device + if (isBidirectional && srcMemType == dstMemType && srcIndex == dstIndex) return 0.0f; + + // Prepare Links + links[0].srcMemType = links[0].exeMemType = links[1].dstMemType = srcMemType; + links[0].srcIndex = links[0].exeIndex = links[1].dstIndex = srcIndex; + links[0].dstMemType = links[1].exeMemType = links[1].srcMemType = dstMemType; + links[0].dstIndex = links[1].exeIndex = links[1].srcIndex = dstIndex; + for (int i = 0; i <= isBidirectional; i++) + { + AllocateMemory(links[i].srcMemType, links[i].srcIndex, N * sizeof(float) + ev.byteOffset, &links[i].srcMem); + AllocateMemory(links[i].dstMemType, links[i].dstIndex, N * sizeof(float) + ev.byteOffset, &links[i].dstMem); + links[i].totalTime = 0.0; + + CheckOrFill(MODE_FILL, N, ev.useMemset, ev.useHipCall, ev.fillPattern, links[i].srcMem + initOffset); + if (links[i].exeMemType == MEM_GPU) + { + HIP_CALL(hipDeviceGetAttribute(&links[i].numBlocksToUse, hipDeviceAttributeMultiprocessorCount, links[i].exeIndex)); + HIP_CALL(hipSetDevice(links[i].exeIndex)); + HIP_CALL(hipEventCreate(&links[i].startEvent)); + HIP_CALL(hipEventCreate(&links[i].stopEvent)); + HIP_CALL(hipMalloc((void**)&links[i].blockParam, sizeof(BlockParam) * links[i].numBlocksToUse)); + HIP_CALL(hipStreamCreate(&links[i].stream)); + + size_t assigned = 0; + int maxNumBlocksToUse = std::min((N + 31) / 32, (size_t)links[i].numBlocksToUse); + for (int j = 0; j < links[i].numBlocksToUse; j++) + { + BlockParam param; + int blocksLeft = std::max(0, maxNumBlocksToUse - j); + size_t leftover = N - assigned; + size_t roundedN = (leftover + 31) / 32; + param.N = blocksLeft ? std::min(leftover, ((roundedN / blocksLeft) * 32)) : 0; + param.src = links[i].srcMem + assigned + initOffset; + param.dst = links[i].dstMem + assigned + initOffset; + assigned += param.N; + + HIP_CALL(hipMemcpy(&links[i].blockParam[j], ¶m, sizeof(BlockParam), hipMemcpyHostToDevice)); + } + } + else + { + links[i].blockParam = (BlockParam*)malloc(ev.numCpuPerLink * sizeof(BlockParam)); + // For CPU-based copy, divded based on the number of child threads + size_t assigned = 0; + int maxNumBlocksToUse = std::min((N + 31) / 32, (size_t)ev.numCpuPerLink); + for (int j = 0; j < ev.numCpuPerLink; j++) + { + int blocksLeft = std::max(0, maxNumBlocksToUse - j); + size_t leftover = N - assigned; + size_t roundedN = (leftover + 31) / 32; + links[i].blockParam[j].N = blocksLeft ? std::min(leftover, ((roundedN / blocksLeft) * 32)) : 0; + links[i].blockParam[j].src = links[i].srcMem + assigned + initOffset; + links[i].blockParam[j].dst = links[i].dstMem + assigned + initOffset; + assigned += links[i].blockParam[j].N; + } + } + } + + std::stack threads; + + // Perform iteration + for (int iteration = -ev.numWarmups; iteration < ev.numIterations; iteration++) + { + // Perform timed iterations + for (int i = 0; i <= isBidirectional; i++) + threads.push(std::thread(RunLink, std::ref(ev), N, iteration, std::ref(links[i]))); + + // Wait for all threads to finish + for (int i = 0; i <= isBidirectional; i++) + { + threads.top().join(); + threads.pop(); + } + } + + // Validate that each link has transferred correctly + for (int i = 0; i <= isBidirectional; i++) + CheckOrFill(MODE_CHECK, N, ev.useMemset, ev.useHipCall, ev.fillPattern, links[i].dstMem + initOffset); + + // Collect aggregate bandwidth + double totalBandwidth = 0; + for (int i = 0; i <= isBidirectional; i++) + { + double linkDurationMsec = links[i].totalTime / (1.0 * ev.numIterations); + double linkBandwidthGbs = (N * sizeof(float) / 1.0E9) / linkDurationMsec * 1000.0f; + totalBandwidth += linkBandwidthGbs; + } + + // Release GPU memory + for (int i = 0; i <= isBidirectional; i++) + { + DeallocateMemory(links[i].srcMemType, links[i].srcIndex, links[i].srcMem); + DeallocateMemory(links[i].dstMemType, links[i].dstIndex, links[i].dstMem); + + if (links[i].exeMemType == MEM_GPU) + { + HIP_CALL(hipEventDestroy(links[i].startEvent)); + HIP_CALL(hipEventDestroy(links[i].stopEvent)); + HIP_CALL(hipStreamDestroy(links[i].stream)); + HIP_CALL(hipFree(links[i].blockParam)); + } + else if (links[i].exeMemType == MEM_CPU) + { + free(links[i].blockParam); + } + } + return totalBandwidth; +} diff --git a/projects/rccl/tools/TransferBench/TransferBench.hpp b/projects/rccl/tools/TransferBench/TransferBench.hpp index cb807002d2..a714160631 100644 --- a/projects/rccl/tools/TransferBench/TransferBench.hpp +++ b/projects/rccl/tools/TransferBench/TransferBench.hpp @@ -122,7 +122,10 @@ void DeallocateMemory(MemType memType, int devIndex, float* memPtr); void CheckPages(char* byteArray, size_t numBytes, int targetId); void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, std::vector const& fillPattern, float* ptr); void RunLink(EnvVars const& ev, size_t const N, int const iteration, Link& link); - +void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N); +double GetPeakBandwidth(EnvVars const& ev, size_t N, int isBidirectional, + MemType srcMemType, int srcIndex, + MemType dstMemType, int dstIndex); std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount); std::string GetDesc(MemType srcMemType, int srcIndex,