/************************************************************************* * Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include #include #include "TestBed.hpp" #include "StandaloneUtils.hpp" namespace RcclUnitTesting { /** * \brief Verify that each device is assigned to the right rank using ncclCommSplit API. * ******************************************************************************************/ TEST(Standalone, SplitComms_RankCheck) { // Check for multi-gpu int numDevices; HIPCALL(hipGetDeviceCount(&numDevices)); if (numDevices < 2) { GTEST_SKIP() << "This test requires at least 2 devices."; } // Initialize the original comms std::vector comms(numDevices); NCCLCHECK(ncclCommInitAll(comms.data(), numDevices, nullptr)); // Split into new comms (round-robin) std::vector subComms(numDevices); int numSubComms = 2; std::map mapCounter; NCCLCHECK(ncclGroupStart()); for (int localRank = 0; localRank < numDevices; localRank++) { NCCLCHECK(ncclCommSplit(comms[localRank], localRank % numSubComms, localRank, &subComms[localRank], NULL)); mapCounter[localRank % numSubComms]++; } NCCLCHECK(ncclGroupEnd()); // Check that new comms have correct subranks / ranks for (int i = 0; i < numDevices; i++) { int subCommRank, subCommNRank; NCCLCHECK(ncclCommUserRank(subComms[i], &subCommRank)); NCCLCHECK(ncclCommCount(subComms[i], &subCommNRank)); ASSERT_EQ(subCommRank, i / numSubComms); ASSERT_EQ(subCommNRank, mapCounter[i % numSubComms]); } // Clean up comms for (auto& subComm : subComms) NCCLCHECK(ncclCommDestroy(subComm)); for (auto& comm : comms) NCCLCHECK(ncclCommDestroy(comm)); } /** * \brief Creates a communicator for each device and gathers them all in one rank. * ******************************************************************************************/ TEST(Standalone, SplitComms_OneColor) { // Check for multi-gpu int numDevices; HIPCALL(hipGetDeviceCount(&numDevices)); if (numDevices < 2) { GTEST_SKIP() << "This test requires at least 2 devices."; } // Initialize the original comms std::vector comms(numDevices); NCCLCHECK(ncclCommInitAll(comms.data(), numDevices, nullptr)); // Split into new comms (all of the same color) std::vector subComms(numDevices); NCCLCHECK(ncclGroupStart()); for (int localRank = 0; localRank < numDevices; localRank++) NCCLCHECK(ncclCommSplit(comms[localRank], 0, localRank, &subComms[localRank], NULL)); NCCLCHECK(ncclGroupEnd()); // Validate results for (int i = 0; i < numDevices; i++) { int originalRank, originalNRank; NCCLCHECK(ncclCommUserRank(comms[i], &originalRank)); NCCLCHECK(ncclCommCount(comms[i], &originalNRank)); int subCommRank, subCommNRank; NCCLCHECK(ncclCommUserRank(subComms[i], &subCommRank)); NCCLCHECK(ncclCommCount(subComms[i], &subCommNRank)); ASSERT_EQ(originalRank, subCommRank); ASSERT_EQ(originalNRank, subCommNRank); } // Clean up comms for (auto& subComm : subComms) NCCLCHECK(ncclCommDestroy(subComm)); for (auto& comm : comms) NCCLCHECK(ncclCommDestroy(comm)); } /** * \brief Creates a communicator for each device and reduces them into (numDevices / 2) ranks. * ******************************************************************************************/ TEST(Standalone, SplitComms_Reduce) { // Check for multi-gpu int numDevices; HIPCALL(hipGetDeviceCount(&numDevices)); if (numDevices < 2) { GTEST_SKIP() << "This test requires at least 2 devices."; } // Initialize the original comms std::vector comms(numDevices); NCCLCHECK(ncclCommInitAll(comms.data(), numDevices, nullptr)); // Split into new comms int numReducedRanks = numDevices / 2; std::vector subComms(numDevices); NCCLCHECK(ncclGroupStart()); for (int localRank = 0; localRank < numDevices; localRank++) NCCLCHECK(ncclCommSplit(comms[localRank], localRank < numReducedRanks ? 0 : NCCL_SPLIT_NOCOLOR, localRank, &subComms[localRank], NULL)); NCCLCHECK(ncclGroupEnd()); // Validate results for (int i = 0; i < numDevices; i++) { int originalRank, originalNRank; NCCLCHECK(ncclCommUserRank(comms[i], &originalRank)); NCCLCHECK(ncclCommCount(comms[i], &originalNRank)); if (i < numReducedRanks) { int subCommRank, subCommNRank; NCCLCHECK(ncclCommUserRank(subComms[i], &subCommRank)); NCCLCHECK(ncclCommCount(subComms[i], &subCommNRank)); ASSERT_EQ(originalRank, subCommRank); ASSERT_EQ(subCommNRank, numReducedRanks); } else { ASSERT_EQ(subComms[i], nullptr); } } // Cleanup comms for (auto& subComm : subComms) NCCLCHECK(ncclCommDestroy(subComm)); for (auto& comm : comms) NCCLCHECK(ncclCommDestroy(comm)); } /** * \brief Verify there is no regression in timing for each protocol [LL, LL128, Simple] * ******************************************************************************************/ TEST(Standalone, RegressionTiming) { // timing using namespace std::chrono; using Clock = std::chrono::high_resolution_clock; int usElapsed, numIterations = 20, numWarmups = 5; // Check for 2 GPUs int numGpus; HIPCALL(hipGetDeviceCount(&numGpus)); if (numGpus < 2) { GTEST_SKIP() << "This test requires at least 2 devices."; } hipDeviceProp_t devProp; HIPCALL(hipGetDeviceProperties(&devProp, 0)); // Initialize RCCL constexpr int numRanks = 2; std::vector comms(numRanks); std::vector gpuInput(numRanks); std::vector gpuOutput(numRanks); std::vector stream(numRanks); char *proto = std::getenv("NCCL_PROTO"); const char* protocolList[3] = {"LL", "LL128", "Simple"}; for (auto p : protocolList) { usElapsed = 0; if(strncmp("gfx12",devProp.gcnArchName,5) == 0) { setenv("NCCL_PROTO", "Simple", 1); } else { setenv("NCCL_PROTO", p, 1); } NCCLCHECK(ncclCommInitAll(comms.data(), numRanks, nullptr)); // Prepare CPU data arrays int N = 1250; std::vector cpuInput(N); std::vector cpuExpected(N); for (int i = 0; i < N; i++) { cpuInput[i] = i; cpuExpected[i] = 2 * i; } // Prepare GPU data arrays for (int rank = 0; rank < numRanks; rank++) { HIPCALL(hipSetDevice(rank)); HIPCALL(hipStreamCreate(&stream[rank])); HIPCALL(hipMalloc((void**)&gpuInput[rank], N * sizeof(int))); HIPCALL(hipMalloc((void**)&gpuOutput[rank], N * sizeof(int))); HIPCALL(hipMemcpy(gpuInput[rank], cpuInput.data(), N * sizeof(int), hipMemcpyHostToDevice)); HIPCALL(hipMemset(gpuOutput[rank], 0, N * sizeof(int))); HIPCALL(hipDeviceSynchronize()); } for (int iter = -numWarmups; iter < numIterations; iter++) { for (int rank = 0; rank < numRanks; rank++) { HIPCALL(hipSetDevice(rank)); HIPCALL(hipMemset(gpuOutput[rank], 0, N * sizeof(int))); HIPCALL(hipDeviceSynchronize()); } // Initiate the allreduce NCCLCHECK(ncclGroupStart()); for (int rank = 0; rank < numRanks; rank++) NCCLCHECK(ncclAllReduce(gpuInput[rank], gpuOutput[rank], N, ncclInt, ncclSum, comms[rank], stream[rank])); ncclResult_t res = ncclGroupEnd(); if (res != ncclSuccess) continue; const auto start = Clock::now(); // Wait for completion for (int rank = 0; rank < numRanks; rank++) { HIPCALL(hipStreamSynchronize(stream[rank])); } if (iter >= 0) usElapsed += duration_cast(Clock::now() - start).count(); // Check results std::vector cpuOutput(N); for (int rank = 0; rank < numRanks; rank++) { HIPCALL(hipMemcpy(cpuOutput.data(), gpuOutput[rank], N * sizeof(int), hipMemcpyDeviceToHost)); HIPCALL(hipDeviceSynchronize()); for (int i = 0; i < N; i++) ASSERT_EQ(cpuOutput[i], cpuExpected[i]); } } EXPECT_LT(usElapsed/(double)numIterations, 5000); printf("[ INFO ] protocol: %s, average runtime: %f microseconds\n", p, usElapsed/(double)numIterations); // Release resources for (int rank = 0; rank < numRanks; rank++){ HIPCALL(hipFree(gpuInput[rank])); HIPCALL(hipFree(gpuOutput[rank])); HIPCALL(hipStreamDestroy(stream[rank])); NCCLCHECK(ncclCommDestroy(comms[rank])); } } if (proto) setenv("NCCL_PROTO", proto, 1); else unsetenv("NCCL_PROTO"); } /** * \brief Verify rccl generic kernel stack size for each gfx architecture is less than the * expected MAX_STACK_SIZE. * ******************************************************************************************/ TEST(Standalone, StackSize) { const char* mainKernel = "ncclDevKernel"; // Look for the .co files std::vector coFileList = splitString(executeCommand("find ../ -type f -name \"librccl*.co\""), '\n'); // Check if the .co files exist in the build directory if (coFileList.empty()) GTEST_SKIP() << "Skipping... Could not found required files in the build directory."; for (const auto& file : coFileList) { // Store the output in a list std::string cmd = std::string(ROCM_PATH) + "/llvm/bin/llvm-readelf --notes " + file; std::vector metadata = splitString(executeCommand(cmd.c_str()), '\n'); // Skip if llvm is not installed if (metadata.empty()) GTEST_SKIP() << "Skipping... llvm is not found."; // Parse metadata from file and store it for each arch ArchInfo archInfo = parseMetadata(metadata); // iterate over each archs kernels for (const auto& kernel : archInfo.kernels) { if (kernel.name.find(mainKernel) != std::string::npos) { // Kernel stack size should be less than or equal to the maxStackSize value printf("[ INFO ] Arch: %s Kernel: %s Size: %d\n", archInfo.archName.c_str(), kernel.name.c_str(), kernel.privateSegmentFixedSize); EXPECT_LE(kernel.privateSegmentFixedSize, archInfo.archName == "gfx90a" ? MAX_STACK_SIZE_gfx90a : MAX_STACK_SIZE); } } } } /** * \brief Verify the device associated with communicator in both single and multi-device scenarios * ******************************************************************************************/ TEST(Standalone, CommCuDevice_Check) { int numDevices; HIPCALL(hipGetDeviceCount(&numDevices)); if (numDevices < 1) { GTEST_SKIP() << "No devices available."; } // Test single comm initialization ncclComm_t comm; ncclUniqueId id; NCCLCHECK(ncclGetUniqueId(&id)); HIPCALL(hipSetDevice(0)); NCCLCHECK(ncclCommInitRank(&comm, 1, id, 0)); // Verify device assignment int device; NCCLCHECK(ncclCommCuDevice(comm, &device)); ASSERT_EQ(device, 0); NCCLCHECK(ncclCommDestroy(comm)); // Test multi-device scenario if available if (numDevices > 1) { std::vector comms(numDevices); // Initialize all communicators at once NCCLCHECK(ncclCommInitAll(comms.data(), numDevices, nullptr)); // Verify device assignments for (int i = 0; i < numDevices; i++) { int assignedDevice; NCCLCHECK(ncclCommCuDevice(comms[i], &assignedDevice)); ASSERT_EQ(assignedDevice, i); } // Clean up for (int i = 0; i < numDevices; i++) { NCCLCHECK(ncclCommDestroy(comms[i])); } } } /** * \brief verifies that ncclCommUserRank correctly fails when provided with an invalid (null) communicator handle * ******************************************************************************************/ TEST(Standalone, SplitComms_RankCheck_Basic_Failure) { // Check for multi-gpu int numDevices; HIPCALL(hipGetDeviceCount(&numDevices)); if (numDevices < 2) { GTEST_SKIP() << "This test requires at least 2 devices."; } // Initialize the original comms std::vector comms(numDevices); NCCLCHECK(ncclCommInitAll(comms.data(), numDevices, nullptr)); // Create an invalid comm handle that will cause a failure ncclComm_t invalidComm = nullptr; // This NCCL_CHECK will fail because we're trying to query rank from a null communicator int rank; NCCLCHECK(ncclCommUserRank(invalidComm, &rank)); // Clean up comms for (auto& comm : comms) NCCLCHECK(ncclCommDestroy(comm)); } }