From ea1f02149626b96209bccd08ed00937b9575b66b Mon Sep 17 00:00:00 2001 From: Deeksha Goplani Date: Sat, 10 Jan 2026 04:34:01 +0530 Subject: [PATCH] Added new unit test for register.cc (#1712) * new unit test for register.cc Signed-off-by: Deeksha Goplani * Add new register API tests * Fix debug message ordering issue --------- Signed-off-by: Deeksha Goplani Co-authored-by: Atul Kulkarni [ROCm/rccl commit: 420b3b840e0324ea897db7f04028471a4ea830d7] --- projects/rccl/test/CMakeLists.txt | 2 + projects/rccl/test/RegisterTests.cpp | 247 ++++++++++++++++++ .../test/common/ProcessIsolatedTestRunner.cpp | 12 +- 3 files changed, 259 insertions(+), 2 deletions(-) create mode 100644 projects/rccl/test/RegisterTests.cpp diff --git a/projects/rccl/test/CMakeLists.txt b/projects/rccl/test/CMakeLists.txt index cf89bc28c4..f3a5e960e6 100644 --- a/projects/rccl/test/CMakeLists.txt +++ b/projects/rccl/test/CMakeLists.txt @@ -153,6 +153,7 @@ if(BUILD_TESTS) NonBlockingTests.cpp ReduceScatterTests.cpp ReduceTests.cpp + RegisterTests.cpp ScatterTests.cpp SendRecvTests.cpp StandaloneTests.cpp @@ -163,6 +164,7 @@ if(BUILD_TESTS) common/EnvVars.cpp common/PrepDataFuncs.cpp common/PtrUnion.cpp + common/ProcessIsolatedTestRunner.cpp common/TestBed.cpp common/TestBedChild.cpp common/StandaloneUtils.cpp diff --git a/projects/rccl/test/RegisterTests.cpp b/projects/rccl/test/RegisterTests.cpp new file mode 100644 index 0000000000..9040d0f2cd --- /dev/null +++ b/projects/rccl/test/RegisterTests.cpp @@ -0,0 +1,247 @@ +/************************************************************************* + * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include +#include +#include +#include + +#include "common/ErrCode.hpp" +#include "common/ProcessIsolatedTestRunner.hpp" +#include "StandaloneUtils.hpp" + +namespace RcclUnitTesting +{ + +// Helper to check GPU availability +static bool hasGpuAvailable() { + int numDevices = 0; + hipError_t err = hipGetDeviceCount(&numDevices); + return (err == hipSuccess && numDevices >= 1); +} + +// Macro to skip test if no GPU is available +#define SKIP_IF_NO_GPU() \ + do { \ + if (!hasGpuAvailable()) { \ + GTEST_SKIP() << "This test requires at least 1 GPU device."; \ + return; \ + } \ + } while(0) + +// Helper to initialize a single-rank communicator +static ncclResult_t initSingleRankComm(ncclComm_t* comm) { + ncclUniqueId id; + ncclResult_t res = ncclGetUniqueId(&id); + if (res != ncclSuccess) return res; + return ncclCommInitRank(comm, 1, id, 0); +} + +//============================================================================== +// Test implementation functions - parameterized by registration expectation +//============================================================================== + +/** + * @brief Test basic register/deregister of a single buffer + * @param expectNonNull If true, expect non-NULL handle (registration enabled) + */ +static void testCommRegisterDeregister(bool expectNonNull) { + SKIP_IF_NO_GPU(); + + HIPCALL(hipSetDevice(0)); + + ncclComm_t comm; + NCCLCHECK(initSingleRankComm(&comm)); + + // Create buffer on device + const size_t bufferSize = 1024 * 1024; // 1 MB + void* deviceBuffer = nullptr; + HIPCALL(hipMalloc(&deviceBuffer, bufferSize)); + ASSERT_NE(deviceBuffer, nullptr) << "Failed to allocate device buffer"; + + // Register buffer with ncclCommRegister + void* regHandle = nullptr; + NCCLCHECK(ncclCommRegister(comm, deviceBuffer, bufferSize, ®Handle)); + + // Verify handle based on expected behavior + if (expectNonNull) { + EXPECT_NE(regHandle, nullptr) + << "Buffer registration failed: regHandle is NULL even though NCCL_LOCAL_REGISTER=1"; + } else { + EXPECT_EQ(regHandle, nullptr) + << "Expected NULL handle when NCCL_LOCAL_REGISTER is disabled"; + } + + // Deregister and clean up + NCCLCHECK(ncclCommDeregister(comm, regHandle)); + HIPCALL(hipFree(deviceBuffer)); + NCCLCHECK(ncclCommDestroy(comm)); +} + +/** + * @brief Test registering multiple buffers simultaneously + * @param expectNonNull If true, expect non-NULL handles and verify uniqueness + */ +static void testMultipleBufferRegistration(bool expectNonNull) { + SKIP_IF_NO_GPU(); + + HIPCALL(hipSetDevice(0)); + + ncclComm_t comm; + NCCLCHECK(initSingleRankComm(&comm)); + + // Create and register multiple buffers + const int numBuffers = 4; + const size_t bufferSize = 64 * 1024; // 64 KB each + void* deviceBuffers[numBuffers] = {nullptr}; + void* regHandles[numBuffers] = {nullptr}; + + for (int i = 0; i < numBuffers; i++) { + HIPCALL(hipMalloc(&deviceBuffers[i], bufferSize)); + ASSERT_NE(deviceBuffers[i], nullptr) << "Failed to allocate buffer " << i; + + NCCLCHECK(ncclCommRegister(comm, deviceBuffers[i], bufferSize, ®Handles[i])); + + if (expectNonNull) { + EXPECT_NE(regHandles[i], nullptr) << "Registration failed for buffer " << i; + } else { + EXPECT_EQ(regHandles[i], nullptr) << "Expected NULL handle for buffer " << i; + } + } + + // Verify all handles are unique (only when registration is enabled) + if (expectNonNull) { + for (int i = 0; i < numBuffers; i++) { + for (int j = i + 1; j < numBuffers; j++) { + if (regHandles[i] != nullptr && regHandles[j] != nullptr) { + EXPECT_NE(regHandles[i], regHandles[j]) + << "Buffers " << i << " and " << j << " have the same registration handle"; + } + } + } + } + + // Deregister and clean up + for (int i = 0; i < numBuffers; i++) { + NCCLCHECK(ncclCommDeregister(comm, regHandles[i])); + HIPCALL(hipFree(deviceBuffers[i])); + } + NCCLCHECK(ncclCommDestroy(comm)); +} + +/** + * @brief Test registering buffers of various sizes + * @param expectNonNull If true, expect non-NULL handles for all sizes + */ +static void testVariableSizeBuffers(bool expectNonNull) { + SKIP_IF_NO_GPU(); + + HIPCALL(hipSetDevice(0)); + + ncclComm_t comm; + NCCLCHECK(initSingleRankComm(&comm)); + + // Test various buffer sizes: 4KB, 64KB, 1MB, 4MB + const size_t sizes[] = {4096, 64 * 1024, 1024 * 1024, 4 * 1024 * 1024}; + const int numSizes = sizeof(sizes) / sizeof(sizes[0]); + + for (int i = 0; i < numSizes; i++) { + void* deviceBuffer = nullptr; + void* regHandle = nullptr; + + HIPCALL(hipMalloc(&deviceBuffer, sizes[i])); + ASSERT_NE(deviceBuffer, nullptr) << "Failed to allocate buffer of size " << sizes[i]; + + NCCLCHECK(ncclCommRegister(comm, deviceBuffer, sizes[i], ®Handle)); + + if (expectNonNull) { + EXPECT_NE(regHandle, nullptr) + << "Registration failed for buffer size " << sizes[i] << " bytes"; + } else { + EXPECT_EQ(regHandle, nullptr) + << "Expected NULL handle for buffer size " << sizes[i] << " bytes"; + } + + NCCLCHECK(ncclCommDeregister(comm, regHandle)); + HIPCALL(hipFree(deviceBuffer)); + } + + NCCLCHECK(ncclCommDestroy(comm)); +} + +/** + * @brief Test deregistering NULL handle (should succeed as no-op) + */ +static void testDeregisterNullHandle() { + SKIP_IF_NO_GPU(); + + HIPCALL(hipSetDevice(0)); + + ncclComm_t comm; + NCCLCHECK(initSingleRankComm(&comm)); + + // Deregister NULL handle - should be a no-op + NCCLCHECK(ncclCommDeregister(comm, nullptr)); + + NCCLCHECK(ncclCommDestroy(comm)); +} + +//============================================================================== +// Test configuration helpers +//============================================================================== + +// Environment configuration for disabled registration (default) +static ProcessIsolatedTestRunner::TestConfig +makeDisabledConfig(const std::string& name, std::function testFn) { + return ProcessIsolatedTestRunner::TestConfig(name, testFn) + .clearVariable("NCCL_LOCAL_REGISTER"); +} + +// Environment configuration for enabled registration +static ProcessIsolatedTestRunner::TestConfig +makeEnabledConfig(const std::string& name, std::function testFn) { + return ProcessIsolatedTestRunner::TestConfig(name, testFn) + .withEnvironment({{"NCCL_LOCAL_REGISTER", "1"}}); +} + +/** + * @brief Test ncclCommRegister and ncclCommDeregister APIs with process isolation + * + * This test suite verifies that: + * 1. A device buffer can be registered with ncclCommRegister (API returns success) + * 2. When NCCL_LOCAL_REGISTER=1, the registration returns a valid (non-NULL) handle + * 3. When NCCL_LOCAL_REGISTER is not set, NULL handle is expected (default behavior) + * 4. The buffer can be deregistered with ncclCommDeregister + * + * Note: NCCL_LOCAL_REGISTER defaults to 0 (disabled) in RCCL. + */ +TEST(Register, ProcessIsolatedRegisterTests) +{ + RUN_ISOLATED_TESTS( + // CommRegisterDeregister tests + makeDisabledConfig("CommRegisterDeregister_Disabled", + []() { testCommRegisterDeregister(false); }), + makeEnabledConfig("CommRegisterDeregister_Enabled", + []() { testCommRegisterDeregister(true); }), + + // MultipleBufferRegistration tests + makeDisabledConfig("MultipleBufferRegistration_Disabled", + []() { testMultipleBufferRegistration(false); }), + makeEnabledConfig("MultipleBufferRegistration_Enabled", + []() { testMultipleBufferRegistration(true); }), + + // VariableSizeBuffers tests + makeDisabledConfig("VariableSizeBuffers_Disabled", + []() { testVariableSizeBuffers(false); }), + makeEnabledConfig("VariableSizeBuffers_Enabled", + []() { testVariableSizeBuffers(true); }), + + // DeregisterNullHandle test (no enable/disable variants needed) + ProcessIsolatedTestRunner::TestConfig("DeregisterNullHandle", testDeregisterNullHandle) + ); +} + +} // namespace RcclUnitTesting diff --git a/projects/rccl/test/common/ProcessIsolatedTestRunner.cpp b/projects/rccl/test/common/ProcessIsolatedTestRunner.cpp index beb3853c11..82d7c9cdec 100644 --- a/projects/rccl/test/common/ProcessIsolatedTestRunner.cpp +++ b/projects/rccl/test/common/ProcessIsolatedTestRunner.cpp @@ -402,6 +402,9 @@ bool ProcessIsolatedTestRunner::executeAllTests(const ExecutionOptions& options) continue; } + // Flush all output before fork to prevent child from inheriting unflushed buffers + fflush(NULL); + pid_t pid = fork(); if(pid == 0) @@ -435,6 +438,10 @@ bool ProcessIsolatedTestRunner::executeAllTests(const ExecutionOptions& options) { INFO("Running isolated test '%s' (PID: %d)\n", testConfig.name.c_str(), pid); } + // Flush parent's output before reading from child pipes to ensure proper ordering + fflush(stdout); + fflush(stderr); + int status; CapturedOutput output = captureProcessOutput(stdout_fd, stderr_fd, pid, &status); @@ -442,6 +449,9 @@ bool ProcessIsolatedTestRunner::executeAllTests(const ExecutionOptions& options) auto duration = std::chrono::duration_cast(endTime - startTime); + // Display captured output BEFORE status messages for proper sequencing + displayCapturedOutput(output, testConfig.name); + TestResult testResult; testResult.testName = testConfig.name; testResult.processId = pid; @@ -530,8 +540,6 @@ bool ProcessIsolatedTestRunner::executeAllTests(const ExecutionOptions& options) testResult.errorMessage = "Failed to wait for process"; } - displayCapturedOutput(output, testConfig.name); - recordTestResult(testResult); // Stop on first failure if requested