Added new unit test for register.cc (#1712)

* new unit test for register.cc

Signed-off-by: Deeksha Goplani <deeksha.goplani@amd.com>

* Add new register API tests

* Fix debug message ordering issue

---------

Signed-off-by: Deeksha Goplani <deeksha.goplani@amd.com>
Co-authored-by: Atul Kulkarni <atul.kulkarni@amd.com>

[ROCm/rccl commit: 420b3b840e]
This commit is contained in:
Deeksha Goplani
2026-01-10 04:34:01 +05:30
committato da GitHub
parent eb347a0dd3
commit ea1f021496
3 ha cambiato i file con 259 aggiunte e 2 eliminazioni
+2
Vedi File
@@ -153,6 +153,7 @@ if(BUILD_TESTS)
NonBlockingTests.cpp
ReduceScatterTests.cpp
ReduceTests.cpp
RegisterTests.cpp
ScatterTests.cpp
SendRecvTests.cpp
StandaloneTests.cpp
@@ -163,6 +164,7 @@ if(BUILD_TESTS)
common/EnvVars.cpp
common/PrepDataFuncs.cpp
common/PtrUnion.cpp
common/ProcessIsolatedTestRunner.cpp
common/TestBed.cpp
common/TestBedChild.cpp
common/StandaloneUtils.cpp
+247
Vedi File
@@ -0,0 +1,247 @@
/*************************************************************************
* Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <gtest/gtest.h>
#include <rccl/rccl.h>
#include <cstdlib>
#include <cstdio>
#include "common/ErrCode.hpp"
#include "common/ProcessIsolatedTestRunner.hpp"
#include "StandaloneUtils.hpp"
namespace RcclUnitTesting
{
// Helper to check GPU availability
static bool hasGpuAvailable() {
int numDevices = 0;
hipError_t err = hipGetDeviceCount(&numDevices);
return (err == hipSuccess && numDevices >= 1);
}
// Macro to skip test if no GPU is available
#define SKIP_IF_NO_GPU() \
do { \
if (!hasGpuAvailable()) { \
GTEST_SKIP() << "This test requires at least 1 GPU device."; \
return; \
} \
} while(0)
// Helper to initialize a single-rank communicator
static ncclResult_t initSingleRankComm(ncclComm_t* comm) {
ncclUniqueId id;
ncclResult_t res = ncclGetUniqueId(&id);
if (res != ncclSuccess) return res;
return ncclCommInitRank(comm, 1, id, 0);
}
//==============================================================================
// Test implementation functions - parameterized by registration expectation
//==============================================================================
/**
* @brief Test basic register/deregister of a single buffer
* @param expectNonNull If true, expect non-NULL handle (registration enabled)
*/
static void testCommRegisterDeregister(bool expectNonNull) {
SKIP_IF_NO_GPU();
HIPCALL(hipSetDevice(0));
ncclComm_t comm;
NCCLCHECK(initSingleRankComm(&comm));
// Create buffer on device
const size_t bufferSize = 1024 * 1024; // 1 MB
void* deviceBuffer = nullptr;
HIPCALL(hipMalloc(&deviceBuffer, bufferSize));
ASSERT_NE(deviceBuffer, nullptr) << "Failed to allocate device buffer";
// Register buffer with ncclCommRegister
void* regHandle = nullptr;
NCCLCHECK(ncclCommRegister(comm, deviceBuffer, bufferSize, &regHandle));
// Verify handle based on expected behavior
if (expectNonNull) {
EXPECT_NE(regHandle, nullptr)
<< "Buffer registration failed: regHandle is NULL even though NCCL_LOCAL_REGISTER=1";
} else {
EXPECT_EQ(regHandle, nullptr)
<< "Expected NULL handle when NCCL_LOCAL_REGISTER is disabled";
}
// Deregister and clean up
NCCLCHECK(ncclCommDeregister(comm, regHandle));
HIPCALL(hipFree(deviceBuffer));
NCCLCHECK(ncclCommDestroy(comm));
}
/**
* @brief Test registering multiple buffers simultaneously
* @param expectNonNull If true, expect non-NULL handles and verify uniqueness
*/
static void testMultipleBufferRegistration(bool expectNonNull) {
SKIP_IF_NO_GPU();
HIPCALL(hipSetDevice(0));
ncclComm_t comm;
NCCLCHECK(initSingleRankComm(&comm));
// Create and register multiple buffers
const int numBuffers = 4;
const size_t bufferSize = 64 * 1024; // 64 KB each
void* deviceBuffers[numBuffers] = {nullptr};
void* regHandles[numBuffers] = {nullptr};
for (int i = 0; i < numBuffers; i++) {
HIPCALL(hipMalloc(&deviceBuffers[i], bufferSize));
ASSERT_NE(deviceBuffers[i], nullptr) << "Failed to allocate buffer " << i;
NCCLCHECK(ncclCommRegister(comm, deviceBuffers[i], bufferSize, &regHandles[i]));
if (expectNonNull) {
EXPECT_NE(regHandles[i], nullptr) << "Registration failed for buffer " << i;
} else {
EXPECT_EQ(regHandles[i], nullptr) << "Expected NULL handle for buffer " << i;
}
}
// Verify all handles are unique (only when registration is enabled)
if (expectNonNull) {
for (int i = 0; i < numBuffers; i++) {
for (int j = i + 1; j < numBuffers; j++) {
if (regHandles[i] != nullptr && regHandles[j] != nullptr) {
EXPECT_NE(regHandles[i], regHandles[j])
<< "Buffers " << i << " and " << j << " have the same registration handle";
}
}
}
}
// Deregister and clean up
for (int i = 0; i < numBuffers; i++) {
NCCLCHECK(ncclCommDeregister(comm, regHandles[i]));
HIPCALL(hipFree(deviceBuffers[i]));
}
NCCLCHECK(ncclCommDestroy(comm));
}
/**
* @brief Test registering buffers of various sizes
* @param expectNonNull If true, expect non-NULL handles for all sizes
*/
static void testVariableSizeBuffers(bool expectNonNull) {
SKIP_IF_NO_GPU();
HIPCALL(hipSetDevice(0));
ncclComm_t comm;
NCCLCHECK(initSingleRankComm(&comm));
// Test various buffer sizes: 4KB, 64KB, 1MB, 4MB
const size_t sizes[] = {4096, 64 * 1024, 1024 * 1024, 4 * 1024 * 1024};
const int numSizes = sizeof(sizes) / sizeof(sizes[0]);
for (int i = 0; i < numSizes; i++) {
void* deviceBuffer = nullptr;
void* regHandle = nullptr;
HIPCALL(hipMalloc(&deviceBuffer, sizes[i]));
ASSERT_NE(deviceBuffer, nullptr) << "Failed to allocate buffer of size " << sizes[i];
NCCLCHECK(ncclCommRegister(comm, deviceBuffer, sizes[i], &regHandle));
if (expectNonNull) {
EXPECT_NE(regHandle, nullptr)
<< "Registration failed for buffer size " << sizes[i] << " bytes";
} else {
EXPECT_EQ(regHandle, nullptr)
<< "Expected NULL handle for buffer size " << sizes[i] << " bytes";
}
NCCLCHECK(ncclCommDeregister(comm, regHandle));
HIPCALL(hipFree(deviceBuffer));
}
NCCLCHECK(ncclCommDestroy(comm));
}
/**
* @brief Test deregistering NULL handle (should succeed as no-op)
*/
static void testDeregisterNullHandle() {
SKIP_IF_NO_GPU();
HIPCALL(hipSetDevice(0));
ncclComm_t comm;
NCCLCHECK(initSingleRankComm(&comm));
// Deregister NULL handle - should be a no-op
NCCLCHECK(ncclCommDeregister(comm, nullptr));
NCCLCHECK(ncclCommDestroy(comm));
}
//==============================================================================
// Test configuration helpers
//==============================================================================
// Environment configuration for disabled registration (default)
static ProcessIsolatedTestRunner::TestConfig
makeDisabledConfig(const std::string& name, std::function<void()> testFn) {
return ProcessIsolatedTestRunner::TestConfig(name, testFn)
.clearVariable("NCCL_LOCAL_REGISTER");
}
// Environment configuration for enabled registration
static ProcessIsolatedTestRunner::TestConfig
makeEnabledConfig(const std::string& name, std::function<void()> testFn) {
return ProcessIsolatedTestRunner::TestConfig(name, testFn)
.withEnvironment({{"NCCL_LOCAL_REGISTER", "1"}});
}
/**
* @brief Test ncclCommRegister and ncclCommDeregister APIs with process isolation
*
* This test suite verifies that:
* 1. A device buffer can be registered with ncclCommRegister (API returns success)
* 2. When NCCL_LOCAL_REGISTER=1, the registration returns a valid (non-NULL) handle
* 3. When NCCL_LOCAL_REGISTER is not set, NULL handle is expected (default behavior)
* 4. The buffer can be deregistered with ncclCommDeregister
*
* Note: NCCL_LOCAL_REGISTER defaults to 0 (disabled) in RCCL.
*/
TEST(Register, ProcessIsolatedRegisterTests)
{
RUN_ISOLATED_TESTS(
// CommRegisterDeregister tests
makeDisabledConfig("CommRegisterDeregister_Disabled",
[]() { testCommRegisterDeregister(false); }),
makeEnabledConfig("CommRegisterDeregister_Enabled",
[]() { testCommRegisterDeregister(true); }),
// MultipleBufferRegistration tests
makeDisabledConfig("MultipleBufferRegistration_Disabled",
[]() { testMultipleBufferRegistration(false); }),
makeEnabledConfig("MultipleBufferRegistration_Enabled",
[]() { testMultipleBufferRegistration(true); }),
// VariableSizeBuffers tests
makeDisabledConfig("VariableSizeBuffers_Disabled",
[]() { testVariableSizeBuffers(false); }),
makeEnabledConfig("VariableSizeBuffers_Enabled",
[]() { testVariableSizeBuffers(true); }),
// DeregisterNullHandle test (no enable/disable variants needed)
ProcessIsolatedTestRunner::TestConfig("DeregisterNullHandle", testDeregisterNullHandle)
);
}
} // namespace RcclUnitTesting
@@ -402,6 +402,9 @@ bool ProcessIsolatedTestRunner::executeAllTests(const ExecutionOptions& options)
continue;
}
// Flush all output before fork to prevent child from inheriting unflushed buffers
fflush(NULL);
pid_t pid = fork();
if(pid == 0)
@@ -435,6 +438,10 @@ bool ProcessIsolatedTestRunner::executeAllTests(const ExecutionOptions& options)
{
INFO("Running isolated test '%s' (PID: %d)\n", testConfig.name.c_str(), pid);
}
// Flush parent's output before reading from child pipes to ensure proper ordering
fflush(stdout);
fflush(stderr);
int status;
CapturedOutput output = captureProcessOutput(stdout_fd, stderr_fd, pid, &status);
@@ -442,6 +449,9 @@ bool ProcessIsolatedTestRunner::executeAllTests(const ExecutionOptions& options)
auto duration
= std::chrono::duration_cast<std::chrono::milliseconds>(endTime - startTime);
// Display captured output BEFORE status messages for proper sequencing
displayCapturedOutput(output, testConfig.name);
TestResult testResult;
testResult.testName = testConfig.name;
testResult.processId = pid;
@@ -530,8 +540,6 @@ bool ProcessIsolatedTestRunner::executeAllTests(const ExecutionOptions& options)
testResult.errorMessage = "Failed to wait for process";
}
displayCapturedOutput(output, testConfig.name);
recordTestResult(testResult);
// Stop on first failure if requested