diff --git a/projects/rccl/test/AllocTests.cpp b/projects/rccl/test/AllocTests.cpp index 15178c044f..5b01bebd8a 100644 --- a/projects/rccl/test/AllocTests.cpp +++ b/projects/rccl/test/AllocTests.cpp @@ -4,150 +4,187 @@ * See LICENSE.txt for license information ************************************************************************/ +#include #include #include -#include #include "TestBed.hpp" +#include "common/ErrCode.hpp" +#include "common/ProcessIsolatedTestRunner.hpp" template ncclResult_t ncclCudaMemcpy(float*, float*, size_t); + namespace RcclUnitTesting { - TEST(Alloc, ncclIbMallocDebugNonZero) { - void* ptr = nullptr; - size_t size = 4096; +TEST(Alloc, ncclIbMallocDebugNonZero) +{ + void* ptr = nullptr; + size_t size = 4096; - ncclResult_t result = ncclIbMalloc(&ptr, size); + ncclResult_t result = ncclIbMalloc(&ptr, size); - EXPECT_EQ(result, ncclSuccess); - ASSERT_NE(ptr, nullptr); + EXPECT_EQ(result, ncclSuccess); + ASSERT_NE(ptr, nullptr); - char* char_ptr = static_cast(ptr); - for (size_t i = 0; i < size; ++i) { - ASSERT_EQ(char_ptr[i], 0); - } - - free(ptr); - } - - TEST(Alloc, ncclIbMallocDebugZeroSize) { - void* ptr = (void*)0xdeadbeef; - ncclResult_t result = ncclIbMalloc(&ptr, 0); - - EXPECT_EQ(result, ncclSuccess); - EXPECT_EQ(ptr, nullptr); - } - - - TEST(Alloc, ncclCuMemHostAlloc) { - void* ptr = NULL; - void* handle = NULL; - size_t size = 1024; - ncclResult_t result = ncclCuMemHostAlloc(&ptr, handle, size); - ASSERT_EQ(result, ncclInternalError); - } - - TEST(Alloc, ncclCuMemHostFree) + char* char_ptr = static_cast(ptr); + for(size_t i = 0; i < size; ++i) { - void* dummyPtr = reinterpret_cast(0x1234); // any dummy address - ncclResult_t result = ncclCuMemHostFree(dummyPtr); - ASSERT_EQ(result, ncclInternalError); + ASSERT_EQ(char_ptr[i], 0); } + free(ptr); +} + +TEST(Alloc, ncclIbMallocDebugZeroSize) +{ + void* ptr = (void*)0xdeadbeef; + ncclResult_t result = ncclIbMalloc(&ptr, 0); + + EXPECT_EQ(result, ncclSuccess); + EXPECT_EQ(ptr, nullptr); 
+} + +TEST(Alloc, ncclCuMemHostAlloc) +{ + void* ptr = NULL; + void* handle = NULL; + size_t size = 1024; + ncclResult_t result = ncclCuMemHostAlloc(&ptr, handle, size); + ASSERT_EQ(result, ncclInternalError); +} + +TEST(Alloc, ncclCuMemHostFree) +{ + void* dummyPtr = reinterpret_cast(0x1234); // any dummy address + ncclResult_t result = ncclCuMemHostFree(dummyPtr); + ASSERT_EQ(result, ncclInternalError); +} + #if ROCM_VERSION < 70000 - // This test is only valid for ROCm versions < 7.0.0 - // In ROCm 7.0.0+, the ncclCuMemAlloc signature changed - TEST(Alloc, ncclCuMemAlloc) - { - void* ptr = reinterpret_cast(0x1234); // dummy non-null input - void* handle = reinterpret_cast(0x5678); // dummy non-null input - size_t size = 1024; - hipMemAllocationHandleType type = hipMemHandleTypeNone; - ncclResult_t result = ncclCuMemAlloc(&ptr, &handle, type, size); - EXPECT_EQ(result, ncclInternalError); - } +// This test is only valid for ROCm versions < 7.0.0 +// In ROCm 7.0.0+, the ncclCuMemAlloc signature changed +TEST(Alloc, ncclCuMemAlloc) +{ + void* ptr = reinterpret_cast(0x1234); // dummy non-null input + void* handle = reinterpret_cast(0x5678); // dummy non-null input + size_t size = 1024; + hipMemAllocationHandleType type = hipMemHandleTypeNone; + ncclResult_t result = ncclCuMemAlloc(&ptr, &handle, type, size); + EXPECT_EQ(result, ncclInternalError); +} - TEST(Alloc, ncclCuMemFree) - { - void* dummyPtr = reinterpret_cast(0xdeadbeef); // arbitrary non-null - ncclResult_t result = ncclCuMemFree(dummyPtr); - EXPECT_EQ(result, ncclInternalError); - } +TEST(Alloc, ncclCuMemFree) +{ + void* dummyPtr = reinterpret_cast(0xdeadbeef); // arbitrary non-null + ncclResult_t result = ncclCuMemFree(dummyPtr); + EXPECT_EQ(result, ncclInternalError); +} - TEST(Alloc, ncclCuMemAllocAddr) - { - void* ptr = reinterpret_cast(0x1111); // Dummy non-null input - hipMemGenericAllocationHandle_t handle = reinterpret_cast(0x1234); - size_t size = 4096; - ncclResult_t result = 
ncclCuMemAllocAddr(&ptr, &handle, size); - ASSERT_EQ(result, ncclInternalError); - } +TEST(Alloc, ncclCuMemAllocAddr) +{ + void* ptr = reinterpret_cast(0x1111); // Dummy non-null input + hipMemGenericAllocationHandle_t handle + = reinterpret_cast(0x1234); + size_t size = 4096; + ncclResult_t result = ncclCuMemAllocAddr(&ptr, &handle, size); + ASSERT_EQ(result, ncclInternalError); +} - TEST(Alloc, ncclCuMemFreeAddr) - { - void* testPtr = reinterpret_cast(0xbeefcafe); // Arbitrary non-null pointer - ncclResult_t result = ncclCuMemFreeAddr(testPtr); - ASSERT_EQ(result, ncclInternalError); - } +TEST(Alloc, ncclCuMemFreeAddr) +{ + void* testPtr = reinterpret_cast(0xbeefcafe); // Arbitrary non-null pointer + ncclResult_t result = ncclCuMemFreeAddr(testPtr); + ASSERT_EQ(result, ncclInternalError); +} #endif // ROCM_VERSION < 70000 - TEST(Alloc, NcclCudaMemcpy) { - constexpr size_t N = 128; - float *d_src = nullptr, *d_dst = nullptr; - float h_src[N], h_dst[N]; +TEST(Alloc, NcclCudaMemcpy) +{ + RUN_ISOLATED_TEST( + "NcclCudaMemcpy", + []() + { + constexpr size_t N = 128; + float * d_src = nullptr, *d_dst = nullptr; + float h_src[N], h_dst[N]; - for (size_t i = 0; i < N; ++i) h_src[i] = static_cast(i + 1); - // Allocate device memory + for(size_t i = 0; i < N; ++i) + h_src[i] = static_cast(i + 1); + // Allocate device memory - ASSERT_EQ(hipMalloc(&d_src, N * sizeof(float)), hipSuccess); - ASSERT_EQ(hipMalloc(&d_dst, N * sizeof(float)), hipSuccess); + ASSERT_EQ(hipMalloc(&d_src, N * sizeof(float)), hipSuccess); + ASSERT_EQ(hipMalloc(&d_dst, N * sizeof(float)), hipSuccess); - // Copy from host to device (source buffer) - ASSERT_EQ(hipMemcpy(d_src, h_src, N * sizeof(float), hipMemcpyHostToDevice), hipSuccess); + // Copy from host to device (source buffer) + ASSERT_EQ( + hipMemcpy(d_src, h_src, N * sizeof(float), hipMemcpyHostToDevice), + hipSuccess + ); - // Perform the tested function - ncclResult_t result = ncclCudaMemcpy(d_dst, d_src, N); + // Perform the tested function + 
ncclResult_t result = ncclCudaMemcpy(d_dst, d_src, N); - ASSERT_EQ(result, ncclSuccess); // Fixed typo: was ncclSsuccess + ASSERT_EQ(result, ncclSuccess); - // Copy result back to host - ASSERT_EQ(hipMemcpy(h_dst, d_dst, N * sizeof(float), hipMemcpyDeviceToHost), hipSuccess); + // Copy result back to host + ASSERT_EQ( + hipMemcpy(h_dst, d_dst, N * sizeof(float), hipMemcpyDeviceToHost), + hipSuccess + ); - // Check correctness - for (size_t i = 0; i < N; ++i) { - EXPECT_EQ(h_src[i], h_dst[i]) << "Mismatch at index " << i; + // Check correctness + for(size_t i = 0; i < N; ++i) + { + EXPECT_EQ(h_src[i], h_dst[i]) << "Mismatch at index " << i; + } + // Free memory + hipFree(d_src); + hipFree(d_dst); } - // Free memory - hipFree(d_src); - hipFree(d_dst); + ); +} - } +TEST(Alloc, ZeroElementMemcpy) +{ + RUN_ISOLATED_TEST( + "ZeroElementMemcpy", + []() + { + float *d_src = nullptr, *d_dst = nullptr; + ASSERT_EQ(hipMalloc(&d_src, sizeof(float)), hipSuccess); + ASSERT_EQ(hipMalloc(&d_dst, sizeof(float)), hipSuccess); - TEST(Alloc, ZeroElementMemcpy) { - float *d_src = nullptr, *d_dst = nullptr; - ASSERT_EQ(hipMalloc(&d_src, sizeof(float)), hipSuccess); - ASSERT_EQ(hipMalloc(&d_dst, sizeof(float)), hipSuccess); + ncclResult_t result = ncclCudaMemcpy(d_dst, d_src, 0); + EXPECT_EQ(result, ncclSuccess) << "Zero-element copy should succeed (no-op)"; - ncclResult_t result = ncclCudaMemcpy(d_dst, d_src, 0); - EXPECT_EQ(result, ncclSuccess) << "Zero-element copy should succeed (no-op)"; + hipFree(d_src); + hipFree(d_dst); + } + ); +} - hipFree(d_src); - hipFree(d_dst); - } +TEST(Alloc, MemcpyNullSrcOrDstPointer) +{ + RUN_ISOLATED_TEST( + "MemcpyNullSrcOrDstPointer", + []() + { + constexpr size_t N = 16; + float* d_valid = nullptr; + ASSERT_EQ(hipMalloc(&d_valid, N * sizeof(float)), hipSuccess); - TEST(Alloc, MemcpyNullSrcOrDstPointer) { - constexpr size_t N = 16; - float* d_valid = nullptr; - ASSERT_EQ(hipMalloc(&d_valid, N * sizeof(float)), hipSuccess); + // Case 1: src is nullptr 
+ ncclResult_t result = ncclCudaMemcpy(d_valid, nullptr, N); + EXPECT_EQ(result, ncclUnhandledCudaError) + << "Expected ncclUnhandledCudaError when src is nullptr"; - // Case 1: src is nullptr - ncclResult_t result = ncclCudaMemcpy(d_valid, nullptr, N); - EXPECT_EQ(result, ncclUnhandledCudaError) << "Expected ncclUnhandledCudaError when src is nullptr"; + // Case 2: dst is nullptr + result = ncclCudaMemcpy(nullptr, d_valid, N); + EXPECT_EQ(result, ncclUnhandledCudaError) + << "Expected ncclUnhandledCudaError when dst is nullptr"; - // Case 2: dst is nullptr - result = ncclCudaMemcpy(nullptr, d_valid, N); - EXPECT_EQ(result, ncclUnhandledCudaError) << "Expected ncclUnhandledCudaError when dst is nullptr"; - - hipFree(d_valid); - } -} //namespace rccl \ No newline at end of file + hipFree(d_valid); + } + ); +} +} // namespace RcclUnitTesting \ No newline at end of file diff --git a/projects/rccl/test/ArgCheckTests.cpp b/projects/rccl/test/ArgCheckTests.cpp index dd9bc9bcf3..06c7f00b02 100644 --- a/projects/rccl/test/ArgCheckTests.cpp +++ b/projects/rccl/test/ArgCheckTests.cpp @@ -4,324 +4,626 @@ * See LICENSE.txt for license information ************************************************************************/ #include +#include #include "argcheck.h" #include "comm.h" -#include +#include "common/ErrCode.hpp" +#include "common/ProcessIsolatedTestRunner.hpp" -class ArgCheckTest : public ::testing::Test { -protected: - ncclComm_t comm; - struct ncclInfo *info; - int *sendDevicePtr = nullptr; - int *recvDevicePtr = nullptr; +// Helper struct for ArgCheck tests (NOT a fixture - used inside isolated tests) +struct ArgCheckTestEnvironment +{ + ncclComm_t comm; + struct ncclInfo* info; + int* sendDevicePtr = nullptr; + int* recvDevicePtr = nullptr; - // Helper function to set up valid ncclInfo for boundary testing - void SetupValidInfo() { - // Set up valid info structure - info->comm = comm; - info->root = 0; // Valid root - info->datatype = (ncclDataType_t)0; // Valid 
datatype - info->op = (ncclRedOp_t)0; // Valid reduction operation - info->coll = ncclFuncBroadcast; // Valid collective operation - info->sendbuff = nullptr; // Will be set per test if needed - info->recvbuff = nullptr; // Will be set per test if needed - info->count = 10; // Valid count - info->opName = "TestOp"; // Valid operation name - } - - // Helper function for tests requiring device memory - void SetupValidBufferWithDeviceMemory() { - // Set the active device to match comm->cudaDev - hipError_t errSetDevice = hipSetDevice(comm->cudaDev); - ASSERT_EQ(errSetDevice, hipSuccess); - - // Allocate device memory - hipError_t errSend = hipMalloc(&sendDevicePtr, sizeof(int)); - ASSERT_EQ(errSend, hipSuccess); - hipError_t errRecv = hipMalloc(&recvDevicePtr, sizeof(int)); - ASSERT_EQ(errRecv, hipSuccess); - - // Set device pointers - info->sendbuff = sendDevicePtr; - info->recvbuff = recvDevicePtr; - } - - // Helper to clean up device memory - void CleanupDeviceMemory() { - if (sendDevicePtr) { - hipFree(sendDevicePtr); - sendDevicePtr = nullptr; + // Helper function to set up valid ncclInfo for boundary testing + void SetupValidInfo() + { + // Set up valid info structure + info->comm = comm; + info->root = 0; // Valid root + info->datatype = (ncclDataType_t)0; // Valid datatype + info->op = (ncclRedOp_t)0; // Valid reduction operation + info->coll = ncclFuncBroadcast; // Valid collective operation + info->sendbuff = nullptr; // Will be set per test if needed + info->recvbuff = nullptr; // Will be set per test if needed + info->count = 10; // Valid count + info->opName = "TestOp"; // Valid operation name } - if (recvDevicePtr) { - hipFree(recvDevicePtr); - recvDevicePtr = nullptr; + + // Helper function for tests requiring device memory + void SetupValidBufferWithDeviceMemory() + { + // Set the active device to match comm->cudaDev + hipError_t errSetDevice = hipSetDevice(comm->cudaDev); + ASSERT_EQ(errSetDevice, hipSuccess); + + // Allocate device memory + 
hipError_t errSend = hipMalloc(&sendDevicePtr, sizeof(int)); + ASSERT_EQ(errSend, hipSuccess); + hipError_t errRecv = hipMalloc(&recvDevicePtr, sizeof(int)); + ASSERT_EQ(errRecv, hipSuccess); + + // Set device pointers + info->sendbuff = sendDevicePtr; + info->recvbuff = recvDevicePtr; } - } - void SetUp() override { - // Allocate and zero-initialize ncclComm as a pointer - comm = (struct ncclComm *)calloc(1, sizeof(struct ncclComm)); - ASSERT_NE(comm, nullptr) << "Failed to allocate ncclComm"; - - // Initialize the communicator with required fields - comm->cudaDev = 0; - comm->nRanks = 4; - comm->checkPointers = true; - comm->rank = 0; - - comm->startMagic = NCCL_MAGIC; - comm->endMagic = NCCL_MAGIC; - - // Verify the magic values were set correctly - ASSERT_EQ(comm->startMagic, NCCL_MAGIC) << "startMagic not set correctly"; - ASSERT_EQ(comm->endMagic, NCCL_MAGIC) << "endMagic not set correctly"; - - // Allocate and zero-initialize ncclInfo as a pointer - info = (ncclInfo *)calloc(1, sizeof(ncclInfo)); - ASSERT_NE(info, nullptr) << "Failed to allocate ncclInfo"; - - SetupValidInfo(); - - SetupValidBufferWithDeviceMemory(); - } - - void TearDown() override { - // Free the allocated memory - CleanupDeviceMemory(); - if (info) { - free(info); - info = nullptr; + // Helper to clean up device memory + void CleanupDeviceMemory() + { + if(sendDevicePtr) + { + hipFree(sendDevicePtr); + sendDevicePtr = nullptr; + } + if(recvDevicePtr) + { + hipFree(recvDevicePtr); + recvDevicePtr = nullptr; + } } - if (comm) { - free(comm); - comm = nullptr; + + void setup() + { + // Allocate and zero-initialize ncclComm as a pointer + comm = (struct ncclComm*)calloc(1, sizeof(struct ncclComm)); + ASSERT_NE(comm, nullptr) << "Failed to allocate ncclComm"; + + // Initialize the communicator with required fields + comm->cudaDev = 0; + comm->nRanks = 4; + comm->checkPointers = true; + comm->rank = 0; + + comm->startMagic = NCCL_MAGIC; + comm->endMagic = NCCL_MAGIC; + + // Verify the magic 
values were set correctly + ASSERT_EQ(comm->startMagic, NCCL_MAGIC) << "startMagic not set correctly"; + ASSERT_EQ(comm->endMagic, NCCL_MAGIC) << "endMagic not set correctly"; + + // Allocate and zero-initialize ncclInfo as a pointer + info = (ncclInfo*)calloc(1, sizeof(ncclInfo)); + ASSERT_NE(info, nullptr) << "Failed to allocate ncclInfo"; + + SetupValidInfo(); + + SetupValidBufferWithDeviceMemory(); + } + + void cleanup() + { + // Free the allocated memory + CleanupDeviceMemory(); + if(info) + { + free(info); + info = nullptr; + } + if(comm) + { + free(comm); + comm = nullptr; + } } - } }; -TEST_F(ArgCheckTest, CudaPtrCheck_ValidPointer) { - int *devicePtr = nullptr; - hipError_t err = hipMalloc(&devicePtr, sizeof(int)); - ASSERT_EQ(err, hipSuccess); +TEST(ArgCheckTest, CudaPtrCheck_ValidPointer) +{ + RUN_ISOLATED_TEST( + "CudaPtrCheck_ValidPointer", + []() + { + ArgCheckTestEnvironment env; + env.setup(); - ncclResult_t result = CudaPtrCheck(devicePtr, comm, "devicePtr", "TestOp"); - EXPECT_EQ(result, ncclSuccess); + int* devicePtr = nullptr; + hipError_t err = hipMalloc(&devicePtr, sizeof(int)); + ASSERT_EQ(err, hipSuccess); - hipFree(devicePtr); + ncclResult_t result = CudaPtrCheck(devicePtr, env.comm, "devicePtr", "TestOp"); + EXPECT_EQ(result, ncclSuccess); + + hipFree(devicePtr); + env.cleanup(); + INFO("Test 'CudaPtrCheck_ValidPointer' PASSED\n"); + } + ); } -TEST_F(ArgCheckTest, CudaPtrCheck_NullPointer) { - ncclResult_t result = CudaPtrCheck(nullptr, comm, "invalidPtr", "TestOp"); - EXPECT_EQ(result, ncclInvalidArgument); +TEST(ArgCheckTest, CudaPtrCheck_NullPointer) +{ + RUN_ISOLATED_TEST( + "CudaPtrCheck_NullPointer", + []() + { + ArgCheckTestEnvironment env; + env.setup(); + + ncclResult_t result = CudaPtrCheck(nullptr, env.comm, "invalidPtr", "TestOp"); + EXPECT_EQ(result, ncclInvalidArgument); + + env.cleanup(); + INFO("Test 'CudaPtrCheck_NullPointer' PASSED\n"); + } + ); } -TEST_F(ArgCheckTest, CudaPtrCheck_DifferentDevicePointer) { - int 
*devicePtr = nullptr; - hipSetDevice(1); - hipError_t err = hipMalloc(&devicePtr, sizeof(int)); - ASSERT_EQ(err, hipSuccess); +TEST(ArgCheckTest, CudaPtrCheck_DifferentDevicePointer) +{ + RUN_ISOLATED_TEST( + "CudaPtrCheck_DifferentDevicePointer", + []() + { + ArgCheckTestEnvironment env; + env.setup(); - ncclResult_t result = CudaPtrCheck(devicePtr, comm, "devicePtr", "TestOp"); - EXPECT_EQ(result, ncclInvalidArgument); + int* devicePtr = nullptr; + hipSetDevice(1); + hipError_t err = hipMalloc(&devicePtr, sizeof(int)); + ASSERT_EQ(err, hipSuccess); - hipFree(devicePtr); - hipSetDevice(comm->cudaDev); + ncclResult_t result = CudaPtrCheck(devicePtr, env.comm, "devicePtr", "TestOp"); + EXPECT_EQ(result, ncclInvalidArgument); + + hipFree(devicePtr); + hipSetDevice(env.comm->cudaDev); + + env.cleanup(); + INFO("Test 'CudaPtrCheck_DifferentDevicePointer' PASSED\n"); + } + ); } -TEST_F(ArgCheckTest, CudaPtrCheck_HostMemoryPointer) { - // Test with host memory instead of device memory - int *hostPtr = (int *)malloc(sizeof(int)); - ASSERT_NE(hostPtr, nullptr) << "Failed to allocate host memory"; +TEST(ArgCheckTest, CudaPtrCheck_HostMemoryPointer) +{ + RUN_ISOLATED_TEST( + "CudaPtrCheck_HostMemoryPointer", + []() + { + ArgCheckTestEnvironment env; + env.setup(); - *hostPtr = 42; // Initialize the memory + // Test with host memory instead of device memory + int* hostPtr = (int*)malloc(sizeof(int)); + ASSERT_NE(hostPtr, nullptr) << "Failed to allocate host memory"; - // This should fail because host memory is not device memory - ncclResult_t result = CudaPtrCheck(hostPtr, comm, "hostPtr", "TestOp"); + *hostPtr = 42; // Initialize the memory - // Host memory should be rejected by CudaPtrCheck - EXPECT_EQ(result, ncclInvalidArgument) - << "Host memory should be rejected by CudaPtrCheck"; + // This should fail because host memory is not device memory + ncclResult_t result = CudaPtrCheck(hostPtr, env.comm, "hostPtr", "TestOp"); - free(hostPtr); + // Host memory should be 
rejected by CudaPtrCheck + EXPECT_EQ(result, ncclInvalidArgument) + << "Host memory should be rejected by CudaPtrCheck"; + + free(hostPtr); + + env.cleanup(); + INFO("Test 'CudaPtrCheck_HostMemoryPointer' PASSED\n"); + } + ); } -TEST_F(ArgCheckTest, PtrCheck_ValidPointer) { - int value = 42; - ncclResult_t result = PtrCheck(&value, "TestOp", "value"); - ASSERT_EQ(result, ncclSuccess); +TEST(ArgCheckTest, PtrCheck_ValidPointer) +{ + RUN_ISOLATED_TEST( + "PtrCheck_ValidPointer", + []() + { + int value = 42; + ncclResult_t result = PtrCheck(&value, "TestOp", "value"); + ASSERT_EQ(result, ncclSuccess); + INFO("Test 'PtrCheck_ValidPointer' PASSED\n"); + } + ); } -TEST_F(ArgCheckTest, PtrCheck_NullPointer) { - ncclResult_t result = PtrCheck(nullptr, "TestOp", "value"); - ASSERT_EQ(result, ncclInvalidArgument); +TEST(ArgCheckTest, PtrCheck_NullPointer) +{ + RUN_ISOLATED_TEST( + "PtrCheck_NullPointer", + []() + { + ncclResult_t result = PtrCheck(nullptr, "TestOp", "value"); + ASSERT_EQ(result, ncclInvalidArgument); + INFO("Test 'PtrCheck_NullPointer' PASSED\n"); + } + ); } -TEST_F(ArgCheckTest, CommCheck_ValidComm) { - comm->startMagic = NCCL_MAGIC; - comm->endMagic = NCCL_MAGIC; +TEST(ArgCheckTest, CommCheck_ValidComm) +{ + RUN_ISOLATED_TEST( + "CommCheck_ValidComm", + []() + { + ArgCheckTestEnvironment env; + env.setup(); - // Verify magic values are still correct (should be set in SetUp()) - ASSERT_EQ(comm->startMagic, NCCL_MAGIC) << "startMagic was corrupted"; - ASSERT_EQ(comm->endMagic, NCCL_MAGIC) << "endMagic was corrupted"; + env.comm->startMagic = NCCL_MAGIC; + env.comm->endMagic = NCCL_MAGIC; - // Call CommCheck and verify the result - ncclResult_t result = CommCheck(comm, "TestOp", "testComm"); - EXPECT_EQ(result, ncclSuccess) << "Failed for valid communicator"; + // Verify magic values are still correct (should be set in setup()) + ASSERT_EQ(env.comm->startMagic, NCCL_MAGIC) << "startMagic was corrupted"; + ASSERT_EQ(env.comm->endMagic, NCCL_MAGIC) << "endMagic 
was corrupted"; + + // Call CommCheck and verify the result + ncclResult_t result = CommCheck(env.comm, "TestOp", "testComm"); + EXPECT_EQ(result, ncclSuccess) << "Failed for valid communicator"; + + env.cleanup(); + INFO("Test 'CommCheck_ValidComm' PASSED\n"); + } + ); } -TEST_F(ArgCheckTest, CommCheck_NullComm) { - ncclResult_t result = CommCheck(nullptr, "TestOp", "comm"); - ASSERT_EQ(result, ncclInvalidArgument); +TEST(ArgCheckTest, CommCheck_NullComm) +{ + RUN_ISOLATED_TEST( + "CommCheck_NullComm", + []() + { + ncclResult_t result = CommCheck(nullptr, "TestOp", "comm"); + ASSERT_EQ(result, ncclInvalidArgument); + INFO("Test 'CommCheck_NullComm' PASSED\n"); + } + ); } -TEST_F(ArgCheckTest, CommCheck_CorruptedStartMagic) { - // Corrupt only startMagic, keep endMagic valid - comm->startMagic = 1; // Corrupt startMagic - comm->endMagic = NCCL_MAGIC; // Keep endMagic valid +TEST(ArgCheckTest, CommCheck_CorruptedStartMagic) +{ + RUN_ISOLATED_TEST( + "CommCheck_CorruptedStartMagic", + []() + { + ArgCheckTestEnvironment env; + env.setup(); - // Call CommCheck and verify the result - ncclResult_t result = CommCheck(comm, "TestOp", "comm"); - EXPECT_EQ(result, ncclInvalidArgument) << "Failed for corrupted startMagic"; + // Corrupt only startMagic, keep endMagic valid + env.comm->startMagic = 1; // Corrupt startMagic + env.comm->endMagic = NCCL_MAGIC; // Keep endMagic valid + + // Call CommCheck and verify the result + ncclResult_t result = CommCheck(env.comm, "TestOp", "comm"); + EXPECT_EQ(result, ncclInvalidArgument) << "Failed for corrupted startMagic"; + + env.cleanup(); + INFO("Test 'CommCheck_CorruptedStartMagic' PASSED\n"); + } + ); } -TEST_F(ArgCheckTest, CommCheck_CorruptedEndMagic) { - // Keep startMagic valid, corrupt only endMagic - comm->startMagic = NCCL_MAGIC; // Keep startMagic valid - comm->endMagic = 1; // Corrupt endMagic +TEST(ArgCheckTest, CommCheck_CorruptedEndMagic) +{ + RUN_ISOLATED_TEST( + "CommCheck_CorruptedEndMagic", + []() + { + 
ArgCheckTestEnvironment env; + env.setup(); - // Call CommCheck and verify the result - ncclResult_t result = CommCheck(comm, "TestOp", "comm"); - EXPECT_EQ(result, ncclInvalidArgument) << "Failed for corrupted endMagic"; + // Keep startMagic valid, corrupt only endMagic + env.comm->startMagic = NCCL_MAGIC; // Keep startMagic valid + env.comm->endMagic = 1; // Corrupt endMagic + + // Call CommCheck and verify the result + ncclResult_t result = CommCheck(env.comm, "TestOp", "comm"); + EXPECT_EQ(result, ncclInvalidArgument) << "Failed for corrupted endMagic"; + + env.cleanup(); + INFO("Test 'CommCheck_CorruptedEndMagic' PASSED\n"); + } + ); } -TEST_F(ArgCheckTest, CommCheck_CorruptedBothMagics) { - // Corrupt both startMagic and endMagic - comm->startMagic = 1; // Corrupt startMagic - comm->endMagic = 1; // Corrupt endMagic +TEST(ArgCheckTest, CommCheck_CorruptedBothMagics) +{ + RUN_ISOLATED_TEST( + "CommCheck_CorruptedBothMagics", + []() + { + ArgCheckTestEnvironment env; + env.setup(); - // Call CommCheck and verify the result - ncclResult_t result = CommCheck(comm, "TestOp", "comm"); - EXPECT_EQ(result, ncclInvalidArgument) - << "Failed for corrupted both magic values"; + // Corrupt both startMagic and endMagic + env.comm->startMagic = 1; // Corrupt startMagic + env.comm->endMagic = 1; // Corrupt endMagic + + // Call CommCheck and verify the result + ncclResult_t result = CommCheck(env.comm, "TestOp", "comm"); + EXPECT_EQ(result, ncclInvalidArgument) << "Failed for corrupted both magic values"; + + env.cleanup(); + INFO("Test 'CommCheck_CorruptedBothMagics' PASSED\n"); + } + ); } -TEST_F(ArgCheckTest, ArgsCheck_InvalidRoot_NegativeValue) { - info->root = -1; // Invalid root (< 0) +TEST(ArgCheckTest, ArgsCheck_InvalidRoot_NegativeValue) +{ + RUN_ISOLATED_TEST( + "ArgsCheck_InvalidRoot_NegativeValue", + []() + { + ArgCheckTestEnvironment env; + env.setup(); - ncclResult_t result = ArgsCheck(info); - EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid root 
< 0"; + env.info->root = -1; // Invalid root (< 0) + + ncclResult_t result = ArgsCheck(env.info); + EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid root < 0"; + + env.cleanup(); + INFO("Test 'ArgsCheck_InvalidRoot_NegativeValue' PASSED\n"); + } + ); } -TEST_F(ArgCheckTest, ArgsCheck_InvalidRoot_ExceedsNRanks) { - info->root = comm->nRanks; // Invalid root (>= nRanks) +TEST(ArgCheckTest, ArgsCheck_InvalidRoot_ExceedsNRanks) +{ + RUN_ISOLATED_TEST( + "ArgsCheck_InvalidRoot_ExceedsNRanks", + []() + { + ArgCheckTestEnvironment env; + env.setup(); - ncclResult_t result = ArgsCheck(info); - EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid root >= nRanks"; + env.info->root = env.comm->nRanks; // Invalid root (>= nRanks) + + ncclResult_t result = ArgsCheck(env.info); + EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid root >= nRanks"; + + env.cleanup(); + INFO("Test 'ArgsCheck_InvalidRoot_ExceedsNRanks' PASSED\n"); + } + ); } -TEST_F(ArgCheckTest, ArgsCheck_InvalidDatatype_NegativeValue) { - info->datatype = (ncclDataType_t)-1; // Invalid datatype (< 0) +TEST(ArgCheckTest, ArgsCheck_InvalidDatatype_NegativeValue) +{ + RUN_ISOLATED_TEST( + "ArgsCheck_InvalidDatatype_NegativeValue", + []() + { + ArgCheckTestEnvironment env; + env.setup(); - ncclResult_t result = ArgsCheck(info); - EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid datatype < 0"; + env.info->datatype = (ncclDataType_t)-1; // Invalid datatype (< 0) + + ncclResult_t result = ArgsCheck(env.info); + EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid datatype < 0"; + + env.cleanup(); + INFO("Test 'ArgsCheck_InvalidDatatype_NegativeValue' PASSED\n"); + } + ); } -TEST_F(ArgCheckTest, ArgsCheck_InvalidDatatype_ExceedsMaxValue) { - info->datatype = - (ncclDataType_t)ncclNumTypes; // Invalid datatype (>= ncclNumTypes) +TEST(ArgCheckTest, ArgsCheck_InvalidDatatype_ExceedsMaxValue) +{ + RUN_ISOLATED_TEST( + "ArgsCheck_InvalidDatatype_ExceedsMaxValue", + []() + 
{ + ArgCheckTestEnvironment env; + env.setup(); - ncclResult_t result = ArgsCheck(info); - EXPECT_EQ(result, ncclInvalidArgument) - << "Failed for invalid datatype >= ncclNumTypes"; + env.info->datatype = (ncclDataType_t)ncclNumTypes; // Invalid datatype (>= ncclNumTypes) + + ncclResult_t result = ArgsCheck(env.info); + EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid datatype >= ncclNumTypes"; + + env.cleanup(); + INFO("Test 'ArgsCheck_InvalidDatatype_ExceedsMaxValue' PASSED\n"); + } + ); } -TEST_F(ArgCheckTest, ArgsCheck_InvalidReductionOperation_NegativeValue) { - info->op = (ncclRedOp_t)-1; // Invalid reduction operation (< 0) +TEST(ArgCheckTest, ArgsCheck_InvalidReductionOperation_NegativeValue) +{ + RUN_ISOLATED_TEST( + "ArgsCheck_InvalidReductionOperation_NegativeValue", + []() + { + ArgCheckTestEnvironment env; + env.setup(); - ncclResult_t result = ArgsCheck(info); - EXPECT_EQ(result, ncclInvalidArgument) - << "Failed for invalid reduction operation < 0"; + env.info->op = (ncclRedOp_t)-1; // Invalid reduction operation (< 0) + + ncclResult_t result = ArgsCheck(env.info); + EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid reduction operation < 0"; + + env.cleanup(); + INFO("Test 'ArgsCheck_InvalidReductionOperation_NegativeValue' PASSED\n"); + } + ); } -TEST_F(ArgCheckTest, ArgsCheck_InvalidReductionOperation_ExceedsMaxValue) { - info->op = - (ncclRedOp_t)ncclNumOps; // Invalid reduction operation (>= ncclNumOps) +TEST(ArgCheckTest, ArgsCheck_InvalidReductionOperation_ExceedsMaxValue) +{ + RUN_ISOLATED_TEST( + "ArgsCheck_InvalidReductionOperation_ExceedsMaxValue", + []() + { + ArgCheckTestEnvironment env; + env.setup(); - ncclResult_t result = ArgsCheck(info); - EXPECT_EQ(result, ncclInvalidArgument) - << "Failed for invalid reduction operation >= ncclNumOps"; + env.info->op = (ncclRedOp_t)ncclNumOps; // Invalid reduction operation (>= ncclNumOps) + + ncclResult_t result = ArgsCheck(env.info); + EXPECT_EQ(result, 
ncclInvalidArgument) + << "Failed for invalid reduction operation >= ncclNumOps"; + + env.cleanup(); + INFO("Test 'ArgsCheck_InvalidReductionOperation_ExceedsMaxValue' PASSED\n"); + } + ); } -TEST_F(ArgCheckTest, ArgsCheck_InvalidCommunicatorPointers) { - info->op = (ncclRedOp_t)0; // Valid reduction operation - if (info->sendbuff) { - hipFree((void *)info->sendbuff); - info->sendbuff = nullptr; // Invalid send buffer - } - if (info->recvbuff) { - hipFree((void *)info->recvbuff); - info->recvbuff = nullptr; // Invalid receive buffer - } +TEST(ArgCheckTest, ArgsCheck_InvalidCommunicatorPointers) +{ + RUN_ISOLATED_TEST( + "ArgsCheck_InvalidCommunicatorPointers", + []() + { + ArgCheckTestEnvironment env; + env.setup(); - ncclResult_t result = ArgsCheck(info); - EXPECT_EQ(result, ncclInvalidArgument) - << "Failed for invalid communicator pointers"; + env.info->op = (ncclRedOp_t)0; // Valid reduction operation + if(env.info->sendbuff) + { + hipFree((void*)env.info->sendbuff); + env.info->sendbuff = nullptr; // Invalid send buffer + } + if(env.info->recvbuff) + { + hipFree((void*)env.info->recvbuff); + env.info->recvbuff = nullptr; // Invalid receive buffer + } + + ncclResult_t result = ArgsCheck(env.info); + EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid communicator pointers"; + + env.cleanup(); + INFO("Test 'ArgsCheck_InvalidCommunicatorPointers' PASSED\n"); + } + ); } -TEST_F(ArgCheckTest, ArgsCheck_InvalidReductionOperationOutOfRange) { - info->op = (ncclRedOp_t)5; // Invalid reduction operation (out of range) +TEST(ArgCheckTest, ArgsCheck_InvalidReductionOperationOutOfRange) +{ + RUN_ISOLATED_TEST( + "ArgsCheck_InvalidReductionOperationOutOfRange", + []() + { + ArgCheckTestEnvironment env; + env.setup(); - ncclResult_t result = ArgsCheck(info); - EXPECT_EQ(result, ncclInvalidArgument) - << "Failed for invalid reduction operation"; + env.info->op = (ncclRedOp_t)5; // Invalid reduction operation (out of range) + + ncclResult_t result = 
ArgsCheck(env.info); + EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid reduction operation"; + + env.cleanup(); + INFO("Test 'ArgsCheck_InvalidReductionOperationOutOfRange' PASSED\n"); + } + ); } -TEST_F(ArgCheckTest, ArgsCheck_UserDefinedReductionOperationInvalid) { - // Test case: User-defined reduction operation with freeNext != -1 - info->op = (ncclRedOp_t)(ncclNumOps + - 1); // Set op to a user-defined reduction operation +TEST(ArgCheckTest, ArgsCheck_UserDefinedReductionOperationInvalid) +{ + RUN_ISOLATED_TEST( + "ArgsCheck_UserDefinedReductionOperationInvalid", + []() + { + ArgCheckTestEnvironment env; + env.setup(); - ncclResult_t result = ArgsCheck(info); - EXPECT_EQ(result, ncclInvalidArgument) - << "Failed for user-defined reduction operation with freeNext != -1"; + // Test case: User-defined reduction operation with freeNext != -1 + env.info->op + = (ncclRedOp_t)(ncclNumOps + 1); // Set op to a user-defined reduction operation + + ncclResult_t result = ArgsCheck(env.info); + EXPECT_EQ(result, ncclInvalidArgument) + << "Failed for user-defined reduction operation with freeNext != -1"; + + env.cleanup(); + INFO("Test 'ArgsCheck_UserDefinedReductionOperationInvalid' PASSED\n"); + } + ); } -TEST_F(ArgCheckTest, ArgsCheck_SendAndRecvFunction) { - info->recvbuff = - recvDevicePtr; // Use allocated device pointer for receive buffer +TEST(ArgCheckTest, ArgsCheck_SendAndRecvFunction) +{ + RUN_ISOLATED_TEST( + "ArgsCheck_SendAndRecvFunction", + []() + { + ArgCheckTestEnvironment env; + env.setup(); - // Test both ncclFuncSend and ncclFuncRecv - for (auto coll : {ncclFuncSend, ncclFuncRecv}) { - info->coll = coll; // Set the collective operation + env.info->recvbuff + = env.recvDevicePtr; // Use allocated device pointer for receive buffer - // Call ArgsCheck and verify the result - ncclResult_t result = ArgsCheck(info); - ASSERT_EQ(result, ncclSuccess) << "Failed for coll = " << coll; - } + // Test both ncclFuncSend and ncclFuncRecv + for(auto coll : 
{ncclFuncSend, ncclFuncRecv}) + { + env.info->coll = coll; // Set the collective operation + + // Call ArgsCheck and verify the result + ncclResult_t result = ArgsCheck(env.info); + ASSERT_EQ(result, ncclSuccess) << "Failed for coll = " << coll; + } + + env.cleanup(); + INFO("Test 'ArgsCheck_SendAndRecvFunction' PASSED\n"); + } + ); } -TEST_F(ArgCheckTest, ArgsCheck_CollNotReduce) { - // Case: info->coll != ncclFuncReduce - info->coll = ncclFuncBroadcast; // Set coll to ncclFuncBroadcast +TEST(ArgCheckTest, ArgsCheck_CollNotReduce) +{ + RUN_ISOLATED_TEST( + "ArgsCheck_CollNotReduce", + []() + { + ArgCheckTestEnvironment env; + env.setup(); - ncclResult_t result = ArgsCheck(info); - EXPECT_EQ(result, ncclSuccess) << "Failed for coll != ncclFuncReduce"; + // Case: env.info->coll != ncclFuncReduce + env.info->coll = ncclFuncBroadcast; // Set coll to ncclFuncBroadcast + + ncclResult_t result = ArgsCheck(env.info); + EXPECT_EQ(result, ncclSuccess) << "Failed for coll != ncclFuncReduce"; + + env.cleanup(); + INFO("Test 'ArgsCheck_CollNotReduce' PASSED\n"); + } + ); } -TEST_F(ArgCheckTest, ArgsCheck_ReduceCollWithRootRank) { - // Case: info->coll == ncclFuncReduce and info->comm->rank == info->root - info->coll = ncclFuncReduce; // Set coll to ncclFuncReduce +TEST(ArgCheckTest, ArgsCheck_ReduceCollWithRootRank) +{ + RUN_ISOLATED_TEST( + "ArgsCheck_ReduceCollWithRootRank", + []() + { + ArgCheckTestEnvironment env; + env.setup(); - ncclResult_t result = ArgsCheck(info); - EXPECT_EQ(result, ncclSuccess) - << "Failed for coll == ncclFuncReduce and rank == root"; + // Case: env.info->coll == ncclFuncReduce and env.info->env.comm->rank == env.info->root + env.info->coll = ncclFuncReduce; // Set coll to ncclFuncReduce + + ncclResult_t result = ArgsCheck(env.info); + EXPECT_EQ(result, ncclSuccess) << "Failed for coll == ncclFuncReduce and rank == root"; + + env.cleanup(); + INFO("Test 'ArgsCheck_ReduceCollWithRootRank' PASSED\n"); + } + ); } -TEST_F(ArgCheckTest, 
ArgsCheck_ReduceCollWithNonRootRank) { - comm->rank = 1; // Set rank to 1 (non-root) +TEST(ArgCheckTest, ArgsCheck_ReduceCollWithNonRootRank) +{ + RUN_ISOLATED_TEST( + "ArgsCheck_ReduceCollWithNonRootRank", + []() + { + ArgCheckTestEnvironment env; + env.setup(); - ncclResult_t result = ArgsCheck(info); - EXPECT_EQ(result, ncclSuccess) - << "Failed for coll == ncclFuncReduce and rank != root"; + env.comm->rank = 1; // Set rank to 1 (non-root) + + ncclResult_t result = ArgsCheck(env.info); + EXPECT_EQ(result, ncclSuccess) << "Failed for coll == ncclFuncReduce and rank != root"; + + env.cleanup(); + INFO("Test 'ArgsCheck_ReduceCollWithNonRootRank' PASSED\n"); + } + ); } diff --git a/projects/rccl/test/CMakeLists.txt b/projects/rccl/test/CMakeLists.txt index 524eba13b9..6ee377974f 100644 --- a/projects/rccl/test/CMakeLists.txt +++ b/projects/rccl/test/CMakeLists.txt @@ -207,6 +207,7 @@ if(BUILD_TESTS) TransportTests.cpp common/main_fixtures.cpp common/EnvVars.cpp + common/ProcessIsolatedTestRunner.cpp graph/XmlTests.cpp ) diff --git a/projects/rccl/test/EnqueueTests.cpp b/projects/rccl/test/EnqueueTests.cpp index ad357e5b40..ab2c7af03e 100644 --- a/projects/rccl/test/EnqueueTests.cpp +++ b/projects/rccl/test/EnqueueTests.cpp @@ -4,27 +4,120 @@ * See LICENSE.txt for license information ************************************************************************/ #include -#include #include +#include + #include "comm.h" -#include "info.h" +#include "common/ProcessIsolatedTestRunner.hpp" #include "enqueue.h" +#include "info.h" #include "utils.h" -class EnqueueTests : public ::testing::Test { -protected: +namespace RcclUnitTesting +{ + +// Simple test kernel for validating ncclInitKernelsForDevice +__global__ void simpleTestKernel(int* data) +{ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if(data) + data[tid] = tid; +} + +// Helper function to test ncclInitKernelsForDevice with a real kernel +ncclResult_t testKernelAttributes(void* kernelFn, size_t* maxStackSize) +{ + 
if(!kernelFn || !maxStackSize) + return ncclInvalidArgument; + + *maxStackSize = 0; + hipFuncAttributes attr = {0}; + + hipError_t errcode = hipFuncGetAttributes(&attr, kernelFn); + if(errcode != hipSuccess) + return ncclSystemError; + + *maxStackSize = attr.localSizeBytes; + return ncclSuccess; // ncclSuccess +} + +// Helper function to test shared memory limit checking with a real kernel +// ncclMaxSharedMem: For gfx906 (cudaArch 906) with WarpSize 64, this is typically 32832 bytes +ncclResult_t testKernelSharedMemoryLimit( + void* kernelFn, int cudaArch, int maxSharedMem, size_t* maxStackSize, int ncclMaxSharedMem +) +{ + if(!kernelFn) + return ncclInvalidArgument; + + ncclResult_t result = ncclSuccess; + if(maxStackSize) + *maxStackSize = 0; + + hipFuncAttributes attr = {0}; + hipError_t errcode = hipFuncGetAttributes(&attr, kernelFn); + if(errcode != hipSuccess) + { + return ncclSystemError; + } + + if(maxStackSize) + { + *maxStackSize = attr.localSizeBytes; + } + + // Test the shared memory limit check (mimics enqueue.cc lines 135-146) + if(ncclMaxSharedMem != 0) + { + int sharedMemSize = ncclMaxSharedMem; + + if(sharedMemSize > (maxSharedMem - attr.sharedSizeBytes)) + { + WARN( + "cudaArch %d ncclMaxSharedMem %d exceeds device/fn maxSharedMem %zu", + cudaArch, + sharedMemSize, + maxSharedMem - attr.sharedSizeBytes + ); + return ncclSystemError; + } + } + + return result; +} + +// Helper structure to hold test environment +struct EnqueueTestEnvironment +{ ncclComm* comm; ncclInfo* info; - void* sendbuff; - void* recvbuff; - static uint32_t abortFlag0, abortFlag1; - static int abortFlagRefCount; + void* sendbuff; + void* recvbuff; + uint32_t abortFlag0; + uint32_t abortFlag1; + int abortFlagRefCount; - void SetUp() override { + EnqueueTestEnvironment() + : comm(nullptr) + , info(nullptr) + , sendbuff(nullptr) + , recvbuff(nullptr) + , abortFlag0(0) + , abortFlag1(0) + , abortFlagRefCount(0) + {} + + ~EnqueueTestEnvironment() + { + cleanup(); + } + + void 
setup() + { // Allocate GPU memory for buffers - size_t bufferSize = 1024 * sizeof(float); - hipError_t hipErr = hipMalloc(&sendbuff, bufferSize); + size_t bufferSize = 1024 * sizeof(float); + hipError_t hipErr = hipMalloc(&sendbuff, bufferSize); ASSERT_EQ(hipErr, hipSuccess) << "Failed to allocate sendbuff"; hipErr = hipMalloc(&recvbuff, bufferSize); @@ -34,17 +127,17 @@ protected: comm = new ncclComm(); memset(comm, 0, sizeof(ncclComm)); - comm->startMagic = NCCL_MAGIC; // 0x0280028002800280 + comm->startMagic = NCCL_MAGIC; // 0x0280028002800280 // Initialize critical fields - comm->rank = 0; - comm->nRanks = 2; - comm->cudaDev = 0; + comm->rank = 0; + comm->nRanks = 2; + comm->cudaDev = 0; comm->localRank = 0; // Initialize abort flags - comm->abortFlag = &abortFlag0; - comm->childAbortFlag = &abortFlag1; + comm->abortFlag = &abortFlag0; + comm->childAbortFlag = &abortFlag1; comm->abortFlagRefCount = &abortFlagRefCount; // Initialize memory stack @@ -53,15 +146,15 @@ protected: // Initialize intra-communication pointers comm->intraComm0 = nullptr; - comm->intraNext = nullptr; + comm->intraNext = nullptr; // Initialize work FIFO structures - comm->workFifoBytes = 1024; // Power of 2 - comm->workFifoBuf = nullptr; - comm->workFifoBufDev = nullptr; - comm->workFifoConsumed = 0; + comm->workFifoBytes = 1024; // Power of 2 + comm->workFifoBuf = nullptr; + comm->workFifoBufDev = nullptr; + comm->workFifoConsumed = 0; comm->workFifoProducedLastRecorded = 0; - comm->workFifoProduced = 0; + comm->workFifoProduced = 0; // Initialize planner memset(&comm->planner, 0, sizeof(comm->planner)); @@ -69,254 +162,556 @@ protected: // Initialize config memset(&comm->config, 0, sizeof(comm->config)); comm->config.blocking = 1; - comm->checkPointers = 0; // Disable pointer validation for easier testing + comm->checkPointers = 0; // Disable pointer validation for easier testing // Initialize peer info arrays comm->peerInfo = new ncclPeerInfo[comm->nRanks]; memset(comm->peerInfo, 0, 
comm->nRanks * sizeof(ncclPeerInfo)); comm->localRankToRank = new int[comm->nRanks]; - for (int i = 0; i < comm->nRanks; i++) { + for(int i = 0; i < comm->nRanks; i++) + { comm->localRankToRank[i] = i; } - comm->endMagic = NCCL_MAGIC; // 0x0280028002800280 + comm->endMagic = NCCL_MAGIC; // 0x0280028002800280 // Initialize operation info with valid GPU buffers info = new ncclInfo(); memset(info, 0, sizeof(ncclInfo)); - info->comm = comm; - info->opName = "AllReduce"; - info->count = 1024; + info->comm = comm; + info->opName = "AllReduce"; + info->count = 1024; info->datatype = ncclFloat; - info->op = ncclSum; - info->root = 0; - info->sendbuff = sendbuff; // Use allocated GPU memory - info->recvbuff = recvbuff; // Use allocated GPU memory - info->stream = nullptr; + info->op = ncclSum; + info->root = 0; + info->sendbuff = sendbuff; // Use allocated GPU memory + info->recvbuff = recvbuff; // Use allocated GPU memory + info->stream = nullptr; } - void TearDown() override { - if (sendbuff) { - hipFree(sendbuff); + void cleanup() + { + // Clean up info first (it references comm) + if(info) + { + delete info; + info = nullptr; } - if (recvbuff) { - hipFree(recvbuff); - } - if (comm) { + + // Clean up comm and its allocated resources + if(comm) + { + // Clean up memory stacks ncclMemoryStackDestruct(&comm->memScoped); ncclMemoryStackDestruct(&comm->memPermanent); - delete[] comm->peerInfo; - delete[] comm->localRankToRank; + + // Clean up peer info arrays + if(comm->peerInfo) + { + delete[] comm->peerInfo; + comm->peerInfo = nullptr; + } + + if(comm->localRankToRank) + { + delete[] comm->localRankToRank; + comm->localRankToRank = nullptr; + } + delete comm; + comm = nullptr; } - if (info) { - delete info; + + // Clean up GPU buffers last + if(sendbuff) + { + hipError_t err = hipFree(sendbuff); + if(err != hipSuccess) + { + // Log error but don't throw in cleanup + fprintf(stderr, "Warning: hipFree(sendbuff) failed with error %d\n", err); + } + sendbuff = nullptr; + } + + 
if(recvbuff) + { + hipError_t err = hipFree(recvbuff); + if(err != hipSuccess) + { + // Log error but don't throw in cleanup + fprintf(stderr, "Warning: hipFree(recvbuff) failed with error %d\n", err); + } + recvbuff = nullptr; } } }; -// Static member definitions -uint32_t EnqueueTests::abortFlag0 = 0; -uint32_t EnqueueTests::abortFlag1 = 0; -int EnqueueTests::abortFlagRefCount = 0; +// Empty test fixture for test organization +class EnqueueTests : public ::testing::Test +{ + // No setup/teardown - all tests use process isolation +}; // Test ncclInitKernelsForDevice function -TEST_F(EnqueueTests, ncclInitKernelsForDevice_ValidInput) { - size_t maxStackSize = 0; - ncclResult_t result = ncclInitKernelsForDevice(906, 65536, &maxStackSize); +TEST_F(EnqueueTests, ncclInitKernelsForDevice_ValidInput) +{ + ProcessIsolatedTestRunner::ExecutionOptions options; + options.stopOnFirstFailure = false; // Continue running all tests + options.verboseLogging = true; - EXPECT_TRUE(result == ncclSuccess); - EXPECT_GT(maxStackSize, 0); + RUN_ISOLATED_TESTS_WITH_OPTIONS( + options, + ProcessIsolatedTestRunner::TestConfig( + "ncclInitKernelsForDevice_ValidInput", + [this]() + { + size_t maxStackSize = 0; + ncclResult_t result = ncclInitKernelsForDevice(906, 65536, &maxStackSize); + + EXPECT_TRUE(result == ncclSuccess); + // maxStackSize should be set to a reasonable value (> 0) + EXPECT_GT(maxStackSize, 0) + << "Expected maxStackSize to be computed and set to a positive value"; + } + ).withEnvironment({{"NCCL_DEBUG", "INFO"}, {"NCCL_DEBUG_SUBSYS", "ALL"}}), + + ProcessIsolatedTestRunner::TestConfig( + "ncclInitKernelsForDevice_ValidInputCarveout", + [this]() + { + size_t maxStackSize = 0; + ncclResult_t result = ncclInitKernelsForDevice(906, 65536, &maxStackSize); + + EXPECT_TRUE(result == ncclSuccess); + // maxStackSize should be set to a reasonable value (> 0) + EXPECT_GT(maxStackSize, 0) + << "Expected maxStackSize to be computed and set to a positive value"; + } + ) + 
.withEnvironment( + {{"NCCL_L1_SHARED_MEMORY_CARVEOUT", "1"}, + {"NCCL_DEBUG", "INFO"}, + {"NCCL_DEBUG_SUBSYS", "ALL"}} + ) + ); } -TEST_F(EnqueueTests, ncclInitKernelsForDevice_NullStackSize) { - ncclResult_t result = ncclInitKernelsForDevice(906, 65536, nullptr); +TEST_F(EnqueueTests, ncclInitKernelsForDevice_NullStackSize) +{ + ProcessIsolatedTestRunner::ExecutionOptions options; + options.stopOnFirstFailure = false; + options.verboseLogging = true; - EXPECT_EQ(result, ncclSuccess); + RUN_ISOLATED_TESTS_WITH_OPTIONS( + options, + ProcessIsolatedTestRunner::TestConfig( + "ncclInitKernelsForDevice_NullStackSize", + []() + { + ncclResult_t result = ncclInitKernelsForDevice(906, 65536, nullptr); + EXPECT_EQ(result, ncclSuccess); + } + ) + ); } -TEST_F(EnqueueTests, ncclInitKernelsForDevice_InvalidArch) { - size_t maxStackSize = 0; - ncclResult_t result = ncclInitKernelsForDevice(-1, 65536, &maxStackSize); - EXPECT_EQ(result, ncclSuccess); +// Test with a real compiled kernel to verify attribute retrieval works correctly +TEST_F(EnqueueTests, KernelAttributes_WithRealKernel) +{ + ProcessIsolatedTestRunner::ExecutionOptions options; + options.stopOnFirstFailure = false; + options.verboseLogging = true; + RUN_ISOLATED_TESTS_WITH_OPTIONS( + options, + ProcessIsolatedTestRunner::TestConfig( + "KernelAttributes_WithRealKernel", + []() + { + size_t maxStackSize = 0; + ncclResult_t result = testKernelAttributes((void*)simpleTestKernel, &maxStackSize); + + EXPECT_EQ(result, ncclSuccess) + << "Expected successful kernel attribute retrieval with a real compiled kernel"; + } + ).withEnvironment({{"NCCL_DEBUG", "INFO"}}) + ); } -TEST_F(EnqueueTests, ncclInitKernelsForDevice_ExceedsSharedMemory) { - size_t maxStackSize = 0; +TEST_F(EnqueueTests, ncclInitKernelsForDevice_InvalidArch) +{ + ProcessIsolatedTestRunner::ExecutionOptions options; + options.stopOnFirstFailure = false; + options.verboseLogging = true; - ncclResult_t result = ncclInitKernelsForDevice(906, 32832, 
&maxStackSize); - EXPECT_TRUE(result == ncclSystemError); + RUN_ISOLATED_TESTS_WITH_OPTIONS( + options, + ProcessIsolatedTestRunner::TestConfig( + "ncclInitKernelsForDevice_InvalidArch", + []() + { + size_t maxStackSize = 0; + ncclResult_t result = ncclInitKernelsForDevice(-1, 65536, &maxStackSize); + EXPECT_EQ(result, ncclSuccess); + } + ) + ); +} + +TEST_F(EnqueueTests, ncclInitKernelsForDevice_ExceedsSharedMemory) +{ + ProcessIsolatedTestRunner::ExecutionOptions options; + options.stopOnFirstFailure = false; + options.verboseLogging = true; + + RUN_ISOLATED_TESTS_WITH_OPTIONS( + options, + ProcessIsolatedTestRunner::TestConfig( + "ncclInitKernelsForDevice_ExceedsSharedMemory", + []() + { + size_t maxStackSize = 0; + // For gfx906, ncclMaxSharedMem is 32832 (as shown in test output) + // Use a very small maxSharedMem (16000 bytes) to trigger the exceeds check + ncclResult_t result = testKernelSharedMemoryLimit( + (void*)simpleTestKernel, // Use our real compiled kernel + 906, // cudaArch + 16000, // maxSharedMem (intentionally too small) + &maxStackSize, + 32832 // ncclMaxSharedMem for gfx906 + ); + + EXPECT_EQ(result, ncclSystemError) + << "Expected ncclSystemError when ncclMaxSharedMem exceeds maxSharedMem"; + } + ).withEnvironment({{"NCCL_DEBUG", "WARN"}}) + ); } // Test ncclEnqueueCheck function -TEST_F(EnqueueTests, ncclEnqueueCheck_ValidInput) { - ncclResult_t result = ncclEnqueueCheck(info); - EXPECT_TRUE(result == ncclSuccess); +TEST_F(EnqueueTests, ncclEnqueueCheck_ValidInput) +{ + ProcessIsolatedTestRunner::ExecutionOptions options; + options.stopOnFirstFailure = false; + options.verboseLogging = true; + + RUN_ISOLATED_TESTS_WITH_OPTIONS( + options, + ProcessIsolatedTestRunner::TestConfig( + "ncclEnqueueCheck_ValidInput", + []() + { + EnqueueTestEnvironment env; + env.setup(); + ncclResult_t result = ncclEnqueueCheck(env.info); + EXPECT_TRUE(result == ncclSuccess); + env.cleanup(); + } + ) + ); } -TEST_F(EnqueueTests, ncclEnqueueCheck_InvalidComm) { - 
info->comm = nullptr; - ncclResult_t result = ncclEnqueueCheck(info); - EXPECT_EQ(result, ncclInvalidArgument); +TEST_F(EnqueueTests, ncclEnqueueCheck_InvalidComm) +{ + ProcessIsolatedTestRunner::ExecutionOptions options; + options.stopOnFirstFailure = false; + options.verboseLogging = true; + + RUN_ISOLATED_TESTS_WITH_OPTIONS( + options, + ProcessIsolatedTestRunner::TestConfig( + "ncclEnqueueCheck_InvalidComm", + []() + { + EnqueueTestEnvironment env; + env.setup(); + env.info->comm = nullptr; + ncclResult_t result = ncclEnqueueCheck(env.info); + EXPECT_EQ(result, ncclInvalidArgument); + env.cleanup(); + } + ) + ); } -TEST_F(EnqueueTests, ncclEnqueueCheck_InvalidBuffers) { - // Test with null sendbuff - comm->checkPointers = 1; - info->sendbuff = nullptr; - ncclResult_t result = ncclEnqueueCheck(info); - EXPECT_EQ(result, ncclInvalidArgument); +TEST_F(EnqueueTests, ncclEnqueueCheck_InvalidBuffers) +{ + ProcessIsolatedTestRunner::ExecutionOptions options; + options.stopOnFirstFailure = false; + options.verboseLogging = true; - // Reset sendbuff and test with null recvbuff - info->sendbuff = sendbuff; - info->recvbuff = nullptr; - result = ncclEnqueueCheck(info); - EXPECT_EQ(result, ncclInvalidArgument); + RUN_ISOLATED_TESTS_WITH_OPTIONS( + options, + ProcessIsolatedTestRunner::TestConfig( + "ncclEnqueueCheck_InvalidBuffers", + []() + { + EnqueueTestEnvironment env; + env.setup(); + + // Test with null sendbuff + env.comm->checkPointers = 1; + env.info->sendbuff = nullptr; + ncclResult_t result = ncclEnqueueCheck(env.info); + EXPECT_EQ(result, ncclInvalidArgument); + + // Reset sendbuff and test with null recvbuff + env.info->sendbuff = env.sendbuff; + env.info->recvbuff = nullptr; + result = ncclEnqueueCheck(env.info); + EXPECT_EQ(result, ncclInvalidArgument); + + env.cleanup(); + } + ) + ); } // Test ncclFuncSendCount function -TEST_F(EnqueueTests, ncclFuncSendCount_AllReduce) { - size_t count = 1000; - int nRanks = 4; +TEST_F(EnqueueTests, 
ncclFuncSendCount_AllTests) +{ + ProcessIsolatedTestRunner::ExecutionOptions options; + options.stopOnFirstFailure = false; + options.verboseLogging = true; - size_t result = ncclFuncSendCount(ncclFuncAllReduce, nRanks, count); - EXPECT_EQ(result, count); -} + RUN_ISOLATED_TESTS_WITH_OPTIONS( + options, + ProcessIsolatedTestRunner::TestConfig( + "ncclFuncSendCount_AllReduce", + []() + { + size_t count = 1000; + int nRanks = 4; + size_t result = ncclFuncSendCount(ncclFuncAllReduce, nRanks, count); + EXPECT_EQ(result, count); + } + ), -TEST_F(EnqueueTests, ncclFuncSendCount_Broadcast) { - size_t count = 1000; - int nRanks = 4; + ProcessIsolatedTestRunner::TestConfig( + "ncclFuncSendCount_Broadcast", + []() + { + size_t count = 1000; + int nRanks = 4; + size_t result = ncclFuncSendCount(ncclFuncBroadcast, nRanks, count); + EXPECT_EQ(result, count); + } + ), - size_t result = ncclFuncSendCount(ncclFuncBroadcast, nRanks, count); - EXPECT_EQ(result, count); -} + ProcessIsolatedTestRunner::TestConfig( + "ncclFuncSendCount_Reduce", + []() + { + size_t count = 1000; + int nRanks = 4; + size_t result = ncclFuncSendCount(ncclFuncReduce, nRanks, count); + EXPECT_EQ(result, count); + } + ), -TEST_F(EnqueueTests, ncclFuncSendCount_Reduce) { - size_t count = 1000; - int nRanks = 4; + ProcessIsolatedTestRunner::TestConfig( + "ncclFuncSendCount_AllGather", + []() + { + size_t count = 1000; + int nRanks = 4; + size_t result = ncclFuncSendCount(ncclFuncAllGather, nRanks, count); + EXPECT_EQ(result, count); + } + ), - size_t result = ncclFuncSendCount(ncclFuncReduce, nRanks, count); - EXPECT_EQ(result, count); -} + ProcessIsolatedTestRunner::TestConfig( + "ncclFuncSendCount_ReduceScatter", + []() + { + size_t count = 1000; + int nRanks = 4; + size_t result = ncclFuncSendCount(ncclFuncReduceScatter, nRanks, count); + EXPECT_EQ(result, count * nRanks); + } + ), -TEST_F(EnqueueTests, ncclFuncSendCount_AllGather) { - size_t count = 1000; - int nRanks = 4; - - size_t result = 
ncclFuncSendCount(ncclFuncAllGather, nRanks, count); - EXPECT_EQ(result, count); -} - -TEST_F(EnqueueTests, ncclFuncSendCount_ReduceScatter) { - size_t count = 1000; - int nRanks = 4; - - size_t result = ncclFuncSendCount(ncclFuncReduceScatter, nRanks, count); - EXPECT_EQ(result, count * nRanks); -} - -TEST_F(EnqueueTests, ncclFuncSendCount_ZeroCount) { - size_t result = ncclFuncSendCount(ncclFuncAllReduce, 4, 0); - EXPECT_EQ(result, 0); + ProcessIsolatedTestRunner::TestConfig( + "ncclFuncSendCount_ZeroCount", + []() + { + size_t result = ncclFuncSendCount(ncclFuncAllReduce, 4, 0); + EXPECT_EQ(result, 0); + } + ) + ); } // Test ncclFuncRecvCount function -TEST_F(EnqueueTests, ncclFuncRecvCount_AllReduce) { - size_t count = 1000; - int nRanks = 4; +TEST_F(EnqueueTests, ncclFuncRecvCount_AllTests) +{ + ProcessIsolatedTestRunner::ExecutionOptions options; + options.stopOnFirstFailure = false; + options.verboseLogging = true; - size_t result = ncclFuncRecvCount(ncclFuncAllReduce, nRanks, count); - EXPECT_EQ(result, count); -} + RUN_ISOLATED_TESTS_WITH_OPTIONS( + options, + ProcessIsolatedTestRunner::TestConfig( + "ncclFuncRecvCount_AllReduce", + []() + { + size_t count = 1000; + int nRanks = 4; + size_t result = ncclFuncRecvCount(ncclFuncAllReduce, nRanks, count); + EXPECT_EQ(result, count); + } + ), -TEST_F(EnqueueTests, ncclFuncRecvCount_Broadcast) { - size_t count = 1000; - int nRanks = 4; + ProcessIsolatedTestRunner::TestConfig( + "ncclFuncRecvCount_Broadcast", + []() + { + size_t count = 1000; + int nRanks = 4; + size_t result = ncclFuncRecvCount(ncclFuncBroadcast, nRanks, count); + EXPECT_EQ(result, count); + } + ), - size_t result = ncclFuncRecvCount(ncclFuncBroadcast, nRanks, count); - EXPECT_EQ(result, count); -} + ProcessIsolatedTestRunner::TestConfig( + "ncclFuncRecvCount_Reduce", + []() + { + size_t count = 1000; + int nRanks = 4; + size_t result = ncclFuncRecvCount(ncclFuncReduce, nRanks, count); + EXPECT_EQ(result, count); + } + ), -TEST_F(EnqueueTests, 
ncclFuncRecvCount_Reduce) { - size_t count = 1000; - int nRanks = 4; + ProcessIsolatedTestRunner::TestConfig( + "ncclFuncRecvCount_AllGather", + []() + { + size_t count = 1000; + int nRanks = 4; + size_t result = ncclFuncRecvCount(ncclFuncAllGather, nRanks, count); + EXPECT_EQ(result, count * nRanks); + } + ), - size_t result = ncclFuncRecvCount(ncclFuncReduce, nRanks, count); - EXPECT_EQ(result, count); -} + ProcessIsolatedTestRunner::TestConfig( + "ncclFuncRecvCount_ReduceScatter", + []() + { + size_t count = 1000; + int nRanks = 4; + size_t result = ncclFuncRecvCount(ncclFuncReduceScatter, nRanks, count); + EXPECT_EQ(result, count); + } + ), -TEST_F(EnqueueTests, ncclFuncRecvCount_AllGather) { - size_t count = 1000; - int nRanks = 4; - - size_t result = ncclFuncRecvCount(ncclFuncAllGather, nRanks, count); - EXPECT_EQ(result, count * nRanks); -} - -TEST_F(EnqueueTests, ncclFuncRecvCount_ReduceScatter) { - size_t count = 1000; - int nRanks = 4; - - size_t result = ncclFuncRecvCount(ncclFuncReduceScatter, nRanks, count); - EXPECT_EQ(result, count); -} - -TEST_F(EnqueueTests, ncclFuncRecvCount_ZeroCount) { - size_t result = ncclFuncRecvCount(ncclFuncAllReduce, 4, 0); - EXPECT_EQ(result, 0); + ProcessIsolatedTestRunner::TestConfig( + "ncclFuncRecvCount_ZeroCount", + []() + { + size_t result = ncclFuncRecvCount(ncclFuncAllReduce, 4, 0); + EXPECT_EQ(result, 0); + } + ) + ); } // Test ncclFuncMaxSendRecvCount function -TEST_F(EnqueueTests, ncclFuncMaxSendRecvCount_AllReduce) { - size_t count = 1000; - int nRanks = 4; +TEST_F(EnqueueTests, ncclFuncMaxSendRecvCount_AllTests) +{ + ProcessIsolatedTestRunner::ExecutionOptions options; + options.stopOnFirstFailure = false; + options.verboseLogging = true; - size_t result = ncclFuncMaxSendRecvCount(ncclFuncAllReduce, nRanks, count); - EXPECT_EQ(result, count); -} + RUN_ISOLATED_TESTS_WITH_OPTIONS( + options, + ProcessIsolatedTestRunner::TestConfig( + "ncclFuncMaxSendRecvCount_AllReduce", + []() + { + size_t count = 1000; + int 
nRanks = 4; + size_t result = ncclFuncMaxSendRecvCount(ncclFuncAllReduce, nRanks, count); + EXPECT_EQ(result, count); + } + ), -TEST_F(EnqueueTests, ncclFuncMaxSendRecvCount_AllGather) { - size_t count = 1000; - int nRanks = 4; + ProcessIsolatedTestRunner::TestConfig( + "ncclFuncMaxSendRecvCount_AllGather", + []() + { + size_t count = 1000; + int nRanks = 4; + size_t result = ncclFuncMaxSendRecvCount(ncclFuncAllGather, nRanks, count); + // For AllGather, receive count (count * nRanks) is larger than send count (count) + EXPECT_EQ(result, count * nRanks); + } + ), - size_t result = ncclFuncMaxSendRecvCount(ncclFuncAllGather, nRanks, count); - // For AllGather, receive count (count * nRanks) is larger than send count (count) - EXPECT_EQ(result, count * nRanks); -} + ProcessIsolatedTestRunner::TestConfig( + "ncclFuncMaxSendRecvCount_ReduceScatter", + []() + { + size_t count = 1000; + int nRanks = 4; + size_t result = ncclFuncMaxSendRecvCount(ncclFuncReduceScatter, nRanks, count); + // For ReduceScatter, send count (count * nRanks) is larger than receive count (count) + EXPECT_EQ(result, count * nRanks); + } + ), -TEST_F(EnqueueTests, ncclFuncMaxSendRecvCount_ReduceScatter) { - size_t count = 1000; - int nRanks = 4; - - size_t result = ncclFuncMaxSendRecvCount(ncclFuncReduceScatter, nRanks, count); - // For ReduceScatter, send count (count) is larger than receive count (count/nRanks) - EXPECT_EQ(result, count * nRanks); -} - -TEST_F(EnqueueTests, ncclFuncMaxSendRecvCount_ZeroCount) { - size_t result = ncclFuncMaxSendRecvCount(ncclFuncAllReduce, 4, 0); - EXPECT_EQ(result, 0); + ProcessIsolatedTestRunner::TestConfig( + "ncclFuncMaxSendRecvCount_ZeroCount", + []() + { + size_t result = ncclFuncMaxSendRecvCount(ncclFuncAllReduce, 4, 0); + EXPECT_EQ(result, 0); + } + ) + ); } // Edge case tests -TEST_F(EnqueueTests, ncclFuncCounts_SingleRank) { - size_t count = 1000; - int nRanks = 1; +TEST_F(EnqueueTests, ncclFuncCounts_EdgeCases) +{ + 
ProcessIsolatedTestRunner::ExecutionOptions options; + options.stopOnFirstFailure = false; + options.verboseLogging = true; - // Test with single rank - EXPECT_EQ(ncclFuncSendCount(ncclFuncAllReduce, nRanks, count), count); - EXPECT_EQ(ncclFuncRecvCount(ncclFuncAllReduce, nRanks, count), count); - EXPECT_EQ(ncclFuncMaxSendRecvCount(ncclFuncAllReduce, nRanks, count), count); + RUN_ISOLATED_TESTS_WITH_OPTIONS( + options, + ProcessIsolatedTestRunner::TestConfig( + "ncclFuncCounts_SingleRank", + []() + { + size_t count = 1000; + int nRanks = 1; + // Test with single rank + EXPECT_EQ(ncclFuncSendCount(ncclFuncAllReduce, nRanks, count), count); + EXPECT_EQ(ncclFuncRecvCount(ncclFuncAllReduce, nRanks, count), count); + EXPECT_EQ(ncclFuncMaxSendRecvCount(ncclFuncAllReduce, nRanks, count), count); + } + ), + + ProcessIsolatedTestRunner::TestConfig( + "ncclFuncCounts_LargeRankCount", + []() + { + size_t count = 1000; + int nRanks = 1024; + // Test with large number of ranks + EXPECT_EQ(ncclFuncSendCount(ncclFuncAllGather, nRanks, count), count); + EXPECT_EQ(ncclFuncRecvCount(ncclFuncAllGather, nRanks, count), count * nRanks); + EXPECT_EQ( + ncclFuncMaxSendRecvCount(ncclFuncAllGather, nRanks, count), + count * nRanks + ); + } + ) + ); } -TEST_F(EnqueueTests, ncclFuncCounts_LargeRankCount) { - size_t count = 1000; - int nRanks = 1024; - - // Test with large number of ranks - EXPECT_EQ(ncclFuncSendCount(ncclFuncAllGather, nRanks, count), count); - EXPECT_EQ(ncclFuncRecvCount(ncclFuncAllGather, nRanks, count), count * nRanks); - EXPECT_EQ(ncclFuncMaxSendRecvCount(ncclFuncAllGather, nRanks, count), count * nRanks); -} +} // namespace RcclUnitTesting \ No newline at end of file diff --git a/projects/rccl/test/NetSocketTests.cpp b/projects/rccl/test/NetSocketTests.cpp index f02bd234ae..2d4a1897ef 100644 --- a/projects/rccl/test/NetSocketTests.cpp +++ b/projects/rccl/test/NetSocketTests.cpp @@ -3,8 +3,8 @@ * * See LICENSE.txt for license information 
************************************************************************/ - #include "net.h" +#include "common/ProcessIsolatedTestRunner.hpp" #include "gtest/gtest.h" #include #include @@ -612,6 +612,169 @@ protected: return static_cast(result); } + void RunConcurrentOperationsTaskCreationWithEnvVars() { + INFO(NCCL_LOG_INFO, "Checking socket configuration environment variables"); + + // Check if the required environment variables are set + const char *nThreadsEnv = getenv("NCCL_SOCKET_NTHREADS"); + const char *nSocksPerThreadEnv = getenv("NCCL_NSOCKS_PERTHREAD"); + + if (!nThreadsEnv || !nSocksPerThreadEnv) { + GTEST_SKIP() << "SKIPPING TEST: Required environment variables not set. " + << "Please set the following environment variables to run this test: " + << "export NCCL_SOCKET_NTHREADS=1 and export NCCL_NSOCKS_PERTHREAD=2. " + << "This ensures nSocks > 0 so that ncclNetSocketGetTask gets called. " + << "Environment variables NCCL_SOCKET_NTHREADS and NCCL_NSOCKS_PERTHREAD must be set"; + return; + } + + int nThreads = ParseEnvVar(nThreadsEnv, "NCCL_SOCKET_NTHREADS", 0, 1); + int nSocksPerThread = ParseEnvVar(nSocksPerThreadEnv, "NCCL_NSOCKS_PERTHREAD", 0, 1); + + // Additional validation for reasonable upper bounds + const int MAX_THREADS = 16; + const int MAX_SOCKS_PER_THREAD = 64; + const int MAX_TOTAL_SOCKETS = 64; + + if (nThreads > MAX_THREADS) { + GTEST_SKIP() << "SKIPPING TEST: NCCL_SOCKET_NTHREADS=" << nThreads << " exceeds maximum " << MAX_THREADS << ". " + << "Please provide a reasonable value (e.g., NCCL_SOCKET_NTHREADS=8). " + << "Values too large may cause resource exhaustion."; + return; + } + + if (nSocksPerThread > MAX_SOCKS_PER_THREAD) { + GTEST_SKIP() << "SKIPPING TEST: NCCL_NSOCKS_PERTHREAD=" << nSocksPerThread << " exceeds maximum " << MAX_SOCKS_PER_THREAD << ". " + << "Please provide a reasonable value (e.g., NCCL_NSOCKS_PERTHREAD=4). 
" + << "Values too large may cause resource exhaustion."; + return; + } + + // Check for potential overflow before multiplication + if (nThreads > 0 && nSocksPerThread > INT_MAX / nThreads) { + GTEST_SKIP() << "SKIPPING TEST: Configuration would cause integer overflow. " + << "NCCL_SOCKET_NTHREADS=" << nThreads << " * NCCL_NSOCKS_PERTHREAD=" << nSocksPerThread + << " exceeds maximum integer value. Please use smaller values."; + return; + } + + int totalSockets = nThreads * nSocksPerThread; + + INFO(NCCL_LOG_INFO, "Environment configuration found:"); + INFO(NCCL_LOG_INFO, " NCCL_SOCKET_NTHREADS=%d", nThreads); + INFO(NCCL_LOG_INFO, " NCCL_NSOCKS_PERTHREAD=%d", nSocksPerThread); + INFO(NCCL_LOG_INFO, " Total sockets=%d", totalSockets); + + // Validate total sockets count + if (totalSockets <= 0) { + GTEST_SKIP() << "SKIPPING TEST: Invalid configuration - total sockets must be > 0. " + << "Current configuration: nThreads=" << nThreads << " * nSocksPerThread=" << nSocksPerThread + << " = " << totalSockets << ". " + << "Both NCCL_SOCKET_NTHREADS and NCCL_NSOCKS_PERTHREAD must be positive integers. " + << "Example: export NCCL_SOCKET_NTHREADS=2 && export NCCL_NSOCKS_PERTHREAD=2"; + return; + } + + if (totalSockets > MAX_TOTAL_SOCKETS) { + GTEST_SKIP() << "SKIPPING TEST: Total sockets " << totalSockets << " exceeds maximum " << MAX_TOTAL_SOCKETS << ". " + << "Current configuration: nThreads=" << nThreads << " * nSocksPerThread=" << nSocksPerThread + << " = " << totalSockets << ". " + << "Please reduce either NCCL_SOCKET_NTHREADS or NCCL_NSOCKS_PERTHREAD. " + << "Example: export NCCL_SOCKET_NTHREADS=8 && export NCCL_NSOCKS_PERTHREAD=4"; + return; + } + + if (totalSockets > NCCL_NET_MAX_REQUESTS) { + GTEST_SKIP() << "SKIPPING TEST: Total sockets " << totalSockets << " exceeds NCCL_NET_MAX_REQUESTS=" << NCCL_NET_MAX_REQUESTS << ". " + << "Current configuration: nThreads=" << nThreads << " * nSocksPerThread=" << nSocksPerThread + << " = " << totalSockets << ". 
" + << "NCCL network layer can handle at most " << NCCL_NET_MAX_REQUESTS << " concurrent requests. " + << "Please reduce configuration to stay within NCCL limits."; + return; + } + + INFO(NCCL_LOG_INFO, "Configuration valid - proceeding with test to exercise " + "ncclNetSocketGetTask"); + + // Test socket properties + TestSocketProperties(); + + char handle[NCCL_NET_HANDLE_MAXSIZE]; + void *listenComm = nullptr; + + ncclResult_t result = ncclNetSocket.listen(0, handle, &listenComm); + ASSERT_EQ(result, ncclSuccess) << "Failed to establish listening socket for test execution. " + << "ncclNetSocket.listen() returned error code: " << result + << ". Verify network device availability and port accessibility."; + + INFO(NCCL_LOG_INFO, "Testing task creation functionality - ensuring " + "ncclNetSocketGetTask is called"); + + std::vector sendComms; + std::vector recvComms; + + // Establish connection + void *sendComm = nullptr; + void *recvComm = nullptr; + bool connectionSuccess = + EstablishConnectionPair(handle, listenComm, sendComm, recvComm); + + if (connectionSuccess) { + sendComms.push_back(sendComm); + recvComms.push_back(recvComm); + + // Test with buffer sizes that will trigger task subdivision + std::vector testSizes = GetTestSizes(); + + for (size_t testSize : testSizes) { + INFO(NCCL_LOG_INFO, + "\n=== Testing with buffer size: %zu bytes ===", testSize); + INFO(NCCL_LOG_INFO, "This should trigger ncclNetSocketGetTask to create " + "task subdivision"); + + std::vector sendMhandles; + std::vector recvMhandles; + std::vector sendRequests; + std::vector recvRequests; + std::vector> sendBuffers; + std::vector> recvBuffers; + + // Setup operations for this test size + bool setupSuccess = SetupOperationsForSize( + sendComm, recvComm, testSize, sendBuffers, recvBuffers, sendMhandles, + recvMhandles, sendRequests, recvRequests, 0xAB); + + if (setupSuccess) { + // Progress operations with context about environment variables + ProgressOperations(sendRequests[0], 
recvRequests[0], testSize, + " (with nSocks > 0 from environment variables)"); + } else { + INFO(NCCL_LOG_INFO, + "No operations started - skipping progress testing for size %zu", + testSize); + } + + // Deregister memory + DeregisterMemory(sendComm, recvComm, sendMhandles, recvMhandles, + testSize); + + INFO(NCCL_LOG_INFO, + "=== Completed testing for buffer size: %zu bytes ===", testSize); + } + + INFO(NCCL_LOG_INFO, "\n*** TEST SUCCESS: ncclNetSocketGetTask was " + "successfully exercised! ***"); + } else { + INFO(NCCL_LOG_INFO, "No connections established - test passed (network may " + "not be available)"); + } + + // Cleanup + CleanupCommunicators(sendComms, recvComms, listenComm); + INFO(NCCL_LOG_INFO, + "TestConcurrentOperationsTaskCreation completed successfully"); + } + }; // Test concurrent operations task creation in default configuration (without @@ -709,166 +872,19 @@ TEST_F(NetSocketTests, TestConcurrentOperationsTaskCreationDefault) { // Test multiple concurrent operations to stress test task creation TEST_F(NetSocketTests, TestConcurrentOperationsTaskCreation) { - INFO(NCCL_LOG_INFO, "Checking socket configuration environment variables"); + ProcessIsolatedTestRunner::ExecutionOptions options; + options.stopOnFirstFailure = false; // Continue running all tests + options.verboseLogging = true; - // Check if the required environment variables are set - const char *nThreadsEnv = getenv("NCCL_SOCKET_NTHREADS"); - const char *nSocksPerThreadEnv = getenv("NCCL_NSOCKS_PERTHREAD"); - - if (!nThreadsEnv || !nSocksPerThreadEnv) { - GTEST_SKIP() << "SKIPPING TEST: Required environment variables not set. " - << "Please set the following environment variables to run this test: " - << "export NCCL_SOCKET_NTHREADS=1 and export NCCL_NSOCKS_PERTHREAD=2. " - << "This ensures nSocks > 0 so that ncclNetSocketGetTask gets called. 
" - << "Environment variables NCCL_SOCKET_NTHREADS and NCCL_NSOCKS_PERTHREAD must be set"; - return; - } - - int nThreads = ParseEnvVar(nThreadsEnv, "NCCL_SOCKET_NTHREADS", 0, 1); - int nSocksPerThread = ParseEnvVar(nSocksPerThreadEnv, "NCCL_NSOCKS_PERTHREAD", 0, 1); - - // Additional validation for reasonable upper bounds - const int MAX_THREADS = 16; - const int MAX_SOCKS_PER_THREAD = 64; - const int MAX_TOTAL_SOCKETS = 64; - - if (nThreads > MAX_THREADS) { - GTEST_SKIP() << "SKIPPING TEST: NCCL_SOCKET_NTHREADS=" << nThreads << " exceeds maximum " << MAX_THREADS << ". " - << "Please provide a reasonable value (e.g., NCCL_SOCKET_NTHREADS=8). " - << "Values too large may cause resource exhaustion."; - return; - } - - if (nSocksPerThread > MAX_SOCKS_PER_THREAD) { - GTEST_SKIP() << "SKIPPING TEST: NCCL_NSOCKS_PERTHREAD=" << nSocksPerThread << " exceeds maximum " << MAX_SOCKS_PER_THREAD << ". " - << "Please provide a reasonable value (e.g., NCCL_NSOCKS_PERTHREAD=4). " - << "Values too large may cause resource exhaustion."; - return; - } - - // Check for potential overflow before multiplication - if (nThreads > 0 && nSocksPerThread > INT_MAX / nThreads) { - GTEST_SKIP() << "SKIPPING TEST: Configuration would cause integer overflow. " - << "NCCL_SOCKET_NTHREADS=" << nThreads << " * NCCL_NSOCKS_PERTHREAD=" << nSocksPerThread - << " exceeds maximum integer value. Please use smaller values."; - return; - } - - int totalSockets = nThreads * nSocksPerThread; - - INFO(NCCL_LOG_INFO, "Environment configuration found:"); - INFO(NCCL_LOG_INFO, " NCCL_SOCKET_NTHREADS=%d", nThreads); - INFO(NCCL_LOG_INFO, " NCCL_NSOCKS_PERTHREAD=%d", nSocksPerThread); - INFO(NCCL_LOG_INFO, " Total sockets=%d", totalSockets); - - // Validate total sockets count - if (totalSockets <= 0) { - GTEST_SKIP() << "SKIPPING TEST: Invalid configuration - total sockets must be > 0. 
" - << "Current configuration: nThreads=" << nThreads << " * nSocksPerThread=" << nSocksPerThread - << " = " << totalSockets << ". " - << "Both NCCL_SOCKET_NTHREADS and NCCL_NSOCKS_PERTHREAD must be positive integers. " - << "Example: export NCCL_SOCKET_NTHREADS=2 && export NCCL_NSOCKS_PERTHREAD=2"; - return; - } - - if (totalSockets > MAX_TOTAL_SOCKETS) { - GTEST_SKIP() << "SKIPPING TEST: Total sockets " << totalSockets << " exceeds maximum " << MAX_TOTAL_SOCKETS << ". " - << "Current configuration: nThreads=" << nThreads << " * nSocksPerThread=" << nSocksPerThread - << " = " << totalSockets << ". " - << "Please reduce either NCCL_SOCKET_NTHREADS or NCCL_NSOCKS_PERTHREAD. " - << "Example: export NCCL_SOCKET_NTHREADS=8 && export NCCL_NSOCKS_PERTHREAD=4"; - return; - } - - if (totalSockets > NCCL_NET_MAX_REQUESTS) { - GTEST_SKIP() << "SKIPPING TEST: Total sockets " << totalSockets << " exceeds NCCL_NET_MAX_REQUESTS=" << NCCL_NET_MAX_REQUESTS << ". " - << "Current configuration: nThreads=" << nThreads << " * nSocksPerThread=" << nSocksPerThread - << " = " << totalSockets << ". " - << "NCCL network layer can handle at most " << NCCL_NET_MAX_REQUESTS << " concurrent requests. " - << "Please reduce configuration to stay within NCCL limits."; - return; - } - - INFO(NCCL_LOG_INFO, "Configuration valid - proceeding with test to exercise " - "ncclNetSocketGetTask"); - - // Test socket properties - TestSocketProperties(); - - char handle[NCCL_NET_HANDLE_MAXSIZE]; - void *listenComm = nullptr; - - ncclResult_t result = ncclNetSocket.listen(0, handle, &listenComm); - ASSERT_EQ(result, ncclSuccess) << "Failed to establish listening socket for test execution. " - << "ncclNetSocket.listen() returned error code: " << result - << ". 
Verify network device availability and port accessibility."; - - INFO(NCCL_LOG_INFO, "Testing task creation functionality - ensuring " - "ncclNetSocketGetTask is called"); - - std::vector sendComms; - std::vector recvComms; - - // Establish connection - void *sendComm = nullptr; - void *recvComm = nullptr; - bool connectionSuccess = - EstablishConnectionPair(handle, listenComm, sendComm, recvComm); - - if (connectionSuccess) { - sendComms.push_back(sendComm); - recvComms.push_back(recvComm); - - // Test with buffer sizes that will trigger task subdivision - std::vector testSizes = GetTestSizes(); - - for (size_t testSize : testSizes) { - INFO(NCCL_LOG_INFO, - "\n=== Testing with buffer size: %zu bytes ===", testSize); - INFO(NCCL_LOG_INFO, "This should trigger ncclNetSocketGetTask to create " - "task subdivision"); - - std::vector sendMhandles; - std::vector recvMhandles; - std::vector sendRequests; - std::vector recvRequests; - std::vector> sendBuffers; - std::vector> recvBuffers; - - // Setup operations for this test size - bool setupSuccess = SetupOperationsForSize( - sendComm, recvComm, testSize, sendBuffers, recvBuffers, sendMhandles, - recvMhandles, sendRequests, recvRequests, 0xAB); - - if (setupSuccess) { - // Progress operations with context about environment variables - ProgressOperations(sendRequests[0], recvRequests[0], testSize, - " (with nSocks > 0 from environment variables)"); - } else { - INFO(NCCL_LOG_INFO, - "No operations started - skipping progress testing for size %zu", - testSize); - } - - // Deregister memory - DeregisterMemory(sendComm, recvComm, sendMhandles, recvMhandles, - testSize); - - INFO(NCCL_LOG_INFO, - "=== Completed testing for buffer size: %zu bytes ===", testSize); - } - - INFO(NCCL_LOG_INFO, "\n*** TEST SUCCESS: ncclNetSocketGetTask was " - "successfully exercised! 
***"); - } else { - INFO(NCCL_LOG_INFO, "No connections established - test passed (network may " - "not be available)"); - } - - // Cleanup - CleanupCommunicators(sendComms, recvComms, listenComm); - INFO(NCCL_LOG_INFO, - "TestConcurrentOperationsTaskCreation completed successfully"); + RUN_ISOLATED_TESTS_WITH_OPTIONS(options, + ProcessIsolatedTestRunner::TestConfig( + "TestConcurrentOperationsTaskCreation", + [this]() { RunConcurrentOperationsTaskCreationWithEnvVars(); }) + .withEnvironment({{"NCCL_SOCKET_NTHREADS", "1"}, + {"NCCL_NSOCKS_PERTHREAD", "2"}, + {"NCCL_DEBUG", "TRACE"}, + {"NCCL_DEBUG_SUBSYS", "ALL"}}) + ); } // Test for invalid device index in listen function @@ -1079,158 +1095,239 @@ TEST_F(NetSocketTests, TestNonHostMemoryRegMr) { // Test for excessive thread configuration warning TEST_F(NetSocketTests, TestExcessiveThreadConfig) { - INFO(NCCL_LOG_INFO, "Testing excessive thread configuration warning"); + ProcessIsolatedTestRunner::ExecutionOptions options; + options.stopOnFirstFailure = false; // Continue running all tests + options.verboseLogging = true; - // Check if the required environment variables are set - const char *nThreadsEnv = getenv("NCCL_SOCKET_NTHREADS"); - const char *nSocksPerThreadEnv = getenv("NCCL_NSOCKS_PERTHREAD"); + RUN_ISOLATED_TESTS_WITH_OPTIONS(options, + ProcessIsolatedTestRunner::TestConfig( + "TestExcessiveThreadConfig", + [this]() { + INFO(NCCL_LOG_INFO, + "Testing excessive thread configuration warning"); - if (!nThreadsEnv || !nSocksPerThreadEnv) { - GTEST_SKIP() << "SKIPPING TEST: Required environment variables not set. " - << "This test requires NCCL_SOCKET_NTHREADS > NCCL_NET_MAX_REQUESTS (" << NCCL_NET_MAX_REQUESTS << ") and NCCL_NSOCKS_PERTHREAD = 1 to trigger warning. 
" - << "Environment variables NCCL_SOCKET_NTHREADS and NCCL_NSOCKS_PERTHREAD must be set"; - return; - } + // Check if the required environment variables are set + const char *nThreadsEnv = getenv("NCCL_SOCKET_NTHREADS"); + const char *nSocksPerThreadEnv = getenv("NCCL_NSOCKS_PERTHREAD"); - // Parse with validation - both must be positive - int nThreads = ParseEnvVar(nThreadsEnv, "NCCL_SOCKET_NTHREADS", 0, 1); - int nSocksPerThread = ParseEnvVar(nSocksPerThreadEnv, "NCCL_NSOCKS_PERTHREAD", 0, 1); + if (!nThreadsEnv || !nSocksPerThreadEnv) { + GTEST_SKIP() + << "SKIPPING TEST: Required environment variables not set. " + << "This test requires NCCL_SOCKET_NTHREADS > " + "NCCL_NET_MAX_REQUESTS (" + << NCCL_NET_MAX_REQUESTS + << ") and NCCL_NSOCKS_PERTHREAD = 1 to trigger warning. " + << "Environment variables NCCL_SOCKET_NTHREADS and " + "NCCL_NSOCKS_PERTHREAD must be set"; + return; + } - // Check for potential overflow before multiplication - if (nThreads > 0 && nSocksPerThread > INT_MAX / nThreads) { - GTEST_SKIP() << "SKIPPING TEST: Configuration would cause integer overflow. " - << "NCCL_SOCKET_NTHREADS=" << nThreads << " * NCCL_NSOCKS_PERTHREAD=" << nSocksPerThread - << " exceeds maximum integer value. Please use smaller values."; - return; - } + // Parse with validation - both must be positive + int nThreads = + ParseEnvVar(nThreadsEnv, "NCCL_SOCKET_NTHREADS", 0, 1); + int nSocksPerThread = + ParseEnvVar(nSocksPerThreadEnv, "NCCL_NSOCKS_PERTHREAD", 0, 1); - int totalSockets = nThreads * nSocksPerThread; + // Check for potential overflow before multiplication + if (nThreads > 0 && nSocksPerThread > INT_MAX / nThreads) { + GTEST_SKIP() << "SKIPPING TEST: Configuration would cause " + "integer overflow. " + << "NCCL_SOCKET_NTHREADS=" << nThreads + << " * NCCL_NSOCKS_PERTHREAD=" << nSocksPerThread + << " exceeds maximum integer value. 
Please use " + "smaller values."; + return; + } - INFO(NCCL_LOG_INFO, "Environment configuration found:"); - INFO(NCCL_LOG_INFO, " NCCL_SOCKET_NTHREADS=%d", nThreads); - INFO(NCCL_LOG_INFO, " NCCL_NSOCKS_PERTHREAD=%d", nSocksPerThread); - INFO(NCCL_LOG_INFO, " Total sockets=%d", totalSockets); + int totalSockets = nThreads * nSocksPerThread; - // Check if configuration is set to trigger the excessive threads warning - // Use NCCL_NET_MAX_REQUESTS instead of arbitrary MAX_THREADS - if (nThreads <= NCCL_NET_MAX_REQUESTS) { - GTEST_SKIP() << "SKIPPING TEST: NCCL_SOCKET_NTHREADS must be > " << NCCL_NET_MAX_REQUESTS << " to test excessive thread warning. " - << "Current NCCL_SOCKET_NTHREADS=" << nThreads << ". " - << "Please set: export NCCL_SOCKET_NTHREADS=" << (NCCL_NET_MAX_REQUESTS + 1) << ". " - << "NCCL_SOCKET_NTHREADS must be > NCCL_NET_MAX_REQUESTS (" << NCCL_NET_MAX_REQUESTS << ") to trigger warning"; - return; - } + INFO(NCCL_LOG_INFO, "Environment configuration found:"); + INFO(NCCL_LOG_INFO, " NCCL_SOCKET_NTHREADS=%d", nThreads); + INFO(NCCL_LOG_INFO, " NCCL_NSOCKS_PERTHREAD=%d", nSocksPerThread); + INFO(NCCL_LOG_INFO, " Total sockets=%d", totalSockets); - if (totalSockets > NCCL_NET_MAX_REQUESTS * 10) { // Allow 10x for testing excessive config - GTEST_SKIP() << "SKIPPING TEST: Total sockets=" << totalSockets << " is unreasonably large (> " << (NCCL_NET_MAX_REQUESTS * 10) << "). " - << "Please use more reasonable values for testing. NCCL_NET_MAX_REQUESTS=" << NCCL_NET_MAX_REQUESTS << ". " - << "Example: export NCCL_SOCKET_NTHREADS=" << (NCCL_NET_MAX_REQUESTS + 1) << " && export NCCL_NSOCKS_PERTHREAD=1"; - return; - } + // Check if configuration is set to trigger the excessive threads + // warning Use NCCL_NET_MAX_REQUESTS instead of arbitrary + // MAX_THREADS + if (nThreads <= NCCL_NET_MAX_REQUESTS) { + GTEST_SKIP() + << "SKIPPING TEST: NCCL_SOCKET_NTHREADS must be > " + << NCCL_NET_MAX_REQUESTS + << " to test excessive thread warning. 
" + << "Current NCCL_SOCKET_NTHREADS=" << nThreads << ". " + << "Please set: export NCCL_SOCKET_NTHREADS=" + << (NCCL_NET_MAX_REQUESTS + 1) << ". " + << "NCCL_SOCKET_NTHREADS must be > NCCL_NET_MAX_REQUESTS (" + << NCCL_NET_MAX_REQUESTS << ") to trigger warning"; + return; + } - INFO(NCCL_LOG_INFO, - "Configuration valid for testing excessive threads warning"); - INFO(NCCL_LOG_INFO, "NCCL_SOCKET_NTHREADS=%d > NCCL_NET_MAX_REQUESTS=%d", nThreads, NCCL_NET_MAX_REQUESTS); + if (totalSockets > + NCCL_NET_MAX_REQUESTS * + 10) { // Allow 10x for testing excessive config + GTEST_SKIP() << "SKIPPING TEST: Total sockets=" << totalSockets + << " is unreasonably large (> " + << (NCCL_NET_MAX_REQUESTS * 10) << "). " + << "Please use more reasonable values for testing. " + "NCCL_NET_MAX_REQUESTS=" + << NCCL_NET_MAX_REQUESTS << ". " + << "Example: export NCCL_SOCKET_NTHREADS=" + << (NCCL_NET_MAX_REQUESTS + 1) + << " && export NCCL_NSOCKS_PERTHREAD=1"; + return; + } - // Test socket properties - TestSocketProperties(); + INFO(NCCL_LOG_INFO, + "Configuration valid for testing excessive threads warning"); + INFO(NCCL_LOG_INFO, + "NCCL_SOCKET_NTHREADS=%d > NCCL_NET_MAX_REQUESTS=%d", nThreads, + NCCL_NET_MAX_REQUESTS); - // Initialize to trigger the warning logic - char handle[NCCL_NET_HANDLE_MAXSIZE]; - void *listenComm = nullptr; - ncclResult_t result = ncclNetSocket.listen(0, handle, &listenComm); + // Test socket properties + TestSocketProperties(); - if (result == ncclSuccess && listenComm) { - // The implementation should have limited the threads to NCCL_NET_MAX_REQUESTS - // internally - INFO(NCCL_LOG_INFO, - "*** SUCCESS: Listen succeeded with excessive NCCL_SOCKET_NTHREADS - " - "limits enforced internally ***"); - ncclNetSocket.closeListen(listenComm); - } else { - INFO(NCCL_LOG_INFO, "Listen failed with result: %d", result); - } + // Initialize to trigger the warning logic + char handle[NCCL_NET_HANDLE_MAXSIZE]; + void *listenComm = nullptr; + ncclResult_t result = 
ncclNetSocket.listen(0, handle, &listenComm); - INFO(NCCL_LOG_INFO, "TestExcessiveThreadConfig completed"); + if (result == ncclSuccess && listenComm) { + // The implementation should have limited the threads to + // NCCL_NET_MAX_REQUESTS internally + INFO(NCCL_LOG_INFO, "*** SUCCESS: Listen succeeded with " + "excessive NCCL_SOCKET_NTHREADS - " + "limits enforced internally ***"); + ncclNetSocket.closeListen(listenComm); + } else { + INFO(NCCL_LOG_INFO, "Listen failed with result: %d", result); + } + + INFO(NCCL_LOG_INFO, "TestExcessiveThreadConfig completed"); + }) + .withEnvironment({{"NCCL_SOCKET_NTHREADS", "33"}, + {"NCCL_NSOCKS_PERTHREAD", "1"}, + {"NCCL_DEBUG", "TRACE"}, + {"NCCL_DEBUG_SUBSYS", "ALL"}}) + ); } // Test for excessive socket configuration warning TEST_F(NetSocketTests, TestExcessiveSocketConfig) { - INFO(NCCL_LOG_INFO, "Testing excessive socket configuration warning"); + ProcessIsolatedTestRunner::ExecutionOptions options; + options.stopOnFirstFailure = false; // Continue running all tests + options.verboseLogging = true; - // Check if the required environment variables are set - const char *nThreadsEnv = getenv("NCCL_SOCKET_NTHREADS"); - const char *nSocksPerThreadEnv = getenv("NCCL_NSOCKS_PERTHREAD"); + RUN_ISOLATED_TESTS_WITH_OPTIONS(options, + ProcessIsolatedTestRunner::TestConfig( + "TestExcessiveSocketConfig", + [this]() { + INFO(NCCL_LOG_INFO, + "Testing excessive socket configuration warning"); - if (!nThreadsEnv || !nSocksPerThreadEnv) { - GTEST_SKIP() << "SKIPPING TEST: Required environment variables not set. " - << "This test requires total sockets (nThreads * nSocksPerThread) > MAX_SOCKETS (64). 
" - << "Environment variables NCCL_SOCKET_NTHREADS and NCCL_NSOCKS_PERTHREAD must be set"; - return; - } + // Check if the required environment variables are set + const char *nThreadsEnv = getenv("NCCL_SOCKET_NTHREADS"); + const char *nSocksPerThreadEnv = getenv("NCCL_NSOCKS_PERTHREAD"); - // Parse with validation - both must be positive - int nThreads = ParseEnvVar(nThreadsEnv, "NCCL_SOCKET_NTHREADS", 0, 1); - int nSocksPerThread = ParseEnvVar(nSocksPerThreadEnv, "NCCL_NSOCKS_PERTHREAD", 0, 1); + if (!nThreadsEnv || !nSocksPerThreadEnv) { + GTEST_SKIP() + << "SKIPPING TEST: Required environment variables not set. " + << "This test requires total sockets (nThreads * " + "nSocksPerThread) > MAX_SOCKETS (64). " + << "Environment variables NCCL_SOCKET_NTHREADS and " + "NCCL_NSOCKS_PERTHREAD must be set"; + return; + } - // Check for potential overflow before multiplication - if (nThreads > 0 && nSocksPerThread > INT_MAX / nThreads) { - GTEST_SKIP() << "SKIPPING TEST: Configuration would cause integer overflow. " - << "NCCL_SOCKET_NTHREADS=" << nThreads << " * NCCL_NSOCKS_PERTHREAD=" << nSocksPerThread - << " exceeds maximum integer value. Please use smaller values."; - return; - } + // Parse with validation - both must be positive + int nThreads = + ParseEnvVar(nThreadsEnv, "NCCL_SOCKET_NTHREADS", 0, 1); + int nSocksPerThread = + ParseEnvVar(nSocksPerThreadEnv, "NCCL_NSOCKS_PERTHREAD", 0, 1); - int totalSockets = nThreads * nSocksPerThread; + // Check for potential overflow before multiplication + if (nThreads > 0 && nSocksPerThread > INT_MAX / nThreads) { + GTEST_SKIP() << "SKIPPING TEST: Configuration would cause " + "integer overflow. " + << "NCCL_SOCKET_NTHREADS=" << nThreads + << " * NCCL_NSOCKS_PERTHREAD=" << nSocksPerThread + << " exceeds maximum integer value. 
Please use " + "smaller values."; + return; + } - INFO(NCCL_LOG_INFO, "Environment configuration found:"); - INFO(NCCL_LOG_INFO, " NCCL_SOCKET_NTHREADS=%d", nThreads); - INFO(NCCL_LOG_INFO, " NCCL_NSOCKS_PERTHREAD=%d", nSocksPerThread); - INFO(NCCL_LOG_INFO, " Total sockets=%d", totalSockets); + int totalSockets = nThreads * nSocksPerThread; - // Check if configuration is set to trigger the excessive sockets warning - const int MAX_SOCKETS = 64; - if (totalSockets <= MAX_SOCKETS) { - GTEST_SKIP() << "SKIPPING TEST: Total sockets must be > " << MAX_SOCKETS << " to test excessive socket warning. " - << "Current total sockets=" << totalSockets - << " (nThreads=" << nThreads << " * nSocksPerThread=" << nSocksPerThread << "). " - << "Please set environment variables such that total > " << MAX_SOCKETS << ", e.g.: " - << "export NCCL_SOCKET_NTHREADS=9 && export NCCL_NSOCKS_PERTHREAD=8. " - << "Total sockets must be > MAX_SOCKETS (" << MAX_SOCKETS << ") to trigger warning"; - return; - } + INFO(NCCL_LOG_INFO, "Environment configuration found:"); + INFO(NCCL_LOG_INFO, " NCCL_SOCKET_NTHREADS=%d", nThreads); + INFO(NCCL_LOG_INFO, " NCCL_NSOCKS_PERTHREAD=%d", nSocksPerThread); + INFO(NCCL_LOG_INFO, " Total sockets=%d", totalSockets); - // Additional validation against NCCL_NET_MAX_REQUESTS for reasonable upper bounds - if (totalSockets > NCCL_NET_MAX_REQUESTS * 10) { // Allow 10x for testing excessive config - GTEST_SKIP() << "SKIPPING TEST: Total sockets=" << totalSockets << " is unreasonably large (> " << (NCCL_NET_MAX_REQUESTS * 10) << "). " - << "Please use more reasonable values for testing. NCCL_NET_MAX_REQUESTS=" << NCCL_NET_MAX_REQUESTS << ". 
" - << "Example: export NCCL_SOCKET_NTHREADS=10 && export NCCL_NSOCKS_PERTHREAD=10"; - return; - } + // Check if configuration is set to trigger the excessive sockets + // warning + const int MAX_SOCKETS = 64; + if (totalSockets <= MAX_SOCKETS) { + GTEST_SKIP() + << "SKIPPING TEST: Total sockets must be > " << MAX_SOCKETS + << " to test excessive socket warning. " + << "Current total sockets=" << totalSockets + << " (nThreads=" << nThreads + << " * nSocksPerThread=" << nSocksPerThread << "). " + << "Please set environment variables such that total > " + << MAX_SOCKETS << ", e.g.: " + << "export NCCL_SOCKET_NTHREADS=9 && export " + "NCCL_NSOCKS_PERTHREAD=8. " + << "Total sockets must be > MAX_SOCKETS (" << MAX_SOCKETS + << ") to trigger warning"; + return; + } - INFO(NCCL_LOG_INFO, - "Configuration valid for testing excessive sockets warning"); - INFO(NCCL_LOG_INFO, "Total sockets=%d > MAX_SOCKETS=64", totalSockets); + // Additional validation against NCCL_NET_MAX_REQUESTS for + // reasonable upper bounds + if (totalSockets > + NCCL_NET_MAX_REQUESTS * + 10) { // Allow 10x for testing excessive config + GTEST_SKIP() << "SKIPPING TEST: Total sockets=" << totalSockets + << " is unreasonably large (> " + << (NCCL_NET_MAX_REQUESTS * 10) << "). " + << "Please use more reasonable values for testing. " + "NCCL_NET_MAX_REQUESTS=" + << NCCL_NET_MAX_REQUESTS << ". 
" + << "Example: export NCCL_SOCKET_NTHREADS=10 && " + "export NCCL_NSOCKS_PERTHREAD=10"; + return; + } - // Test socket properties - TestSocketProperties(); + INFO(NCCL_LOG_INFO, + "Configuration valid for testing excessive sockets warning"); + INFO(NCCL_LOG_INFO, "Total sockets=%d > MAX_SOCKETS=64", + totalSockets); - // Initialize to trigger the warning logic - char handle[NCCL_NET_HANDLE_MAXSIZE]; - void *listenComm = nullptr; - ncclResult_t result = ncclNetSocket.listen(0, handle, &listenComm); + // Test socket properties + TestSocketProperties(); - if (result == ncclSuccess && listenComm) { - // The implementation should have limited the sockets to MAX_SOCKETS - // internally - INFO(NCCL_LOG_INFO, "*** SUCCESS: Listen succeeded with excessive total " - "sockets - limits enforced internally ***"); - ncclNetSocket.closeListen(listenComm); - } else { - INFO(NCCL_LOG_INFO, "Listen failed with result: %d", result); - } + // Initialize to trigger the warning logic + char handle[NCCL_NET_HANDLE_MAXSIZE]; + void *listenComm = nullptr; + ncclResult_t result = ncclNetSocket.listen(0, handle, &listenComm); - INFO(NCCL_LOG_INFO, "TestExcessiveSocketConfig completed"); + if (result == ncclSuccess && listenComm) { + // The implementation should have limited the sockets to + // MAX_SOCKETS internally + INFO(NCCL_LOG_INFO, + "*** SUCCESS: Listen succeeded with excessive total " + "sockets - limits enforced internally ***"); + ncclNetSocket.closeListen(listenComm); + } else { + INFO(NCCL_LOG_INFO, "Listen failed with result: %d", result); + } + + INFO(NCCL_LOG_INFO, "TestExcessiveSocketConfig completed"); + }) + .withEnvironment({{"NCCL_SOCKET_NTHREADS", "10"}, + {"NCCL_NSOCKS_PERTHREAD", "10"}, + {"NCCL_DEBUG", "TRACE"}, + {"NCCL_DEBUG_SUBSYS", "ALL"}}) + ); } // Test to trigger request allocation failure scenario diff --git a/projects/rccl/test/ProxyTests.cpp b/projects/rccl/test/ProxyTests.cpp index 5c8cf06bc5..1ac2ea8f4f 100644 --- a/projects/rccl/test/ProxyTests.cpp +++ 
b/projects/rccl/test/ProxyTests.cpp @@ -3,20 +3,14 @@ * * See LICENSE.txt for license information ************************************************************************/ -#include "gtest/gtest.h" - #include "collectives.h" #include "comm.h" +#include "gtest/gtest.h" #include "info.h" #include "profiler.h" #include "shmutils.h" #include "socket.h" #define ENABLE_TIMER 0 -#include "profiler.h" -#include "proxy.h" -#include "timer.h" -#include "transport.h" - #include #include #include @@ -25,409 +19,467 @@ #include #include +#include "common/ErrCode.hpp" +#include "common/ProcessIsolatedTestRunner.hpp" +#include "profiler.h" +#include "proxy.h" +#include "timer.h" +#include "transport.h" + #define NCCL_MAX_OPS (2048) #define OP_INDEX(op) ((op) ? (op) - state->pools->elems : -1) #define OP_SEEN 0x100000 -ncclResult_t getOpIndex(struct ncclProxyArgs *op, - struct ncclProxyProgressState *state, int *poolIndex, - int *opIndex); -ncclResult_t dumpProxyState(struct ncclProxyProgressState *state); -ncclResult_t printProxyOp(struct ncclProxyArgs *op, int poolIndex, int opIndex); -ncclResult_t dumpProxyState(struct ncclProxyProgressState *state); -ncclResult_t ncclProxyCallBlockingUDS(struct ncclComm *comm, - struct ncclProxyConnector *proxyConn, - int type, void *reqBuff, int reqSize, - void *respBuff, int respSize, int *reqFd, - int *respFd); -ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm *comm, int proxyRank, - void *handle, int *convertedFd); -ncclResult_t -ncclProxyClientQueryFdBlocking(struct ncclComm *comm, - struct ncclProxyConnector *proxyConn, - int localFd, int *rmtFd); +ncclResult_t getOpIndex( + struct ncclProxyArgs* op, struct ncclProxyProgressState* state, int* poolIndex, int* opIndex +); +ncclResult_t dumpProxyState(struct ncclProxyProgressState* state); +ncclResult_t printProxyOp(struct ncclProxyArgs* op, int poolIndex, int opIndex); +ncclResult_t dumpProxyState(struct ncclProxyProgressState* state); +ncclResult_t ncclProxyCallBlockingUDS( + 
struct ncclComm* comm, + struct ncclProxyConnector* proxyConn, + int type, + void* reqBuff, + int reqSize, + void* respBuff, + int respSize, + int* reqFd, + int* respFd +); +ncclResult_t ncclProxyClientGetFdBlocking( + struct ncclComm* comm, int proxyRank, void* handle, int* convertedFd +); +ncclResult_t ncclProxyClientQueryFdBlocking( + struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int localFd, int* rmtFd +); void ncclDumpProxyState(int signal); #define PROXYARGS_ALLOCATE_SIZE NCCL_MAX_OPS -struct ncclProxyPool { - struct ncclProxyPool *next; - struct ncclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE]; + +struct ncclProxyPool +{ + struct ncclProxyPool* next; + struct ncclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE]; }; -void init_ncclProxyArgs_struct(ncclProxyArgs *pool_ptr) { - // init pool_ptr - pool_ptr->send = 2; - pool_ptr->nextRank = 4; - pool_ptr->prevRank = 5; - pool_ptr->pattern = ncclPatternRing; - pool_ptr->nsubs = 1; - pool_ptr->state = ncclProxyOpNone; - pool_ptr->retry_total = 2; +void init_ncclProxyArgs_struct(ncclProxyArgs* pool_ptr) +{ + // init pool_ptr + pool_ptr->send = 2; + pool_ptr->nextRank = 4; + pool_ptr->prevRank = 5; + pool_ptr->pattern = ncclPatternRing; + pool_ptr->nsubs = 1; + pool_ptr->state = ncclProxyOpNone; + pool_ptr->retry_total = 2; } -namespace RcclUnitTesting { -TEST(ProxyTests, - getOpIndex) { // Tests what is the index of the pool being passed within - // the known valid pools in state ptr - INFO(NCCL_LOG_INFO, "[ProxyTests] Test Start \n"); - // Init Dummy structs - struct ncclProxyArgs *pool_ptr = new ncclProxyArgs; - struct ncclProxyPool *pools_ptr = new ncclProxyPool; - struct ncclProxyPool *pools2_ptr = new ncclProxyPool; - struct ncclProxyProgressState *state_ptr = new ncclProxyProgressState; +namespace RcclUnitTesting +{ +TEST(ProxyTests, getOpIndex) +{ // Tests what is the index of the pool being passed within + // the known valid pools in state ptr + INFO("[ProxyTests] Test Start \n"); - // state_ptr = &state; 
- state_ptr->active = &pools_ptr->elems[1]; // chk - state_ptr->pool = pool_ptr; - state_ptr->pools = pools_ptr; + // Init Dummy structs + struct ncclProxyArgs* pool_ptr = new ncclProxyArgs; + struct ncclProxyPool* pools_ptr = new ncclProxyPool; + struct ncclProxyPool* pools2_ptr = new ncclProxyPool; + struct ncclProxyProgressState* state_ptr = new ncclProxyProgressState; - pools_ptr->next = pools2_ptr; + // state_ptr = &state; + state_ptr->active = &pools_ptr->elems[1]; // chk + state_ptr->pool = pool_ptr; + state_ptr->pools = pools_ptr; - struct ncclProxyArgs *x = - &pools_ptr->elems[5]; // Passing the 5th element of the pool - struct ncclProxyProgressState *y = state_ptr; - y->pools->next = y->pools; // next points to self + pools_ptr->next = pools2_ptr; - INFO(NCCL_LOG_INFO, "[ProxyTests] x=%u y->pools=%u x-y=%u \n", x, - y->pools->elems, x - y->pools->elems); + struct ncclProxyArgs* x = &pools_ptr->elems[5]; // Passing the 5th element of the pool + struct ncclProxyProgressState* y = state_ptr; + y->pools->next = y->pools; // next points to self - int pool_idx, opIndex; - ncclResult_t res = getOpIndex(x, y, &pool_idx, &opIndex); + INFO( + "[ProxyTests] x=%p y->pools=%p x-y=%ld \n", + (void*)x, + (void*)y->pools->elems, + x - y->pools->elems + ); - ASSERT_EQ(pool_idx, 0); - ASSERT_EQ(opIndex, 5); + int pool_idx, opIndex; + ncclResult_t res = getOpIndex(x, y, &pool_idx, &opIndex); - INFO(NCCL_LOG_INFO, "[ProxyTests] pool_idx %d opIndex %d \n", pool_idx, - opIndex); - INFO(NCCL_LOG_INFO, "[ProxyTests] res %u \n", res); - assert(res == ncclSuccess); + ASSERT_EQ(pool_idx, 0); + ASSERT_EQ(opIndex, 5); + + INFO("[ProxyTests] pool_idx %d opIndex %d \n", pool_idx, opIndex); + INFO("[ProxyTests] res %u \n", res); + assert(res == ncclSuccess); delete pool_ptr; delete pools_ptr; delete pools2_ptr; delete state_ptr; - INFO(NCCL_LOG_INFO, "[ProxyTests] Test Complete \n"); + INFO("[ProxyTests] Test Complete \n"); } -TEST(ProxyTests, printProxyOp) { - INFO(NCCL_LOG_INFO, 
"[ProxyTests] Test Start \n"); - // Init Dummy structs +TEST(ProxyTests, printProxyOp) +{ + INFO("[ProxyTests] Test Start \n"); + // Init Dummy structs - struct ncclProxyArgs *pool_ptr = new ncclProxyArgs; + struct ncclProxyArgs* pool_ptr = new ncclProxyArgs; - struct ncclProxyPool *pools_ptr = new ncclProxyPool; - struct ncclProxyPool *pools2_ptr = new ncclProxyPool; + struct ncclProxyPool* pools_ptr = new ncclProxyPool; + struct ncclProxyPool* pools2_ptr = new ncclProxyPool; - struct ncclProxyProgressState *state_ptr = new ncclProxyProgressState; + struct ncclProxyProgressState* state_ptr = new ncclProxyProgressState; - // state_ptr = &state; - state_ptr->active = &pools_ptr->elems[1]; // chk - state_ptr->pool = pool_ptr; - state_ptr->pools = pools_ptr; + // state_ptr = &state; + state_ptr->active = &pools_ptr->elems[1]; // chk + state_ptr->pool = pool_ptr; + state_ptr->pools = pools_ptr; - pools_ptr->next = pools2_ptr; + pools_ptr->next = pools2_ptr; - struct ncclProxyArgs *x = &pools_ptr->elems[5]; - struct ncclProxyProgressState *y = state_ptr; - y->pools->next = y->pools; // next points to self + struct ncclProxyArgs* x = &pools_ptr->elems[5]; + struct ncclProxyProgressState* y = state_ptr; + y->pools->next = y->pools; // next points to self - INFO(NCCL_LOG_INFO, "[ProxyTests] x=%u y->pools=%u x-y=%u \n", x, - y->pools->elems, x - y->pools->elems); + INFO( + "[ProxyTests] x=%p y->pools=%p x-y=%ld \n", + (void*)x, + (void*)y->pools->elems, + x - y->pools->elems + ); - init_ncclProxyArgs_struct(pool_ptr); + init_ncclProxyArgs_struct(pool_ptr); - int pool_idx = 2, opIndex = 3; // random vals - ncclResult_t res = printProxyOp(pool_ptr, pool_idx, opIndex); + int pool_idx = 2, opIndex = 3; // random vals + ncclResult_t res = printProxyOp(pool_ptr, pool_idx, opIndex); - INFO(NCCL_LOG_INFO, "[ProxyTests] res %u \n", res); - assert(res == ncclSuccess); + INFO("[ProxyTests] res %u \n", res); + assert(res == ncclSuccess); delete pools_ptr; delete pools2_ptr; delete 
pool_ptr; delete state_ptr; - INFO(NCCL_LOG_INFO, "[ProxyTests] Test Complete \n"); + INFO("[ProxyTests] Test Complete \n"); } -TEST(ProxyTests, dumpProxyState) { - INFO(NCCL_LOG_INFO, "[ProxyTests] Test Start \n"); +TEST(ProxyTests, dumpProxyState) +{ + INFO("[ProxyTests] Test Start \n"); - // Init Dummy structs - struct ncclProxyArgs *pool_ptr; - struct ncclProxyPool *pools_ptr = new ncclProxyPool; - struct ncclProxyPool *pools2_ptr = new ncclProxyPool; + // Init Dummy structs + struct ncclProxyArgs* pool_ptr; + struct ncclProxyPool* pools_ptr = new ncclProxyPool; + struct ncclProxyPool* pools2_ptr = new ncclProxyPool; - struct ncclProxyProgressState *state_ptr = new ncclProxyProgressState; + struct ncclProxyProgressState* state_ptr = new ncclProxyProgressState; - state_ptr->active = &pools_ptr->elems[1]; - pool_ptr = &pools_ptr->elems[4]; - pool_ptr->next = NULL; - pool_ptr->nextPeer = NULL; + state_ptr->active = &pools_ptr->elems[1]; + pool_ptr = &pools_ptr->elems[4]; + pool_ptr->next = NULL; + pool_ptr->nextPeer = NULL; - state_ptr->pool = pool_ptr; - state_ptr->pool->next = NULL; - state_ptr->pool->nextPeer = NULL; - state_ptr->pool->state = OP_SEEN; - state_ptr->pools = pools_ptr; - state_ptr->pools->next = NULL; + state_ptr->pool = pool_ptr; + state_ptr->pool->next = NULL; + state_ptr->pool->nextPeer = NULL; + state_ptr->pool->state = OP_SEEN; + state_ptr->pools = pools_ptr; + state_ptr->pools->next = NULL; - struct ncclProxyArgs *op = state_ptr->active; - op->state = OP_SEEN; - op->nextPeer = NULL; - op->next = NULL; + struct ncclProxyArgs* op = state_ptr->active; + op->state = OP_SEEN; + op->nextPeer = NULL; + op->next = NULL; - pools_ptr->next = NULL; + pools_ptr->next = NULL; - init_ncclProxyArgs_struct(pool_ptr); + init_ncclProxyArgs_struct(pool_ptr); - int pool_idx = 2, opIndex = 3; // random vals - ncclResult_t res = dumpProxyState(state_ptr); + int pool_idx = 2, opIndex = 3; // random vals + ncclResult_t res = dumpProxyState(state_ptr); - 
INFO(NCCL_LOG_INFO, "[ProxyTests] res %u \n", res); - ASSERT_EQ(res, ncclSuccess); + INFO("[ProxyTests] res %u \n", res); + ASSERT_EQ(res, ncclSuccess); delete pools_ptr; - + delete pools2_ptr; - - + delete state_ptr; - INFO(NCCL_LOG_INFO, "[ProxyTests] Test Complete \n"); + INFO("[ProxyTests] Test Complete \n"); } -TEST(ProxyTests, ncclProxyCallBlockingUDS) { - INFO(NCCL_LOG_INFO, "[ProxyTests] Test Start \n"); +TEST(ProxyTests, ncclProxyCallBlockingUDS) +{ + INFO("[ProxyTests] Test Start \n"); - // Init Dummy structs - struct ncclComm *comm = new ncclComm; - int *arr = new int[100]; - for (int i = 0; i < 100; i++) { - arr[i] = i; - } + // Init Dummy structs + struct ncclComm* comm = new ncclComm; + int* arr = new int[100]; + for(int i = 0; i < 100; i++) + { + arr[i] = i; + } - comm->topParentLocalRanks = arr; - comm->localRank = 10; + comm->topParentLocalRanks = arr; + comm->localRank = 10; - int *arr_x = new int[20]; - for (int i = 0; i < 20; i++) { - arr_x[i] = i; - } - comm->topParentRanks = arr_x; + int* arr_x = new int[20]; + for(int i = 0; i < 20; i++) + { + arr_x[i] = i; + } + comm->topParentRanks = arr_x; - struct ncclProxyState *sharedProxyState = new ncclProxyState; - uint64_t *arr2 = new uint64_t[10]; - for (int i = 0; i < 10; i++) { - arr2[i] = 122567 + i; // random - } + struct ncclProxyState* sharedProxyState = new ncclProxyState; + uint64_t* arr2 = new uint64_t[10]; + for(int i = 0; i < 10; i++) + { + arr2[i] = 122567 + i; // random + } - INFO(NCCL_LOG_INFO, "[ProxyTests] sizeof(ncclProxyConnector) = %u\n", - sizeof(ncclProxyConnector)); - struct ncclProxyConnector *proxyConn = - new (std::nothrow) ncclProxyConnector[20]; - if (proxyConn == nullptr) { - // Handle allocation failure - INFO(NCCL_LOG_INFO, "[ProxyTests] Allocation failed\n"); - ASSERT_NE(proxyConn, nullptr); - } + INFO("[ProxyTests] sizeof(ncclProxyConnector) = %zu\n", sizeof(ncclProxyConnector)); + struct ncclProxyConnector* proxyConn = new(std::nothrow) ncclProxyConnector[20]; + 
if(proxyConn == nullptr) + { + // Handle allocation failure + INFO("[ProxyTests] Allocation failed\n"); + ASSERT_NE(proxyConn, nullptr); + } - proxyConn->tpRank = 2; + proxyConn->tpRank = 2; - comm->proxyState = sharedProxyState; + comm->proxyState = sharedProxyState; - comm->proxyState->peerAddressesUDS = arr2; + comm->proxyState->peerAddressesUDS = arr2; - comm->abortFlag = NULL; + comm->abortFlag = NULL; - int rank = comm->topParentLocalRanks[comm->localRank]; - INFO(NCCL_LOG_INFO, "[ProxyTests] rank %d\n", rank); - uint64_t pidHash = sharedProxyState->peerAddressesUDS[proxyConn->tpRank]; - INFO(NCCL_LOG_INFO, "[ProxyTests] pidHash %u \n", pidHash); + int rank = comm->topParentLocalRanks[comm->localRank]; + INFO("[ProxyTests] rank %d\n", rank); + uint64_t pidHash = sharedProxyState->peerAddressesUDS[proxyConn->tpRank]; + INFO("[ProxyTests] pidHash %lu \n", pidHash); - int type = ncclProxyMsgGetFd; - // some memory on stack for storing request and response buffers - uint64_t *x_mem = new uint64_t[10]; - uint64_t *x_mem2 = new uint64_t[10]; - void *reqBuff = (void *)x_mem; - int reqSize = sizeof(uint64_t) * 5; - void *respBuff = NULL; - int respSize = 0; - int *reqFd = NULL; - int *respFd = (int *)x_mem2; + int type = ncclProxyMsgGetFd; + // some memory on stack for storing request and response buffers + uint64_t* x_mem = new uint64_t[10]; + uint64_t* x_mem2 = new uint64_t[10]; + void* reqBuff = (void*)x_mem; + int reqSize = sizeof(uint64_t) * 5; + void* respBuff = NULL; + int respSize = 0; + int* reqFd = NULL; + int* respFd = (int*)x_mem2; - ncclResult_t res = - ncclProxyCallBlockingUDS(comm, proxyConn, type, reqBuff, reqSize, - respBuff, respSize, reqFd, respFd); + ncclResult_t res = ncclProxyCallBlockingUDS( + comm, + proxyConn, + type, + reqBuff, + reqSize, + respBuff, + respSize, + reqFd, + respFd + ); - bool bool_res = (res >= ncclSuccess && res <= ncclRemoteError); - INFO(NCCL_LOG_INFO, "[ProxyTests] res %u \n", bool_res); - ASSERT_EQ(bool_res, true); + 
bool bool_res = (res >= ncclSuccess && res <= ncclRemoteError); + INFO("[ProxyTests] res %u \n", bool_res); + ASSERT_EQ(bool_res, true); delete comm; delete sharedProxyState; - delete proxyConn; + delete[] proxyConn; delete[] arr_x; delete[] arr; delete[] arr2; delete[] x_mem; delete[] x_mem2; - INFO(NCCL_LOG_INFO, "[ProxyTests] Test Complete \n"); + INFO("[ProxyTests] Test Complete \n"); } -TEST(ProxyTests, ncclProxyClientGetFdBlocking) { - INFO(NCCL_LOG_INFO, "[ProxyTests] Test Start \n"); +TEST(ProxyTests, ncclProxyClientGetFdBlocking) +{ + RUN_ISOLATED_TEST( + "ncclProxyClientGetFdBlocking", + []() + { + INFO("[ProxyTests] Test Start \n"); - // Init Dummy structs - struct ncclComm *comm = new ncclComm; - int *arr = new int[100]; - for (int i = 0; i < 100; i++) { - arr[i] = i; - } + // Init Dummy structs + struct ncclComm* comm = new ncclComm; + int* arr = new int[100]; + for(int i = 0; i < 100; i++) + { + arr[i] = i; + } - comm->topParentLocalRanks = arr; - comm->localRank = 10; - struct ncclProxyState *sharedProxyState = new ncclProxyState; + comm->topParentLocalRanks = arr; + comm->localRank = 10; + struct ncclProxyState* sharedProxyState = new ncclProxyState; - int *arr_x = new int[20]; - for (int i = 0; i < 20; i++) { - arr_x[i] = i; - } - comm->topParentRanks = arr_x; + int* arr_x = new int[20]; + for(int i = 0; i < 20; i++) + { + arr_x[i] = i; + } + comm->topParentRanks = arr_x; - uint64_t *arr2 = new uint64_t[10]; - for (int i = 0; i < 10; i++) { - arr2[i] = 122567 + i; // random - } + uint64_t* arr2 = new uint64_t[10]; + for(int i = 0; i < 10; i++) + { + arr2[i] = 122567 + i; // random + } - struct ncclProxyConnector *proxyConn = - new (std::nothrow) ncclProxyConnector[20]; - if (proxyConn == nullptr) { - // Handle allocation failure - INFO(NCCL_LOG_INFO, "[ProxyTests] Allocation failed\n"); - ASSERT_NE(proxyConn, nullptr); - } + struct ncclProxyConnector* proxyConn = new(std::nothrow) ncclProxyConnector[20]; + if(proxyConn == nullptr) + { + // Handle 
allocation failure + INFO("[ProxyTests] Allocation failed\n"); + ASSERT_NE(proxyConn, nullptr); + } - proxyConn->tpRank = 2; - comm->proxyState = sharedProxyState; - comm->proxyState->peerAddressesUDS = arr2; - comm->abortFlag = NULL; + proxyConn->tpRank = 2; + comm->proxyState = sharedProxyState; + comm->proxyState->peerAddressesUDS = arr2; + comm->abortFlag = NULL; - int rank = comm->topParentLocalRanks[comm->localRank]; - INFO(NCCL_LOG_INFO, "[ProxyTests] rank %d\n", rank); - uint64_t pidHash = sharedProxyState->peerAddressesUDS[proxyConn->tpRank]; - INFO(NCCL_LOG_INFO, "[ProxyTests] pidHash %u \n", pidHash); + int rank = comm->topParentLocalRanks[comm->localRank]; + INFO("[ProxyTests] rank %d\n", rank); + uint64_t pidHash = sharedProxyState->peerAddressesUDS[proxyConn->tpRank]; + INFO("[ProxyTests] pidHash %lu \n", pidHash); - int type = ncclProxyMsgGetFd; - // some memory on stack for storing request and response buffers - uint64_t *x_mem = new uint64_t[10]; - uint64_t *x_mem2 = new uint64_t[10]; - void *reqBuff = (void *)x_mem; - int reqSize = sizeof(uint64_t) * 5; - void *respBuff = NULL; - int respSize = 0; - int *reqFd = NULL; - int *respFd = (int *)x_mem2; + int type = ncclProxyMsgGetFd; + // some memory on stack for storing request and response buffers + uint64_t* x_mem = new uint64_t[10]; + uint64_t* x_mem2 = new uint64_t[10]; + void* reqBuff = (void*)x_mem; + int reqSize = sizeof(uint64_t) * 5; + void* respBuff = NULL; + int respSize = 0; + int* reqFd = NULL; + int* respFd = (int*)x_mem2; - comm->gproxyConn = proxyConn; - comm->gproxyConn[rank].initialized = true; + comm->gproxyConn = proxyConn; + comm->gproxyConn[rank].initialized = true; - ncclResult_t res = ncclProxyClientGetFdBlocking(comm, rank, reqBuff, respFd); + ncclResult_t res = ncclProxyClientGetFdBlocking(comm, rank, reqBuff, respFd); - bool bool_res = (res >= ncclSuccess && res <= ncclRemoteError); - INFO(NCCL_LOG_INFO, "[ProxyTests] res %u \n", bool_res); - ASSERT_EQ(bool_res, true); + 
bool bool_res = (res >= ncclSuccess && res <= ncclRemoteError); + INFO("[ProxyTests] res %u \n", bool_res); + ASSERT_EQ(bool_res, true); - delete comm; - delete sharedProxyState; - delete proxyConn; - delete[] arr_x; - delete[] arr; - delete[] arr2; - delete[] x_mem; - delete[] x_mem2; - INFO(NCCL_LOG_INFO, "[ProxyTests] Test Complete \n"); + delete comm; + delete sharedProxyState; + delete[] proxyConn; + delete[] arr_x; + delete[] arr; + delete[] arr2; + delete[] x_mem; + delete[] x_mem2; + INFO("[ProxyTests] Test Complete \n"); + INFO("Test 'ncclProxyClientGetFdBlocking' PASSED\n"); + } + ); } -TEST(ProxyTests, ncclProxyClientQueryFdBlocking) { - INFO(NCCL_LOG_INFO, "[ProxyTests] Test Start \n"); +TEST(ProxyTests, ncclProxyClientQueryFdBlocking) +{ + RUN_ISOLATED_TEST( + "ncclProxyClientQueryFdBlocking", + []() + { + INFO("[ProxyTests] Test Start \n"); - // Init Dummy structs - struct ncclComm *comm = new ncclComm; - int *arr = new int[100]; - for (int i = 0; i < 5; i++) { - arr[i] = i; - } + // Init Dummy structs + struct ncclComm* comm = new ncclComm; + int* arr = new int[100]; + for(int i = 0; i < 5; i++) + { + arr[i] = i; + } - comm->topParentLocalRanks = arr; - comm->localRank = 0; + comm->topParentLocalRanks = arr; + comm->localRank = 0; - int *arr_x = new int[20]; - for (int i = 0; i < 20; i++) { - arr_x[i] = i; - } - comm->topParentRanks = arr_x; + int* arr_x = new int[20]; + for(int i = 0; i < 20; i++) + { + arr_x[i] = i; + } + comm->topParentRanks = arr_x; - struct ncclProxyState *sharedProxyState = new ncclProxyState; + struct ncclProxyState* sharedProxyState = new ncclProxyState; - uint64_t *arr2 = new uint64_t[10]; - for (int i = 0; i < 10; i++) { - arr2[i] = 122567 + i; // random - } + uint64_t* arr2 = new uint64_t[10]; + for(int i = 0; i < 10; i++) + { + arr2[i] = 122567 + i; // random + } - struct ncclProxyConnector *proxyConn = - new (std::nothrow) ncclProxyConnector[20]; - if (proxyConn == nullptr) { - // Handle allocation failure - 
INFO(NCCL_LOG_INFO, "[ProxyTests] Allocation failed\n"); - ASSERT_NE(proxyConn, nullptr); - } + struct ncclProxyConnector* proxyConn = new(std::nothrow) ncclProxyConnector[20]; + if(proxyConn == nullptr) + { + // Handle allocation failure + INFO("[ProxyTests] Allocation failed\n"); + ASSERT_NE(proxyConn, nullptr); + } - proxyConn->tpRank = 2; + proxyConn->tpRank = 2; - comm->proxyState = sharedProxyState; + comm->proxyState = sharedProxyState; - comm->proxyState->peerAddressesUDS = arr2; + comm->proxyState->peerAddressesUDS = arr2; - comm->abortFlag = NULL; + comm->abortFlag = NULL; - int rank = comm->topParentLocalRanks[comm->localRank]; - INFO(NCCL_LOG_INFO, "[ProxyTests] rank %d\n", rank); - uint64_t pidHash = sharedProxyState->peerAddressesUDS[proxyConn->tpRank]; - INFO(NCCL_LOG_INFO, "[ProxyTests] pidHash %u \n", pidHash); + int rank = comm->topParentLocalRanks[comm->localRank]; + INFO("[ProxyTests] rank %d\n", rank); + uint64_t pidHash = sharedProxyState->peerAddressesUDS[proxyConn->tpRank]; + INFO("[ProxyTests] pidHash %lu \n", pidHash); - int type = ncclProxyMsgGetFd; - // some memory on stack for storing request and response buffers - uint64_t *x_mem = new uint64_t[10]; - uint64_t *x_mem2 = new uint64_t[10]; - void *reqBuff = (void *)x_mem; - int reqSize = sizeof(uint64_t) * 5; - void *respBuff = NULL; - int respSize = 0; - int *reqFd = NULL; - int *respFd = (int *)x_mem2; + int type = ncclProxyMsgGetFd; + // some memory on stack for storing request and response buffers + uint64_t* x_mem = new uint64_t[10]; + uint64_t* x_mem2 = new uint64_t[10]; + void* reqBuff = (void*)x_mem; + int reqSize = sizeof(uint64_t) * 5; + void* respBuff = NULL; + int respSize = 0; + int* reqFd = NULL; + int* respFd = (int*)x_mem2; - comm->gproxyConn = proxyConn; - comm->gproxyConn[rank].initialized = true; + comm->gproxyConn = proxyConn; + comm->gproxyConn[rank].initialized = true; - int localFd = 0; - int dummy_int = 20; - respBuff = &dummy_int; - ncclResult_t res = - 
ncclProxyClientQueryFdBlocking(comm, proxyConn, localFd, (int *)respBuff); + int localFd = 0; + int dummy_int = 20; + respBuff = &dummy_int; + ncclResult_t res + = ncclProxyClientQueryFdBlocking(comm, proxyConn, localFd, (int*)respBuff); - bool bool_res = (res >= ncclSuccess && res <= ncclRemoteError); - INFO(NCCL_LOG_INFO, "[ProxyTests] res %u \n", bool_res); - ASSERT_EQ(bool_res, true); + bool bool_res = (res >= ncclSuccess && res <= ncclRemoteError); + INFO("[ProxyTests] res %u \n", bool_res); + ASSERT_EQ(bool_res, true); - delete comm; - delete sharedProxyState; - delete proxyConn; - delete[] arr_x; - delete[] arr; - delete[] arr2; - delete[] x_mem; - delete[] x_mem2; - INFO(NCCL_LOG_INFO, "[ProxyTests] Test Complete \n"); + delete comm; + delete sharedProxyState; + delete[] proxyConn; + delete[] arr_x; + delete[] arr; + delete[] arr2; + delete[] x_mem; + delete[] x_mem2; + INFO("[ProxyTests] Test Complete \n"); + INFO("Test 'ncclProxyClientQueryFdBlocking' PASSED\n"); + } + ); } } // namespace RcclUnitTesting diff --git a/projects/rccl/test/README.md b/projects/rccl/test/README.md index 9268b6ceaa..e6e323d4c9 100644 --- a/projects/rccl/test/README.md +++ b/projects/rccl/test/README.md @@ -14,7 +14,7 @@ The RCCL test suite provides following frameworks along with the existing rccl-U ## Testing Frameworks -Following are two new complementary testing frameworks for different testing needs: +Following is a new testing framework for running single node & single process test in isolation: ### 1. Process Isolated Test Runner Run tests in isolated processes with clean environment settings. 
diff --git a/projects/rccl/test/RcclWrapTests.cpp b/projects/rccl/test/RcclWrapTests.cpp index bbb378b371..3b41647e1b 100644 --- a/projects/rccl/test/RcclWrapTests.cpp +++ b/projects/rccl/test/RcclWrapTests.cpp @@ -4,2316 +4,1306 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "comm.h" // Ensure full definition of struct ncclComm -#include "debug.h" -#include "graph/topo.h" -#include -#include #include #include -namespace RcclUnitTesting { +#include +#include -// Static flag to ensure only one rcclSetP2pNetChunkSize test runs per execution -static bool s_p2pNetChunkSizeTestExecuted = false; +#include "comm.h" +#include "common/ProcessIsolatedTestRunner.hpp" +#include "debug.h" +#include "graph/topo.h" -// Helper function to check if P2P test should be skipped due to execution order -static bool ShouldSkipP2pTestDueToExecutionOrder(const std::string &testName) { - if (s_p2pNetChunkSizeTestExecuted) { - INFO(NCCL_LOG_INFO, - "\n=== IMPORTANT NOTE ===\n" - "Test '%s' is being skipped because another rcclSetP2pNetChunkSize " - "test\n" - "has already executed in this run. 
The rcclSetP2pNetChunkSize " - "function uses a static\n" - "variable that gets initialized on first call, which affects " - "subsequent tests.\n" - "\nTo run this test properly, execute it individually using:\n" - " --gtest_filter=Rcclwrap.%s\n" - "\nOr run each rcclSetP2pNetChunkSize test in separate executions to " - "ensure\n" - "proper static variable initialization.\n" - "========================\n", - testName.c_str(), testName.c_str()); - return true; - } - - // Mark that a P2P test is now executing - s_p2pNetChunkSizeTestExecuted = true; - return false; -} +namespace RcclUnitTesting +{ // Helper function to determine if P2P test should be skipped due to static // variable state -static bool ShouldSkipP2pTest(const char *requiredEnvValue = nullptr) { - const char *envValue = getenv("NCCL_P2P_NET_CHUNKSIZE"); +static bool ShouldSkipP2pTest(const char* requiredEnvValue = nullptr) +{ + const char* envValue = getenv("NCCL_P2P_NET_CHUNKSIZE"); - // If a specific environment value is required, check for it - if (requiredEnvValue != nullptr) { - if (!envValue || strcmp(envValue, requiredEnvValue) != 0) { - return true; // Skip if env var is not set to required value + // If a specific environment value is required, check for it + if(requiredEnvValue != nullptr) + { + if(!envValue || strcmp(envValue, requiredEnvValue) != 0) + { + return true; // Skip if env var is not set to required value + } + return false; // Don't skip if env var matches required value } - return false; // Don't skip if env var matches required value - } - // For architecture logic tests, skip only if environment variable is set - // (which would override the static variable behavior) - // Note: We cannot directly check if static variable is RCCL_VALUE_UNSET - // from test code, so we rely on clean environment for proper testing - if (envValue != nullptr) { - return true; // Skip if env var is set (prevents testing architecture logic) - } + // For architecture logic tests, skip only if 
environment variable is set + // (which would override the static variable behavior) + // Note: We cannot directly check if static variable is RCCL_VALUE_UNSET + // from test code, so we rely on clean environment for proper testing + if(envValue != nullptr) + { + return true; // Skip if env var is set (prevents testing architecture logic) + } - // Environment is clean - proceed with test - // Warning: Static variable might still be initialized from previous tests - // For guaranteed clean state, run tests individually or restart binary - return false; // Don't skip -} - -// Static flag to ensure only one rcclSetPxn test runs per execution -static bool s_pxnTestExecuted = false; - -// Helper function to check if PXN test should be skipped due to execution order -static bool ShouldSkipPxnTestDueToExecutionOrder(const std::string &testName) { - if (s_pxnTestExecuted) { - INFO(NCCL_LOG_INFO, - "\n=== IMPORTANT NOTE ===\n" - "Test '%s' is being skipped because another rcclSetPxn test\n" - "has already executed in this run. 
The rcclSetPxn function uses a " - "static\n" - "variable that gets initialized on first call, which affects " - "subsequent tests.\n" - "\nTo run this test properly, execute it individually using:\n" - " --gtest_filter=Rcclwrap.%s\n" - "\nOr run each rcclSetPxn test in separate executions to ensure\n" - "proper static variable initialization.\n" - "========================\n", - testName.c_str(), testName.c_str()); - return true; - } - - // Mark that a PXN test is now executing - s_pxnTestExecuted = true; - return false; + // Environment is clean - proceed with test + // Warning: Static variable might still be initialized from previous tests + // For guaranteed clean state, run tests individually or restart binary + return false; // Don't skip } // Helper function to determine if PXN test should be skipped due to static // variable state -static bool ShouldSkipPxnTest(const char *requiredEnvValue = nullptr) { - const char *envValue = getenv("NCCL_PXN_DISABLE"); +static bool ShouldSkipPxnTest(const char* requiredEnvValue = nullptr) +{ + const char* envValue = getenv("NCCL_PXN_DISABLE"); - // If a specific environment value is required, check for it - if (requiredEnvValue != nullptr) { - if (!envValue || strcmp(envValue, requiredEnvValue) != 0) { - return true; // Skip if env var is not set to required value + // If a specific environment value is required, check for it + if(requiredEnvValue != nullptr) + { + if(!envValue || strcmp(envValue, requiredEnvValue) != 0) + { + return true; // Skip if env var is not set to required value + } + return false; // Don't skip if env var matches required value } - return false; // Don't skip if env var matches required value - } - // For architecture logic tests, skip only if environment variable is set - // (which would override the static variable behavior) - if (envValue != nullptr) { - return true; // Skip if env var is set (prevents testing architecture logic) - } + // For architecture logic tests, skip only if environment 
variable is set + // (which would override the static variable behavior) + if(envValue != nullptr) + { + return true; // Skip if env var is set (prevents testing architecture logic) + } - // Environment is clean - proceed with test - return false; // Don't skip + // Environment is clean - proceed with test + return false; // Don't skip } // Helper function to test the static expose check -ncclResult_t testStaticExposeCheck() { - RCCL_STATIC_EXPOSE_CHECK(); - return ncclSuccess; +ncclResult_t testStaticExposeCheck() +{ + RCCL_STATIC_EXPOSE_CHECK(); + return ncclSuccess; } // Helper function to create and initialize mock communicator -static void CreateMockComm(ncclComm_t &mockComm, - struct ncclTopoSystem &mockTopo, - struct ncclTopoNode &mockGpuNode, const char *arch, - int nRanks) { - // Allocate memory for the communicator - mockComm = new ncclComm(); - memset(mockComm, 0, sizeof(ncclComm)); +static void CreateMockComm( + ncclComm_t& mockComm, + struct ncclTopoSystem& mockTopo, + struct ncclTopoNode& mockGpuNode, + const char* arch, + int nRanks +) +{ + // Allocate memory for the communicator + mockComm = new ncclComm(); + memset(mockComm, 0, sizeof(ncclComm)); - // Initialize basic communicator fields - mockComm->nRanks = nRanks; - mockComm->nNodes = 1; // Default to single node for P2P tests - mockComm->rank = 0; // Default rank + // Initialize basic communicator fields + mockComm->nRanks = nRanks; + mockComm->nNodes = 1; // Default to single node for P2P tests + mockComm->rank = 0; // Default rank - // Initialize topology - memset(&mockTopo, 0, sizeof(mockTopo)); - mockComm->topo = &mockTopo; + // Initialize topology + memset(&mockTopo, 0, sizeof(mockTopo)); + mockComm->topo = &mockTopo; - // Initialize GPU node - mockTopo.nodes[GPU].count = 1; - memset(&mockGpuNode, 0, sizeof(mockGpuNode)); + // Initialize GPU node + mockTopo.nodes[GPU].count = 1; + memset(&mockGpuNode, 0, sizeof(mockGpuNode)); - // Set GPU architecture - strncpy(mockGpuNode.gpu.gcn, arch, 
sizeof(mockGpuNode.gpu.gcn) - 1); - mockGpuNode.gpu.gcn[sizeof(mockGpuNode.gpu.gcn) - 1] = '\0'; + // Set GPU architecture + strncpy(mockGpuNode.gpu.gcn, arch, sizeof(mockGpuNode.gpu.gcn) - 1); + mockGpuNode.gpu.gcn[sizeof(mockGpuNode.gpu.gcn) - 1] = '\0'; - // Copy the node into the topology array - mockTopo.nodes[GPU].nodes[0] = mockGpuNode; + // Copy the node into the topology array + mockTopo.nodes[GPU].nodes[0] = mockGpuNode; - // Initialize other required fields for tests - memset(mockComm->minMaxLLRange, 0, sizeof(mockComm->minMaxLLRange)); + // Initialize other required fields for tests + memset(mockComm->minMaxLLRange, 0, sizeof(mockComm->minMaxLLRange)); } // Helper function to cleanup mock communicator -static void CleanupMockComm(ncclComm_t &mockComm) { - if (mockComm) { - delete mockComm; - mockComm = nullptr; - } +static void CleanupMockComm(ncclComm_t& mockComm) +{ + if(mockComm) + { + delete mockComm; + mockComm = nullptr; + } } // Helper function to determine if rcclSetPipelining test should be skipped -static bool ShouldSkipRcclSetPipeliningTests() { - const char *disable = getenv("RCCL_DISABLE_REDUCE_COPY_PIPELINING"); - // Skip the test if RCCL_DISABLE_REDUCE_COPY_PIPELINING is set - if (disable && strcmp(disable, "0") != 0) { - return true; - } - return false; +static bool ShouldSkipRcclSetPipeliningTests() +{ + const char* disable = getenv("RCCL_DISABLE_REDUCE_COPY_PIPELINING"); + // Skip the test if RCCL_DISABLE_REDUCE_COPY_PIPELINING is set + if(disable && strcmp(disable, "0") != 0) + { + return true; + } + return false; } // Helper function to validate protocol string against known valid protocols -static bool isProtoStrValid(const char *envStr) { - if (!envStr) - return false; - for (int i = 0; i < NCCL_NUM_PROTOCOLS; ++i) { - if (strcasecmp(envStr, ncclProtoStr[i]) == 0) { - return true; // Match found +static bool isProtoStrValid(const char* envStr) +{ + if(!envStr) + return false; + for(int i = 0; i < NCCL_NUM_PROTOCOLS; ++i) + { + 
if(strcasecmp(envStr, ncclProtoStr[i]) == 0) + { + return true; // Match found + } } - } - return false; // No match found + return false; // No match found } // Helper function to validate algorithm string against known valid algorithms -static bool isAlgoStrValid(const char *envStr) { - if (!envStr) - return false; - for (int i = 0; i < NCCL_NUM_ALGORITHMS; ++i) { - if (strcasecmp(envStr, ncclAlgoStr[i]) == 0) { - return true; // Match found +static bool isAlgoStrValid(const char* envStr) +{ + if(!envStr) + return false; + for(int i = 0; i < NCCL_NUM_ALGORITHMS; ++i) + { + if(strcasecmp(envStr, ncclAlgoStr[i]) == 0) + { + return true; // Match found + } } - } - return false; // No match found + return false; // No match found } -TEST(Rcclwrap, RcclFuncMaxSendRecvCount) { - ncclResult_t staticCheckResult = testStaticExposeCheck(); +TEST(Rcclwrap, RcclFuncMaxSendRecvCount) +{ + ncclResult_t staticCheckResult = testStaticExposeCheck(); #ifdef RCCL_EXPOSE_STATIC - EXPECT_EQ(staticCheckResult, ncclSuccess); + EXPECT_EQ(staticCheckResult, ncclSuccess); #else - EXPECT_EQ(staticCheckResult, ncclInvalidUsage); + EXPECT_EQ(staticCheckResult, ncclInvalidUsage); #endif - size_t maxCount = 0; - ncclResult_t result = - rcclFuncMaxSendRecvCount(ncclFuncAllReduce, 4, 1024, maxCount); - EXPECT_EQ(maxCount, 1024); - EXPECT_EQ(result, ncclSuccess); + size_t maxCount = 0; + ncclResult_t result = rcclFuncMaxSendRecvCount(ncclFuncAllReduce, 4, 1024, maxCount); + EXPECT_EQ(maxCount, 1024); + EXPECT_EQ(result, ncclSuccess); } -TEST(Rcclwrap, RcclUpdateCollectiveProtocol_UsesLL128WhenInRange) { - setenv("NCCL_PROTO", "", 1); // Trigger auto selection mode - unsetenv("NCCL_PROTO"); +TEST(Rcclwrap, RcclUpdateCollectiveProtocol_UsesLL128WhenInRange) +{ + setenv("NCCL_PROTO", "", 1); // Trigger auto selection mode + unsetenv("NCCL_PROTO"); - ncclComm_t comm = new ncclComm(); - *comm = {}; - // Manually populate minimal fields for comm - comm->nRanks = 1; - comm->nNodes = 2; // triggers 
inter-node logic - comm->rank = 0; - comm->topo = new ncclTopoSystem(); - *comm->topo = {}; - comm->topo->ll128Enabled = true; - comm->topo->nodes[GPU].nodes[0] = {}; - comm->topo->nodes[GPU].count = 1; - strncpy(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942", - sizeof(comm->topo->nodes[GPU].nodes[0].gpu.gcn)); + ncclComm_t comm = new ncclComm(); + *comm = {}; + // Manually populate minimal fields for comm + comm->nRanks = 1; + comm->nNodes = 2; // triggers inter-node logic + comm->rank = 0; + comm->topo = new ncclTopoSystem(); + *comm->topo = {}; + comm->topo->ll128Enabled = true; + comm->topo->nodes[GPU].nodes[0] = {}; + comm->topo->nodes[GPU].count = 1; + strncpy( + comm->topo->nodes[GPU].nodes[0].gpu.gcn, + "gfx942", + sizeof(comm->topo->nodes[GPU].nodes[0].gpu.gcn) + ); - int idx = rcclGetTunableIndex(ncclFuncAllReduce); - comm->minMaxLLRange[idx][NCCL_PROTO_LL][RCCL_PROTOCOL_MIN_IDX] = 512; - comm->minMaxLLRange[idx][NCCL_PROTO_LL][RCCL_PROTOCOL_MAX_IDX] = 1024; - comm->minMaxLLRange[idx][NCCL_PROTO_LL128][RCCL_PROTOCOL_MIN_IDX] = 256; - comm->minMaxLLRange[idx][NCCL_PROTO_LL128][RCCL_PROTOCOL_MAX_IDX] = 2048; - comm->minMaxLLRange[idx][NCCL_PROTO_LL128][RCCL_PROTOCOL_FACTOR_IDX] = 1; + int idx = rcclGetTunableIndex(ncclFuncAllReduce); + comm->minMaxLLRange[idx][NCCL_PROTO_LL][RCCL_PROTOCOL_MIN_IDX] = 512; + comm->minMaxLLRange[idx][NCCL_PROTO_LL][RCCL_PROTOCOL_MAX_IDX] = 1024; + comm->minMaxLLRange[idx][NCCL_PROTO_LL128][RCCL_PROTOCOL_MIN_IDX] = 256; + comm->minMaxLLRange[idx][NCCL_PROTO_LL128][RCCL_PROTOCOL_MAX_IDX] = 2048; + comm->minMaxLLRange[idx][NCCL_PROTO_LL128][RCCL_PROTOCOL_FACTOR_IDX] = 1; - ncclTaskColl info = {}; - // Manually populate minimal fields for info - info.func = ncclFuncAllReduce; - info.protocol = NCCL_PROTO_UNDEF; + ncclTaskColl info = {}; + // Manually populate minimal fields for info + info.func = ncclFuncAllReduce; + info.protocol = NCCL_PROTO_UNDEF; - size_t nBytes = 1024; + size_t nBytes = 1024; - 
rcclUpdateCollectiveProtocol(comm, nBytes, &info); - EXPECT_TRUE(info.protocol == NCCL_PROTO_LL128 || - info.protocol == NCCL_PROTO_LL); + rcclUpdateCollectiveProtocol(comm, nBytes, &info); + EXPECT_TRUE(info.protocol == NCCL_PROTO_LL128 || info.protocol == NCCL_PROTO_LL); - delete comm->topo; - delete comm; + delete comm->topo; + delete comm; } -TEST(Rcclwrap, RcclUpdateCollectiveProtocol_WarnsOnGfx942Arch) { - setenv("NCCL_PROTO", "", 1); - unsetenv("NCCL_PROTO"); +TEST(Rcclwrap, RcclUpdateCollectiveProtocol_WarnsOnGfx942Arch) +{ + setenv("NCCL_PROTO", "", 1); + unsetenv("NCCL_PROTO"); - ncclComm_t comm = new ncclComm(); - *comm = {}; - // Manually populate minimal fields for comm - comm->nRanks = 1; - comm->nNodes = 2; // triggers inter-node logic - comm->rank = 0; - comm->topo = new ncclTopoSystem(); - comm->topo->ll128Enabled = true; - comm->topo->nodes[GPU].nodes[0] = {}; - strncpy(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942", - sizeof(comm->topo->nodes[GPU].nodes[0].gpu.gcn)); + ncclComm_t comm = new ncclComm(); + *comm = {}; + // Manually populate minimal fields for comm + comm->nRanks = 1; + comm->nNodes = 2; // triggers inter-node logic + comm->rank = 0; + comm->topo = new ncclTopoSystem(); + comm->topo->ll128Enabled = true; + comm->topo->nodes[GPU].nodes[0] = {}; + strncpy( + comm->topo->nodes[GPU].nodes[0].gpu.gcn, + "gfx942", + sizeof(comm->topo->nodes[GPU].nodes[0].gpu.gcn) + ); - int idx = rcclGetTunableIndex(ncclFuncAllReduce); - comm->minMaxLLRange[idx][NCCL_PROTO_LL][RCCL_PROTOCOL_MIN_IDX] = - RCCL_LL_LIMITS_UNDEFINED; - comm->minMaxLLRange[idx][NCCL_PROTO_LL][RCCL_PROTOCOL_MAX_IDX] = - RCCL_LL_LIMITS_UNDEFINED; - comm->minMaxLLRange[idx][NCCL_PROTO_LL128][RCCL_PROTOCOL_MIN_IDX] = - RCCL_LL_LIMITS_UNDEFINED; - comm->minMaxLLRange[idx][NCCL_PROTO_LL128][RCCL_PROTOCOL_MAX_IDX] = - RCCL_LL_LIMITS_UNDEFINED; - comm->minMaxLLRange[idx][NCCL_PROTO_LL128][RCCL_PROTOCOL_FACTOR_IDX] = - RCCL_LL_LIMITS_UNDEFINED; + int idx = 
rcclGetTunableIndex(ncclFuncAllReduce); + comm->minMaxLLRange[idx][NCCL_PROTO_LL][RCCL_PROTOCOL_MIN_IDX] = RCCL_LL_LIMITS_UNDEFINED; + comm->minMaxLLRange[idx][NCCL_PROTO_LL][RCCL_PROTOCOL_MAX_IDX] = RCCL_LL_LIMITS_UNDEFINED; + comm->minMaxLLRange[idx][NCCL_PROTO_LL128][RCCL_PROTOCOL_MIN_IDX] = RCCL_LL_LIMITS_UNDEFINED; + comm->minMaxLLRange[idx][NCCL_PROTO_LL128][RCCL_PROTOCOL_MAX_IDX] = RCCL_LL_LIMITS_UNDEFINED; + comm->minMaxLLRange[idx][NCCL_PROTO_LL128][RCCL_PROTOCOL_FACTOR_IDX] = RCCL_LL_LIMITS_UNDEFINED; - ncclTaskColl info = {}; - // Manually populate minimal fields for info - info.func = ncclFuncAllReduce; - info.protocol = NCCL_PROTO_UNDEF; - size_t nBytes = 1024; // 1024 per rank for 4 ranks + ncclTaskColl info = {}; + // Manually populate minimal fields for info + info.func = ncclFuncAllReduce; + info.protocol = NCCL_PROTO_UNDEF; + size_t nBytes = 1024; // 1024 per rank for 4 ranks - rcclUpdateCollectiveProtocol(comm, nBytes, &info); - EXPECT_EQ(info.protocol, NCCL_PROTO_UNDEF); + rcclUpdateCollectiveProtocol(comm, nBytes, &info); + EXPECT_EQ(info.protocol, NCCL_PROTO_UNDEF); - delete comm->topo; - delete comm; + delete comm->topo; + delete comm; } -TEST(Rcclwrap, - RcclUpdateCollectiveProtocol_HonorsUserProtocolEnv) { // Why does this pass - // if it does not - // enter the else if - // block - setenv("NCCL_PROTO", "1", 1); // Simulate manual override +TEST(Rcclwrap, RcclUpdateCollectiveProtocol_HonorsUserProtocolEnv) +{ // Why does this pass + // if it does not + // enter the else if + // block + setenv("NCCL_PROTO", "1", 1); // Simulate manual override - ncclComm_t comm = new ncclComm(); - *comm = {}; - // Manually populate minimal fields for comm - comm->nRanks = 1; - comm->nNodes = 2; // triggers inter-node logic - comm->rank = 0; - comm->topo = new ncclTopoSystem(); //(struct ncclTopoSystem*)calloc(1, - // sizeof(struct ncclTopoSystem)); - *comm->topo = {}; - comm->topo->ll128Enabled = true; - comm->topo->nodes[GPU].nodes[0] = {}; - 
strncpy(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942", - sizeof(comm->topo->nodes[GPU].nodes[0].gpu.gcn)); + ncclComm_t comm = new ncclComm(); + *comm = {}; + // Manually populate minimal fields for comm + comm->nRanks = 1; + comm->nNodes = 2; // triggers inter-node logic + comm->rank = 0; + comm->topo = new ncclTopoSystem(); //(struct ncclTopoSystem*)calloc(1, + // sizeof(struct ncclTopoSystem)); + *comm->topo = {}; + comm->topo->ll128Enabled = true; + comm->topo->nodes[GPU].nodes[0] = {}; + strncpy( + comm->topo->nodes[GPU].nodes[0].gpu.gcn, + "gfx942", + sizeof(comm->topo->nodes[GPU].nodes[0].gpu.gcn) + ); - ncclTaskColl info = {}; - // Manually populate minimal fields for info - info.func = ncclFuncAllReduce; - info.protocol = NCCL_PROTO_UNDEF; - size_t nBytes = 1024; // 1024 per rank for 4 ranks + ncclTaskColl info = {}; + // Manually populate minimal fields for info + info.func = ncclFuncAllReduce; + info.protocol = NCCL_PROTO_UNDEF; + size_t nBytes = 1024; // 1024 per rank for 4 ranks - rcclUpdateCollectiveProtocol(comm, nBytes, &info); - EXPECT_EQ(info.protocol, NCCL_PROTO_UNDEF); + rcclUpdateCollectiveProtocol(comm, nBytes, &info); + EXPECT_EQ(info.protocol, NCCL_PROTO_UNDEF); - delete comm->topo; - delete comm; + delete comm->topo; + delete comm; } -TEST(Rcclwrap, RcclUpdateCollectiveProtocol_SimpleFallbackWhenNoRanges) { - setenv("NCCL_PROTO", "", 1); // Trigger auto selection mode - unsetenv("NCCL_PROTO"); +TEST(Rcclwrap, RcclUpdateCollectiveProtocol_SimpleFallbackWhenNoRanges) +{ + setenv("NCCL_PROTO", "", 1); // Trigger auto selection mode + unsetenv("NCCL_PROTO"); - ncclComm_t comm = new ncclComm(); - *comm = {}; - // Manually populate minimal fields for comm - comm->nRanks = 1; - comm->nNodes = 2; // triggers inter-node logic - comm->rank = 0; - comm->topo = new ncclTopoSystem(); //(struct ncclTopoSystem*)calloc(1, - // sizeof(struct ncclTopoSystem)); - *comm->topo = {}; - comm->topo->ll128Enabled = true; - comm->topo->nodes[GPU].nodes[0] = {}; 
- comm->topo->nodes[GPU].count = 1; - strncpy(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942", - sizeof(comm->topo->nodes[GPU].nodes[0].gpu.gcn)); + ncclComm_t comm = new ncclComm(); + *comm = {}; + // Manually populate minimal fields for comm + comm->nRanks = 1; + comm->nNodes = 2; // triggers inter-node logic + comm->rank = 0; + comm->topo = new ncclTopoSystem(); //(struct ncclTopoSystem*)calloc(1, + // sizeof(struct ncclTopoSystem)); + *comm->topo = {}; + comm->topo->ll128Enabled = true; + comm->topo->nodes[GPU].nodes[0] = {}; + comm->topo->nodes[GPU].count = 1; + strncpy( + comm->topo->nodes[GPU].nodes[0].gpu.gcn, + "gfx942", + sizeof(comm->topo->nodes[GPU].nodes[0].gpu.gcn) + ); - int idx = rcclGetTunableIndex(ncclFuncAllReduce); - comm->minMaxLLRange[idx][NCCL_PROTO_LL][RCCL_PROTOCOL_MIN_IDX] = 512; - comm->minMaxLLRange[idx][NCCL_PROTO_LL][RCCL_PROTOCOL_MAX_IDX] = 1024; + int idx = rcclGetTunableIndex(ncclFuncAllReduce); + comm->minMaxLLRange[idx][NCCL_PROTO_LL][RCCL_PROTOCOL_MIN_IDX] = 512; + comm->minMaxLLRange[idx][NCCL_PROTO_LL][RCCL_PROTOCOL_MAX_IDX] = 1024; - // Manually populate minimal fields for info - ncclTaskColl info = {}; - info.func = ncclFuncAllReduce; - info.protocol = NCCL_PROTO_UNDEF; - size_t nBytes = 2048; // 1024 per rank for 4 ranks + // Manually populate minimal fields for info + ncclTaskColl info = {}; + info.func = ncclFuncAllReduce; + info.protocol = NCCL_PROTO_UNDEF; + size_t nBytes = 2048; // 1024 per rank for 4 ranks - rcclUpdateCollectiveProtocol(comm, nBytes, &info); - EXPECT_EQ(info.protocol, NCCL_PROTO_SIMPLE); + rcclUpdateCollectiveProtocol(comm, nBytes, &info); + EXPECT_EQ(info.protocol, NCCL_PROTO_SIMPLE); - delete comm->topo; - delete comm; + delete comm->topo; + delete comm; } -TEST(Rcclwrap, validHsaScratchEnvSettingTest) { - // When HSA_NO_SCRATCH_RECLAIM is set, it is always valid - EXPECT_TRUE(validHsaScratchEnvSetting("1", 0, 0, "gfx950")); +TEST(Rcclwrap, validHsaScratchEnvSettingTest) +{ + // When 
HSA_NO_SCRATCH_RECLAIM is set, it is always valid + EXPECT_TRUE(validHsaScratchEnvSetting("1", 0, 0, "gfx950")); - EXPECT_TRUE(validHsaScratchEnvSetting("1", 0, 0, "gfx942")); + EXPECT_TRUE(validHsaScratchEnvSetting("1", 0, 0, "gfx942")); - // When HSA_NO_SCRATCH_RECLAIM is not set, looking at hip version and firmware - // version - EXPECT_TRUE(validHsaScratchEnvSetting(nullptr, 60443484, 24, "gfx950")); + // When HSA_NO_SCRATCH_RECLAIM is not set, looking at hip version and firmware + // version + EXPECT_TRUE(validHsaScratchEnvSetting(nullptr, 60443484, 24, "gfx950")); - EXPECT_FALSE(validHsaScratchEnvSetting(nullptr, 60443483, 24, "gfx950")); + EXPECT_FALSE(validHsaScratchEnvSetting(nullptr, 60443483, 24, "gfx950")); - EXPECT_FALSE(validHsaScratchEnvSetting(nullptr, 60443484, 23, "gfx950")); + EXPECT_FALSE(validHsaScratchEnvSetting(nullptr, 60443484, 23, "gfx950")); - EXPECT_TRUE(validHsaScratchEnvSetting(nullptr, 60443484, 177, "gfx942")); + EXPECT_TRUE(validHsaScratchEnvSetting(nullptr, 60443484, 177, "gfx942")); - EXPECT_FALSE(validHsaScratchEnvSetting(nullptr, 60443484, 176, "gfx942")); + EXPECT_FALSE(validHsaScratchEnvSetting(nullptr, 60443484, 176, "gfx942")); - EXPECT_FALSE(validHsaScratchEnvSetting(nullptr, 60443483, 177, "gfx942")); + EXPECT_FALSE(validHsaScratchEnvSetting(nullptr, 60443483, 177, "gfx942")); - EXPECT_TRUE(validHsaScratchEnvSetting(nullptr, 60443483, 0, "gfx000")); + EXPECT_TRUE(validHsaScratchEnvSetting(nullptr, 60443483, 0, "gfx000")); - EXPECT_TRUE(validHsaScratchEnvSetting(nullptr, 60300000, 0, "gfx000")); + EXPECT_TRUE(validHsaScratchEnvSetting(nullptr, 60300000, 0, "gfx000")); } -TEST(Rcclwrap, RcclUpdateThreadThreshold_UserEnvSet) { - const char *value = getenv("NCCL_THREAD_THRESHOLDS"); +TEST(Rcclwrap, RcclUpdateThreadThreshold_UserEnvSet) +{ + RUN_ISOLATED_TEST_WITH_ENV( + "RcclUpdateThreadThreshold_UserEnvSet", + []() + { + const char* value = getenv("NCCL_THREAD_THRESHOLDS"); - if (!value) { - INFO(NCCL_LOG_INFO, "[Rcclwrap] 
Test skipped. Set environment variable " - "NCCL_THREAD_THRESHOLD"); - GTEST_SKIP() << "[Rcclwrap] Test skipped. Set environment variable " - "NCCL_THREAD_THRESHOLD\n"; - } else { - ncclComm comm = {.nRanks = 8, .nNodes = 4}; - ncclTaskColl info = {.func = ncclFuncReduceScatter, .protocol = 0}; + if(!value) + { + INFO( + NCCL_LOG_INFO, + "[Rcclwrap] Test skipped. Set environment variable " + "NCCL_THREAD_THRESHOLD" + ); + GTEST_SKIP() << "[Rcclwrap] Test skipped. Set environment variable " + "NCCL_THREAD_THRESHOLD\n"; + } + else + { + ncclComm comm; + comm.nRanks = 8; + comm.nNodes = 4; + memset(comm.minMaxLLRange, 0, sizeof(comm.minMaxLLRange)); + + ncclTaskColl info; + info.func = ncclFuncReduceScatter; + info.protocol = 0; + + int threadThreshold = 5; // Any number should do, we should make + // sure this number does not change + rcclUpdateThreadThreshold(&comm, 0, &info, threadThreshold); + + EXPECT_EQ(threadThreshold, 5); + } + }, + {{"NCCL_THREAD_THRESHOLDS", "1"}} + ); +} + +TEST(Rcclwrap, RcclUpdateThreadThreshold_MinNChannelsSet) +{ + RUN_ISOLATED_TEST_WITH_ENV( + "RcclUpdateThreadThreshold_MinNChannelsSet", + []() + { + const char* value = getenv("NCCL_MIN_NCHANNELS"); + if(!value) + { + INFO( + NCCL_LOG_INFO, + "[Rcclwrap] Test skipped. Set environment " + "variable NCCL_MIN_NCHANNELS" + ); + GTEST_SKIP() << "[Rcclwrap] Test skipped. 
Set environment variable " + "NCCL_MIN_NCHANNELS\n"; + } + else + { + ncclComm comm{}; + ncclTaskColl info{}; + int threadThreshold = 5; + + comm.nRanks = 4; + comm.nNodes = 4; + info.func = ncclFuncAllGather; + info.protocol = 0; + memset(comm.minMaxLLRange, 0, sizeof(comm.minMaxLLRange)); + + rcclUpdateThreadThreshold(&comm, 0, &info, threadThreshold); + + EXPECT_EQ(threadThreshold, 5); + } + }, + {{"NCCL_MIN_NCHANNELS", "1"}} + ); +} + +TEST(Rcclwrap, RcclUpdateThreadThreshold_MaxChannelsSet) +{ + RUN_ISOLATED_TEST_WITH_ENV( + "RcclUpdateThreadThreshold_MaxChannelsSet", + []() + { + const char* value = getenv("NCCL_MAX_NCHANNELS"); + if(!value) + { + INFO( + NCCL_LOG_INFO, + "[Rcclwrap] Test skipped. Set environment " + "variable NCCL_MAX_NCHANNELS" + ); + GTEST_SKIP() << "[Rcclwrap] Test skipped. Set environment variable " + "NCCL_MAX_NCHANNELS\n"; + } + else + { + ncclComm comm{}; + ncclTaskColl info{}; + int threadThreshold = 5; + + comm.nRanks = 4; + comm.nNodes = 4; + info.func = ncclFuncAllGather; + info.protocol = 0; + memset(comm.minMaxLLRange, 0, sizeof(comm.minMaxLLRange)); + + rcclUpdateThreadThreshold(&comm, 0, &info, threadThreshold); + + EXPECT_EQ(threadThreshold, 5); + } + }, + {{"NCCL_MAX_NCHANNELS", "1"}} + ); +} + +TEST(Rcclwrap, RcclUpdateThreadThreshold_NoEnv_nNodesLessThan2) +{ + ncclComm comm{}; + ncclTaskColl info{}; + int threadThreshold = 5; + + comm.nRanks = 4; + comm.nNodes = 1; // less than 2 + info.func = ncclFuncReduceScatter; + info.protocol = 0; memset(comm.minMaxLLRange, 0, sizeof(comm.minMaxLLRange)); - int threadThreshold = 5; // Any number should do, we should make sure this - // number does not change rcclUpdateThreadThreshold(&comm, 0, &info, threadThreshold); - EXPECT_EQ(threadThreshold, 5); - } + EXPECT_EQ(threadThreshold, 5); // no change } -TEST(Rcclwrap, RcclUpdateThreadThreshold_MinNChannelsSet) { - const char *value = getenv("NCCL_MIN_NCHANNELS"); - if (!value) { - INFO( - NCCL_LOG_INFO, - "[Rcclwrap] Test skipped. 
Set environment variable NCCL_MIN_NCHANNELS"); - GTEST_SKIP() << "[Rcclwrap] Test skipped. Set environment variable " - "NCCL_MIN_NCHANNELS\n"; - } else { - ncclComm comm{}; +TEST(Rcclwrap, RcclUpdateThreadThreshold_NoEnv_FuncUnsupported) +{ + ncclComm comm{}; ncclTaskColl info{}; - int threadThreshold = 5; + int threadThreshold = 5; - comm.nRanks = 4; - comm.nNodes = 4; - info.func = ncclFuncAllGather; + comm.nRanks = 4; + comm.nNodes = 2; + info.func = ncclFuncAllReduce; // unsupported func info.protocol = 0; memset(comm.minMaxLLRange, 0, sizeof(comm.minMaxLLRange)); rcclUpdateThreadThreshold(&comm, 0, &info, threadThreshold); EXPECT_EQ(threadThreshold, 5); - } } -TEST(Rcclwrap, RcclUpdateThreadThreshold_MNChannelsSet) { - const char *value = getenv("NCCL_MAX_NCHANNELS"); - if (!value) { - INFO( - NCCL_LOG_INFO, - "[Rcclwrap] Test skipped. Set environment variable NCCL_MAX_NCHANNELS"); - GTEST_SKIP() << "[Rcclwrap] Test skipped. Set environment variable " - "NCCL_MAX_NCHANNELS\n"; - } else { - ncclComm comm{}; +TEST(Rcclwrap, RcclUpdateThreadThreshold_NoEnv_UpdateOccurs) +{ + ncclComm comm{}; ncclTaskColl info{}; - int threadThreshold = 5; + int threadThreshold = 5; - comm.nRanks = 4; - comm.nNodes = 4; - info.func = ncclFuncAllGather; + comm.nRanks = 4; + comm.nNodes = 2; + info.func = ncclFuncReduceScatter; info.protocol = 0; memset(comm.minMaxLLRange, 0, sizeof(comm.minMaxLLRange)); + int idx = rcclGetTunableIndex(info.func); + comm.minMaxLLRange[idx][info.protocol][RCCL_PROTOCOL_THREAD_THRESHOLD_IDX] = 10; + + rcclUpdateThreadThreshold(&comm, 0, &info, threadThreshold); + + EXPECT_EQ(threadThreshold, 40); // 10 * 4 +} + +TEST(Rcclwrap, RcclUpdateThreadThreshold_NoEnv_ThresholdUndefined) +{ + ncclComm comm{}; + ncclTaskColl info{}; + int threadThreshold = 5; + + comm.nRanks = 4; + comm.nNodes = 3; + info.func = ncclFuncAllGather; + info.protocol = 0; + memset(comm.minMaxLLRange, 0, sizeof(comm.minMaxLLRange)); + + int idx = rcclGetTunableIndex(info.func); + 
comm.minMaxLLRange[idx][info.protocol][RCCL_PROTOCOL_THREAD_THRESHOLD_IDX] + = RCCL_LL_LIMITS_UNDEFINED; + rcclUpdateThreadThreshold(&comm, 0, &info, threadThreshold); EXPECT_EQ(threadThreshold, 5); - } } -TEST(Rcclwrap, RcclUpdateThreadThreshold_NoEnv_nNodesLessThan2) { - ncclComm comm{}; - ncclTaskColl info{}; - int threadThreshold = 5; +TEST(Rcclwrap, RcclSetPipelining_Invalid_DType) +{ + // Skip the test if pipelining has been disabled + // (RCCL_DISABLE_REDUCE_COPY_PIPELINING=1) + if(ShouldSkipRcclSetPipeliningTests()) + { + GTEST_SKIP() << "Skipping test: RCCL_DISABLE_REDUCE_COPY_PIPELINING environment " + "variable is set. Unset this variable to enable pipelining."; + } - comm.nRanks = 4; - comm.nNodes = 1; // less than 2 - info.func = ncclFuncReduceScatter; - info.protocol = 0; - memset(comm.minMaxLLRange, 0, sizeof(comm.minMaxLLRange)); + // Skip the test if pipelining has been enabled for all data types + // (RCCL_PIPELINE_ALL_DATA_TYPES=1) + const char* allowAllDTypes = getenv("RCCL_PIPELINE_ALL_DATA_TYPES"); + if(allowAllDTypes && strcmp(allowAllDTypes, "0") != 0) + { + GTEST_SKIP() << "Skipping test: RCCL_PIPELINE_ALL_DATA_TYPES environment " + "variable is set. 
Unset this variable to enable pipelining " + "only for bf16 data type."; + } - rcclUpdateThreadThreshold(&comm, 0, &info, threadThreshold); + // Pipeline should not be set for non-bf16 datatypes, unless + // rcclParamPipelineAllDTypes() returns true + ncclComm_t comm = nullptr; + struct ncclTopoSystem topo; + struct ncclTopoNode gpu; + CreateMockComm(comm, topo, gpu, "gfx950", 8); + comm->nNodes = 2; // Multi node - EXPECT_EQ(threadThreshold, 5); // no change + ncclTaskColl info = {}; + info.func = ncclFuncAllReduce; + info.datatype = ncclFloat32; + + size_t nBytes = 16 * 1024 * 1024; // 16MB + rcclSetPipelining(comm, nBytes, &info); + + EXPECT_EQ(info.pipeline, 0) << "Non-bf16 should not set pipeline by default"; + + CleanupMockComm(comm); } -TEST(Rcclwrap, RcclUpdateThreadThreshold_NoEnv_FuncUnsupported) { - ncclComm comm{}; - ncclTaskColl info{}; - int threadThreshold = 5; +TEST(Rcclwrap, RcclSetPipelining_GFX950_SingleNode_Disable) +{ + // Skip the test if pipelining has been disabled + // (RCCL_DISABLE_REDUCE_COPY_PIPELINING=1) + if(ShouldSkipRcclSetPipeliningTests()) + { + GTEST_SKIP() << "Skipping test: RCCL_DISABLE_REDUCE_COPY_PIPELINING environment " + "variable is set. Unset this variable to enable pipelining."; + } - comm.nRanks = 4; - comm.nNodes = 2; - info.func = ncclFuncAllReduce; // unsupported func - info.protocol = 0; - memset(comm.minMaxLLRange, 0, sizeof(comm.minMaxLLRange)); + // For single-node, pipeline remains 0 + ncclComm_t comm = nullptr; + struct ncclTopoSystem topo; + struct ncclTopoNode gpu; + CreateMockComm(comm, topo, gpu, "gfx950", 8); + comm->nNodes = 1; // Single node - rcclUpdateThreadThreshold(&comm, 0, &info, threadThreshold); + ncclTaskColl info = {}; + // In rcclSetPipelining(), ncclFuncAllReduce, ncclFuncReduceScatter, and + // ncclFuncReduce share the same case body. Testing any one of them is + // sufficient to validate that code path. 
+ info.func = ncclFuncAllReduce; + info.datatype = ncclBfloat16; - EXPECT_EQ(threadThreshold, 5); + size_t nBytes = 16 * 1024 * 1024; // 16MB + rcclSetPipelining(comm, nBytes, &info); + + EXPECT_EQ(info.pipeline, 0) << "gfx950 single-node should not enable pipelining"; + + CleanupMockComm(comm); } -TEST(Rcclwrap, RcclUpdateThreadThreshold_NoEnv_UpdateOccurs) { - ncclComm comm{}; - ncclTaskColl info{}; - int threadThreshold = 5; +TEST(Rcclwrap, RcclSetPipelining_GFX942_SingleNode_AllReduce_Enable) +{ + // Skip the test if pipelining has been disabled + // (RCCL_DISABLE_REDUCE_COPY_PIPELINING=1) + if(ShouldSkipRcclSetPipeliningTests()) + { + GTEST_SKIP() << "Skipping test: RCCL_DISABLE_REDUCE_COPY_PIPELINING environment " + "variable is set. Unset this variable to enable pipelining."; + } - comm.nRanks = 4; - comm.nNodes = 2; - info.func = ncclFuncReduceScatter; - info.protocol = 0; - memset(comm.minMaxLLRange, 0, sizeof(comm.minMaxLLRange)); + // For single-node, pipeline is set to 1 for AllReduce with bf16 + ncclComm_t comm = nullptr; + struct ncclTopoSystem topo; + struct ncclTopoNode gpu; + CreateMockComm(comm, topo, gpu, "gfx942", 8); + comm->nNodes = 1; // Single node - int idx = rcclGetTunableIndex(info.func); - comm.minMaxLLRange[idx][info.protocol][RCCL_PROTOCOL_THREAD_THRESHOLD_IDX] = - 10; + ncclTaskColl info = {}; + info.func = ncclFuncAllReduce; + info.datatype = ncclBfloat16; - rcclUpdateThreadThreshold(&comm, 0, &info, threadThreshold); + size_t nBytes = 16 * 1024 * 1024; // 16MB + rcclSetPipelining(comm, nBytes, &info); - EXPECT_EQ(threadThreshold, 40); // 10 * 4 + EXPECT_EQ(info.pipeline, 1) << "gfx942 single-node AllReduce bf16 should enable pipelining"; + + CleanupMockComm(comm); } -TEST(Rcclwrap, RcclUpdateThreadThreshold_NoEnv_ThresholdUndefined) { - ncclComm comm{}; - ncclTaskColl info{}; - int threadThreshold = 5; +TEST(Rcclwrap, RcclSetPipelining_GFX942_MultiNode_AllReduce_Enable) +{ + // Skip the test if pipelining has been disabled + // 
(RCCL_DISABLE_REDUCE_COPY_PIPELINING=1) + if(ShouldSkipRcclSetPipeliningTests()) + { + GTEST_SKIP() << "Skipping test: RCCL_DISABLE_REDUCE_COPY_PIPELINING environment " + "variable is set. Unset this variable to enable pipelining."; + } - comm.nRanks = 4; - comm.nNodes = 3; - info.func = ncclFuncAllGather; - info.protocol = 0; - memset(comm.minMaxLLRange, 0, sizeof(comm.minMaxLLRange)); + // For multi-node AllReduce with bf16, pipelining is enabled if + // nBytes <= 512MB * 2^(log2(nNodes)-1) + // Testing with nNodes = 4 => threshold = 512MB * 2^(2-1) = 1GB + ncclComm_t comm = nullptr; + struct ncclTopoSystem topo; + struct ncclTopoNode gpu; + CreateMockComm(comm, topo, gpu, "gfx942", 8); + comm->nNodes = 4; - int idx = rcclGetTunableIndex(info.func); - comm.minMaxLLRange[idx][info.protocol][RCCL_PROTOCOL_THREAD_THRESHOLD_IDX] = - RCCL_LL_LIMITS_UNDEFINED; + ncclTaskColl info = {}; + info.func = ncclFuncAllReduce; + info.datatype = ncclBfloat16; - rcclUpdateThreadThreshold(&comm, 0, &info, threadThreshold); + size_t nBytes = (1ULL << 30); // 1GB, exactly at threshold + rcclSetPipelining(comm, nBytes, &info); - EXPECT_EQ(threadThreshold, 5); + EXPECT_EQ(info.pipeline, 1) << "gfx942 4-node AllReduce at threshold should enable pipelining"; + + CleanupMockComm(comm); } -TEST(Rcclwrap, GFX942_SmallRanks) { - // Check execution order first - if (ShouldSkipP2pTestDueToExecutionOrder("GFX942_SmallRanks")) { - GTEST_SKIP() << "Skipping due to execution order - another " - "rcclSetP2pNetChunkSize test already ran"; - } +TEST(Rcclwrap, RcclSetPipelining_GFX942_MultiNode_AllReduce_Disable) +{ + // Skip the test if pipelining has been disabled + // (RCCL_DISABLE_REDUCE_COPY_PIPELINING=1) + if(ShouldSkipRcclSetPipeliningTests()) + { + GTEST_SKIP() << "Skipping test: RCCL_DISABLE_REDUCE_COPY_PIPELINING environment " + "variable is set. 
Unset this variable to enable pipelining."; + } - // Check if we should skip this test due to environment variable being set - if (ShouldSkipP2pTest()) { - GTEST_SKIP() - << "Skipping test: NCCL_P2P_NET_CHUNKSIZE environment variable is set, " - << "which would override the static variable behavior. " - << "This test requires clean environment to test architecture logic."; - } + // When nBytes is just above the threshold, pipelining should be disabled + ncclComm_t comm = nullptr; + struct ncclTopoSystem topo; + struct ncclTopoNode gpu; + CreateMockComm(comm, topo, gpu, "gfx942", 8); + comm->nNodes = 4; - INFO(NCCL_LOG_INFO, - "Testing rcclSetP2pNetChunkSize for GFX942 with small ranks"); + ncclTaskColl info = {}; + info.func = ncclFuncAllReduce; + info.datatype = ncclBfloat16; - ncclComm_t mockComm = nullptr; - struct ncclTopoSystem mockTopo; - struct ncclTopoNode mockGpuNode; - CreateMockComm(mockComm, mockTopo, mockGpuNode, "gfx942", 32); + size_t nBytes = (1ULL << 30) + 1024; // 1GB + 1KB, just above threshold + rcclSetPipelining(comm, nBytes, &info); - int chunkSize = RCCL_VALUE_UNSET; - rcclSetP2pNetChunkSize(mockComm, chunkSize); + EXPECT_EQ(info.pipeline, 0) + << "gfx942 4-node AllReduce above threshold should disable pipelining"; - // Expected: 1 << 17 = 131072 for ranks < 64 - EXPECT_EQ(chunkSize, 1 << 17) - << "GFX942 with ranks < 64 should set chunk size to 131072"; - - INFO(NCCL_LOG_INFO, "GFX942 small ranks test completed - chunk size: %d", - chunkSize); - - CleanupMockComm(mockComm); + CleanupMockComm(comm); } -TEST(Rcclwrap, GFX942_LargeRanks) { - // Check execution order first - if (ShouldSkipP2pTestDueToExecutionOrder("GFX942_LargeRanks")) { - GTEST_SKIP() << "Skipping due to execution order - another " - "rcclSetP2pNetChunkSize test already ran"; - } +TEST(Rcclwrap, RcclSetPipelining_GFX942_Enable) +{ + // Skip the test if pipelining has been disabled + // (RCCL_DISABLE_REDUCE_COPY_PIPELINING=1) + if(ShouldSkipRcclSetPipeliningTests()) + { + 
GTEST_SKIP() << "Skipping test: RCCL_DISABLE_REDUCE_COPY_PIPELINING environment " + "variable is set. Unset this variable to enable pipelining."; + } - // Check if we should skip this test due to environment variable being set - if (ShouldSkipP2pTest()) { - GTEST_SKIP() - << "Skipping test: NCCL_P2P_NET_CHUNKSIZE environment variable is set, " - << "which would override the static variable behavior. " - << "This test requires clean environment to test architecture logic."; - } + // ReduceScatter & Reduce should enable pipelining regardless of no. of nodes + ncclComm_t comm = nullptr; + struct ncclTopoSystem topo; + struct ncclTopoNode gpu; + CreateMockComm(comm, topo, gpu, "gfx942", 8); + comm->nNodes = 8; - INFO(NCCL_LOG_INFO, - "Testing rcclSetP2pNetChunkSize for GFX942 with large ranks"); + ncclTaskColl info = {}; + // In rcclSetPipelining(), ncclFuncReduceScatter, and + // ncclFuncReduce share the same case body. Testing any one of them is + // sufficient to validate that code path. 
+ info.func = ncclFuncReduceScatter; + info.datatype = ncclBfloat16; - ncclComm_t mockComm = nullptr; - struct ncclTopoSystem mockTopo; - struct ncclTopoNode mockGpuNode; - CreateMockComm(mockComm, mockTopo, mockGpuNode, "gfx942", 128); + size_t nBytes = 16 * 1024 * 1024; // 16MB + rcclSetPipelining(comm, nBytes, &info); - int chunkSize = RCCL_VALUE_UNSET; - rcclSetP2pNetChunkSize(mockComm, chunkSize); + EXPECT_EQ(info.pipeline, 1) << "gfx942 ReduceScatter and Reduce should enable " + "pipelining with single or multi-node"; - // Expected: 1 << 19 = 524288 for ranks >= 64 - EXPECT_EQ(chunkSize, 1 << 19) - << "GFX942 with ranks >= 64 should set chunk size to 524288"; - - INFO(NCCL_LOG_INFO, "GFX942 large ranks test completed - chunk size: %d", - chunkSize); - - CleanupMockComm(mockComm); + CleanupMockComm(comm); } -TEST(Rcclwrap, GFX942_BoundaryRank64) { - // Check execution order first - if (ShouldSkipP2pTestDueToExecutionOrder("GFX942_BoundaryRank64")) { - GTEST_SKIP() << "Skipping due to execution order - another " - "rcclSetP2pNetChunkSize test already ran"; - } +TEST(Rcclwrap, RcclOverrideProtocol_NoOverride) +{ + RUN_ISOLATED_TEST_WITH_ENV("RcclOverrideProtocol_NoOverride", + []() { + float table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + ncclTaskColl info; - // Check if we should skip this test due to environment variable being set - if (ShouldSkipP2pTest()) { - GTEST_SKIP() - << "Skipping test: NCCL_P2P_NET_CHUNKSIZE environment variable is set, " - << "which would override the static variable behavior. 
" - << "This test requires clean environment to test architecture logic."; - } + ncclResult_t result = rcclOverrideProtocol(ncclProtoStr, table, &info); - INFO(NCCL_LOG_INFO, - "Testing rcclSetP2pNetChunkSize for GFX942 with boundary rank 64"); - - ncclComm_t mockComm = nullptr; - struct ncclTopoSystem mockTopo; - struct ncclTopoNode mockGpuNode; - CreateMockComm(mockComm, mockTopo, mockGpuNode, "gfx942", 64); - - int chunkSize = RCCL_VALUE_UNSET; - rcclSetP2pNetChunkSize(mockComm, chunkSize); - - // Expected: 1 << 19 = 524288 for ranks >= 64 - EXPECT_EQ(chunkSize, 1 << 19) - << "GFX942 with ranks = 64 should set chunk size to 524288"; - - INFO(NCCL_LOG_INFO, "GFX942 boundary rank 64 test completed - chunk size: %d", - chunkSize); - - CleanupMockComm(mockComm); + EXPECT_EQ(result, ncclSuccess) + << "Expected ncclSuccess when RCCL_OVERRIDE_PROTO is unset, indicating " + "no override should be applied."; + }, + {} + ); } -TEST(Rcclwrap, GFX942_BoundaryRank63) { - // Check execution order first - if (ShouldSkipP2pTestDueToExecutionOrder("GFX942_BoundaryRank63")) { - GTEST_SKIP() << "Skipping due to execution order - another " - "rcclSetP2pNetChunkSize test already ran"; - } +TEST(Rcclwrap, RcclOverrideProtocol_UnsupportedOverride) +{ + RUN_ISOLATED_TEST_WITH_ENV("RcclOverrideProtocol_UnsupportedOverride", + []() { + // Mark all combinations as unsupported for the purpose of this test. + float table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + for(int a = 0; a < NCCL_NUM_ALGORITHMS; ++a) + for(int p = 0; p < NCCL_NUM_PROTOCOLS; ++p) + table[a][p] = NCCL_ALGO_PROTO_IGNORE; - // Check if we should skip this test due to environment variable being set - if (ShouldSkipP2pTest()) { - GTEST_SKIP() - << "Skipping test: NCCL_P2P_NET_CHUNKSIZE environment variable is set, " - << "which would override the static variable behavior. 
" - << "This test requires clean environment to test architecture logic."; - } + ncclTaskColl info; + info.func = ncclFuncReduceScatter; + info.datatype = ncclBfloat16; + info.algorithm = NCCL_ALGO_RING; - INFO(NCCL_LOG_INFO, - "Testing rcclSetP2pNetChunkSize for GFX942 with boundary rank 63"); + ncclResult_t result = rcclOverrideProtocol(ncclProtoStr, table, &info); - ncclComm_t mockComm = nullptr; - struct ncclTopoSystem mockTopo; - struct ncclTopoNode mockGpuNode; - CreateMockComm(mockComm, mockTopo, mockGpuNode, "gfx942", 63); - - int chunkSize = RCCL_VALUE_UNSET; - rcclSetP2pNetChunkSize(mockComm, chunkSize); - - // Expected: 1 << 17 = 131072 for ranks < 64 - EXPECT_EQ(chunkSize, 1 << 17) - << "GFX942 with ranks = 63 should set chunk size to 131072"; - - INFO(NCCL_LOG_INFO, "GFX942 boundary rank 63 test completed - chunk size: %d", - chunkSize); - - CleanupMockComm(mockComm); + EXPECT_EQ(result, ncclInternalError) + << "Expected ncclInternalError when the override protocol is valid, but " + "not enabled for the selected algorithm."; + }, + {{"RCCL_OVERRIDE_PROTO", "Simple"}} + ); } -TEST(Rcclwrap, GFX950_SmallRanks) { - // Check execution order first - if (ShouldSkipP2pTestDueToExecutionOrder("GFX950_SmallRanks")) { - GTEST_SKIP() << "Skipping due to execution order - another " - "rcclSetP2pNetChunkSize test already ran"; - } +TEST(Rcclwrap, RcclOverrideProtocol_ValidOverride) +{ + RUN_ISOLATED_TEST_WITH_ENV("RcclOverrideProtocol_ValidOverride", + []() { + const char* protoOverrideEnv = getenv("RCCL_OVERRIDE_PROTO"); + ASSERT_NE(protoOverrideEnv, nullptr) << "RCCL_OVERRIDE_PROTO should be set"; - // Check if we should skip this test due to environment variable being set - if (ShouldSkipP2pTest()) { - GTEST_SKIP() - << "Skipping test: NCCL_P2P_NET_CHUNKSIZE environment variable is set, " - << "which would override the static variable behavior. 
" - << "This test requires clean environment to test architecture logic."; - } + // Get the index of the protocol from the string for later comparison + int protoIndex = NCCL_PROTO_UNDEF; + ncclResult_t idxResult + = rcclGetAlgoProtoIndex(protoOverrideEnv, ncclProtoStr, NCCL_NUM_PROTOCOLS, protoIndex); + ASSERT_EQ(idxResult, ncclSuccess) << "Failed to get protocol index from string"; - INFO(NCCL_LOG_INFO, - "Testing rcclSetP2pNetChunkSize for GFX950 with small ranks"); + // Mark all combinations as valid for the purpose of this test. + float table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + for(int a = 0; a < NCCL_NUM_ALGORITHMS; ++a) + for(int p = 0; p < NCCL_NUM_PROTOCOLS; ++p) + table[a][p] = 0.0; - ncclComm_t mockComm = nullptr; - struct ncclTopoSystem mockTopo; - struct ncclTopoNode mockGpuNode; - CreateMockComm(mockComm, mockTopo, mockGpuNode, "gfx950", 8); + ncclTaskColl info; + info.func = ncclFuncAllReduce; + info.datatype = ncclBfloat16; + info.algorithm = NCCL_ALGO_RING; + info.protocol = NCCL_PROTO_UNDEF; - int chunkSize = RCCL_VALUE_UNSET; - rcclSetP2pNetChunkSize(mockComm, chunkSize); + ncclResult_t result = rcclOverrideProtocol(ncclProtoStr, table, &info); - // Expected: 1 << 17 = 131072 for ranks < 16 - EXPECT_EQ(chunkSize, 1 << 17) - << "GFX950 with ranks < 16 should set chunk size to 131072"; - - INFO(NCCL_LOG_INFO, "GFX950 small ranks test completed - chunk size: %d", - chunkSize); - - CleanupMockComm(mockComm); + EXPECT_EQ(result, ncclSuccess) << "Expected ncclSuccess when override is applied successfully."; + EXPECT_EQ(info.protocol, protoIndex) << "Protocol index should match the " + "override value from environment."; + }, + {{"RCCL_OVERRIDE_PROTO", "Simple"}} + ); } -TEST(Rcclwrap, GFX950_MediumRanks) { - // Check execution order first - if (ShouldSkipP2pTestDueToExecutionOrder("GFX950_MediumRanks")) { - GTEST_SKIP() << "Skipping due to execution order - another " - "rcclSetP2pNetChunkSize test already ran"; - } +TEST(Rcclwrap, 
RcclOverrideProtocol_ValidOverridePersists) +{ + RUN_ISOLATED_TEST_WITH_ENV("RcclOverrideProtocol_ValidOverridePersists", + []() { + const char* protoOverrideEnv = getenv("RCCL_OVERRIDE_PROTO"); + ASSERT_NE(protoOverrideEnv, nullptr) << "RCCL_OVERRIDE_PROTO should be set"; - // Check if we should skip this test due to environment variable being set - if (ShouldSkipP2pTest()) { - GTEST_SKIP() - << "Skipping test: NCCL_P2P_NET_CHUNKSIZE environment variable is set, " - << "which would override the static variable behavior. " - << "This test requires clean environment to test architecture logic."; - } + // Get the index of the protocol from the string for later comparison + int protoIndex = NCCL_PROTO_UNDEF; + ncclResult_t idxResult + = rcclGetAlgoProtoIndex(protoOverrideEnv, ncclProtoStr, NCCL_NUM_PROTOCOLS, protoIndex); + ASSERT_EQ(idxResult, ncclSuccess) << "Failed to get protocol index from string"; - INFO(NCCL_LOG_INFO, - "Testing rcclSetP2pNetChunkSize for GFX950 with medium ranks"); + // Mark all combinations as valid for the purpose of this test. 
+ float table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + for(int a = 0; a < NCCL_NUM_ALGORITHMS; ++a) + for(int p = 0; p < NCCL_NUM_PROTOCOLS; ++p) + table[a][p] = 0.0; - ncclComm_t mockComm = nullptr; - struct ncclTopoSystem mockTopo; - struct ncclTopoNode mockGpuNode; - CreateMockComm(mockComm, mockTopo, mockGpuNode, "gfx950", 24); + ncclTaskColl info; + info.func = ncclFuncAllReduce; + info.datatype = ncclFloat16; + info.algorithm = NCCL_ALGO_RING; + info.protocol = NCCL_PROTO_UNDEF; - int chunkSize = RCCL_VALUE_UNSET; - rcclSetP2pNetChunkSize(mockComm, chunkSize); + // First call + ncclResult_t result1 = rcclOverrideProtocol(ncclProtoStr, table, &info); + EXPECT_EQ(result1, ncclSuccess) + << "Expected rcclOverrideProtocol to succeed with valid override"; + EXPECT_EQ(info.protocol, protoIndex) << "Expected protocol to match override after first call"; - // Expected: 1 << 18 = 262144 for 16 <= ranks < 32 - EXPECT_EQ(chunkSize, 1 << 18) - << "GFX950 with 16 <= ranks < 32 should set chunk size to 262144"; - - INFO(NCCL_LOG_INFO, "GFX950 medium ranks test completed - chunk size: %d", - chunkSize); - - CleanupMockComm(mockComm); + // Second call + ncclResult_t result2 = rcclOverrideProtocol(ncclProtoStr, table, &info); + EXPECT_EQ(result2, ncclSuccess) + << "Expected rcclOverrideProtocol to succeed again on second call"; + EXPECT_EQ(info.protocol, protoIndex) << "Expected protocol to match override after second call"; + }, + {{"RCCL_OVERRIDE_PROTO", "Simple"}} + ); } -TEST(Rcclwrap, GFX950_LargeRanks) { - // Check execution order first - if (ShouldSkipP2pTestDueToExecutionOrder("GFX950_LargeRanks")) { - GTEST_SKIP() << "Skipping due to execution order - another " - "rcclSetP2pNetChunkSize test already ran"; - } +TEST(Rcclwrap, RcclOverrideProtocol_InvalidProtocol) +{ + RUN_ISOLATED_TEST_WITH_ENV("RcclOverrideProtocol_InvalidProtocol", + []() { + float table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + ncclTaskColl info; - // Check if we should skip this test due to 
environment variable being set - if (ShouldSkipP2pTest()) { - GTEST_SKIP() - << "Skipping test: NCCL_P2P_NET_CHUNKSIZE environment variable is set, " - << "which would override the static variable behavior. " - << "This test requires clean environment to test architecture logic."; - } + ncclResult_t result = rcclOverrideProtocol(ncclProtoStr, table, &info); - INFO(NCCL_LOG_INFO, - "Testing rcclSetP2pNetChunkSize for GFX950 with large ranks"); - - ncclComm_t mockComm = nullptr; - struct ncclTopoSystem mockTopo; - struct ncclTopoNode mockGpuNode; - CreateMockComm(mockComm, mockTopo, mockGpuNode, "gfx950", 64); - - int chunkSize = RCCL_VALUE_UNSET; - rcclSetP2pNetChunkSize(mockComm, chunkSize); - - // Expected: 1 << 19 = 524288 for ranks >= 32 - EXPECT_EQ(chunkSize, 1 << 19) - << "GFX950 with ranks >= 32 should set chunk size to 524288"; - - INFO(NCCL_LOG_INFO, "GFX950 large ranks test completed - chunk size: %d", - chunkSize); - - CleanupMockComm(mockComm); + EXPECT_EQ(result, ncclInvalidUsage) << "Expected ncclInvalidUsage when the " + "override protocol is invalid."; + }, + {{"RCCL_OVERRIDE_PROTO", "InvalidProtocol"}} + ); } -TEST(Rcclwrap, GFX950_BoundaryRank16) { - // Check execution order first - if (ShouldSkipP2pTestDueToExecutionOrder("GFX950_BoundaryRank16")) { - GTEST_SKIP() << "Skipping due to execution order - another " - "rcclSetP2pNetChunkSize test already ran"; - } +TEST(Rcclwrap, RcclOverrideProtocol_InvalidOverridePersists) +{ + RUN_ISOLATED_TEST_WITH_ENV("RcclOverrideProtocol_InvalidOverridePersists", + []() { + float table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + ncclTaskColl info; - // Check if we should skip this test due to environment variable being set - if (ShouldSkipP2pTest()) { - GTEST_SKIP() - << "Skipping test: NCCL_P2P_NET_CHUNKSIZE environment variable is set, " - << "which would override the static variable behavior. 
" - << "This test requires clean environment to test architecture logic."; - } + // First call should fail due to invalid proto string + ncclResult_t result1 = rcclOverrideProtocol(ncclProtoStr, table, &info); + EXPECT_EQ(result1, ncclInvalidUsage) << "Expected rcclOverrideProtocol to fail with invalid " + "RCCL_OVERRIDE_PROTO."; - INFO(NCCL_LOG_INFO, - "Testing rcclSetP2pNetChunkSize for GFX950 with boundary rank 16"); - - ncclComm_t mockComm = nullptr; - struct ncclTopoSystem mockTopo; - struct ncclTopoNode mockGpuNode; - CreateMockComm(mockComm, mockTopo, mockGpuNode, "gfx950", 16); - - int chunkSize = RCCL_VALUE_UNSET; - rcclSetP2pNetChunkSize(mockComm, chunkSize); - - // Expected: 1 << 18 = 262144 for ranks >= 16 - EXPECT_EQ(chunkSize, 1 << 18) - << "GFX950 with ranks = 16 should set chunk size to 262144"; - - INFO(NCCL_LOG_INFO, "GFX950 boundary rank 16 test completed - chunk size: %d", - chunkSize); - - CleanupMockComm(mockComm); + // Second call should still fail because the static variable disables further + // overrides + ncclResult_t result2 = rcclOverrideProtocol(ncclProtoStr, table, &info); + EXPECT_EQ(result2, ncclInvalidUsage) + << "Expected rcclOverrideProtocol to continue returning failure after " + "invalid proto was set."; + }, + {{"RCCL_OVERRIDE_PROTO", "InvalidProtocol"}} + ); } -TEST(Rcclwrap, GFX950_BoundaryRank15) { - // Check execution order first - if (ShouldSkipP2pTestDueToExecutionOrder("GFX950_BoundaryRank15")) { - GTEST_SKIP() << "Skipping due to execution order - another " - "rcclSetP2pNetChunkSize test already ran"; - } +TEST(Rcclwrap, RcclOverrideAlgorithm_NoOverride) +{ + RUN_ISOLATED_TEST_WITH_ENV("RcclOverrideAlgorithm_NoOverride", + []() { + float table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + ncclTaskColl info; - // Check if we should skip this test due to environment variable being set - if (ShouldSkipP2pTest()) { - GTEST_SKIP() - << "Skipping test: NCCL_P2P_NET_CHUNKSIZE environment variable is set, " - << "which would 
override the static variable behavior. " - << "This test requires clean environment to test architecture logic."; - } + ncclResult_t result = rcclOverrideAlgorithm(ncclAlgoStr, table, &info); - INFO(NCCL_LOG_INFO, - "Testing rcclSetP2pNetChunkSize for GFX950 with boundary rank 15"); - - ncclComm_t mockComm = nullptr; - struct ncclTopoSystem mockTopo; - struct ncclTopoNode mockGpuNode; - CreateMockComm(mockComm, mockTopo, mockGpuNode, "gfx950", 15); - - int chunkSize = RCCL_VALUE_UNSET; - rcclSetP2pNetChunkSize(mockComm, chunkSize); - - // Expected: 1 << 17 = 131072 for ranks < 16 - EXPECT_EQ(chunkSize, 1 << 17) - << "GFX950 with ranks = 15 should set chunk size to 131072"; - - INFO(NCCL_LOG_INFO, "GFX950 boundary rank 15 test completed - chunk size: %d", - chunkSize); - - CleanupMockComm(mockComm); + // Since no override is set, it should return success and do nothing + EXPECT_EQ(result, ncclSuccess) + << "Expected ncclSuccess when RCCL_OVERRIDE_ALGO is unset, indicating no " + "override should be applied."; + }, + {} + ); } -TEST(Rcclwrap, GFX950_BoundaryRank32) { - // Check execution order first - if (ShouldSkipP2pTestDueToExecutionOrder("GFX950_BoundaryRank32")) { - GTEST_SKIP() << "Skipping due to execution order - another " - "rcclSetP2pNetChunkSize test already ran"; - } +TEST(Rcclwrap, RcclOverrideAlgorithm_UnsupportedOverride) +{ + RUN_ISOLATED_TEST_WITH_ENV("RcclOverrideAlgorithm_UnsupportedOverride", + []() { + float table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + for(int a = 0; a < NCCL_NUM_ALGORITHMS; ++a) + for(int p = 0; p < NCCL_NUM_PROTOCOLS; ++p) + table[a][p] = NCCL_ALGO_PROTO_IGNORE; - // Check if we should skip this test due to environment variable being set - if (ShouldSkipP2pTest()) { - GTEST_SKIP() - << "Skipping test: NCCL_P2P_NET_CHUNKSIZE environment variable is set, " - << "which would override the static variable behavior. 
" - << "This test requires clean environment to test architecture logic."; - } + ncclTaskColl info; + info.func = ncclFuncReduceScatter; + info.datatype = ncclBfloat16; + info.protocol = NCCL_PROTO_SIMPLE; - INFO(NCCL_LOG_INFO, - "Testing rcclSetP2pNetChunkSize for GFX950 with boundary rank 32"); + ncclResult_t result = rcclOverrideAlgorithm(ncclAlgoStr, table, &info); - ncclComm_t mockComm = nullptr; - struct ncclTopoSystem mockTopo; - struct ncclTopoNode mockGpuNode; - CreateMockComm(mockComm, mockTopo, mockGpuNode, "gfx950", 32); - - int chunkSize = RCCL_VALUE_UNSET; - rcclSetP2pNetChunkSize(mockComm, chunkSize); - - // Expected: 1 << 19 = 524288 for ranks >= 32 - EXPECT_EQ(chunkSize, 1 << 19) - << "GFX950 with ranks = 32 should set chunk size to 524288"; - - INFO(NCCL_LOG_INFO, "GFX950 boundary rank 32 test completed - chunk size: %d", - chunkSize); - - CleanupMockComm(mockComm); + EXPECT_EQ(result, ncclInternalError) + << "Expected ncclInternalError when the override algorithm is valid, but " + "not enabled for the selected protocol."; + }, + {{"RCCL_OVERRIDE_ALGO", "Ring"}} + ); } -TEST(Rcclwrap, GFX950_BoundaryRank31) { - // Check execution order first - if (ShouldSkipP2pTestDueToExecutionOrder("GFX950_BoundaryRank31")) { - GTEST_SKIP() << "Skipping due to execution order - another " - "rcclSetP2pNetChunkSize test already ran"; - } +TEST(Rcclwrap, RcclOverrideAlgorithm_ValidOverride) +{ + RUN_ISOLATED_TEST_WITH_ENV("RcclOverrideAlgorithm_ValidOverride", + []() { + const char* algoOverrideEnv = getenv("RCCL_OVERRIDE_ALGO"); + ASSERT_NE(algoOverrideEnv, nullptr) << "RCCL_OVERRIDE_ALGO should be set"; - // Check if we should skip this test due to environment variable being set - if (ShouldSkipP2pTest()) { - GTEST_SKIP() - << "Skipping test: NCCL_P2P_NET_CHUNKSIZE environment variable is set, " - << "which would override the static variable behavior. 
" - << "This test requires clean environment to test architecture logic."; - } + // Get the index of the algorithm from the string for later comparison + int algoIndex = NCCL_ALGO_UNDEF; + ncclResult_t idxResult + = rcclGetAlgoProtoIndex(algoOverrideEnv, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoIndex); + ASSERT_EQ(idxResult, ncclSuccess) << "Failed to get algorithm index from string"; - INFO(NCCL_LOG_INFO, - "Testing rcclSetP2pNetChunkSize for GFX950 with boundary rank 31"); + float table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + // Mark all combinations as valid for the purpose of this test. + for(int a = 0; a < NCCL_NUM_ALGORITHMS; ++a) + for(int p = 0; p < NCCL_NUM_PROTOCOLS; ++p) + table[a][p] = 0.0; - ncclComm_t mockComm = nullptr; - struct ncclTopoSystem mockTopo; - struct ncclTopoNode mockGpuNode; - CreateMockComm(mockComm, mockTopo, mockGpuNode, "gfx950", 31); + ncclTaskColl info; + info.func = ncclFuncAllReduce; + info.datatype = ncclBfloat16; + info.protocol = NCCL_PROTO_SIMPLE; + info.algorithm = NCCL_ALGO_UNDEF; - int chunkSize = RCCL_VALUE_UNSET; - rcclSetP2pNetChunkSize(mockComm, chunkSize); + ncclResult_t result = rcclOverrideAlgorithm(ncclAlgoStr, table, &info); - // Expected: 1 << 18 = 262144 for 16 <= ranks < 32 - EXPECT_EQ(chunkSize, 1 << 18) - << "GFX950 with ranks = 31 should set chunk size to 262144"; - - INFO(NCCL_LOG_INFO, "GFX950 boundary rank 31 test completed - chunk size: %d", - chunkSize); - - CleanupMockComm(mockComm); + EXPECT_EQ(result, ncclSuccess) << "Expected ncclSuccess when override is applied successfully."; + EXPECT_EQ(info.algorithm, algoIndex) + << "Algorithm index should match the override value from environment."; + }, + {{"RCCL_OVERRIDE_ALGO", "Ring"}} + ); } -TEST(Rcclwrap, UnsupportedArch_GFX908) { - // Check execution order first - if (ShouldSkipP2pTestDueToExecutionOrder("UnsupportedArch_GFX908")) { - GTEST_SKIP() << "Skipping due to execution order - another " - "rcclSetP2pNetChunkSize test already ran"; - } 
+TEST(Rcclwrap, RcclOverrideAlgorithm_ValidOverridePersists) +{ + RUN_ISOLATED_TEST_WITH_ENV("RcclOverrideAlgorithm_ValidOverridePersists", + []() { + const char* algoOverrideEnv = getenv("RCCL_OVERRIDE_ALGO"); + ASSERT_NE(algoOverrideEnv, nullptr) << "RCCL_OVERRIDE_ALGO should be set"; - // Check if we should skip this test due to environment variable being set - if (ShouldSkipP2pTest()) { - GTEST_SKIP() - << "Skipping test: NCCL_P2P_NET_CHUNKSIZE environment variable is set, " - << "which would override the static variable behavior. " - << "This test requires clean environment to test architecture logic."; - } + // Get the index of the algorithm from the string for later comparison + int algoIndex = NCCL_ALGO_UNDEF; + ncclResult_t idxResult + = rcclGetAlgoProtoIndex(algoOverrideEnv, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoIndex); + ASSERT_EQ(idxResult, ncclSuccess) << "Failed to get algorithm index from string"; - INFO(NCCL_LOG_INFO, - "Testing rcclSetP2pNetChunkSize for unsupported architecture GFX908"); + // Mark all combinations as valid for the purpose of this test. 
+ float table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + for(int a = 0; a < NCCL_NUM_ALGORITHMS; ++a) + for(int p = 0; p < NCCL_NUM_PROTOCOLS; ++p) + table[a][p] = 0.0; - ncclComm_t mockComm = nullptr; - struct ncclTopoSystem mockTopo; - struct ncclTopoNode mockGpuNode; - CreateMockComm(mockComm, mockTopo, mockGpuNode, "gfx908", 32); + ncclTaskColl info; + info.func = ncclFuncAllReduce; + info.datatype = ncclFloat16; + info.protocol = NCCL_PROTO_SIMPLE; + info.algorithm = NCCL_ALGO_UNDEF; - int chunkSize = RCCL_VALUE_UNSET; - rcclSetP2pNetChunkSize(mockComm, chunkSize); + // First call + ncclResult_t result1 = rcclOverrideAlgorithm(ncclAlgoStr, table, &info); + EXPECT_EQ(result1, ncclSuccess) + << "Expected rcclOverrideAlgorithm to succeed with valid override."; + EXPECT_EQ(info.algorithm, algoIndex) + << "Expected algorithm to match override after first call."; - // Expected: RCCL_VALUE_INVALID for unsupported architectures - EXPECT_EQ(chunkSize, RCCL_VALUE_INVALID) - << "Unsupported architecture GFX908 should set chunk size to " - "RCCL_VALUE_INVALID"; - - INFO(NCCL_LOG_INFO, - "Unsupported architecture GFX908 test completed - chunk size: %d", - chunkSize); - - CleanupMockComm(mockComm); + // Second call + ncclResult_t result2 = rcclOverrideAlgorithm(ncclAlgoStr, table, &info); + EXPECT_EQ(result2, ncclSuccess) + << "Expected rcclOverrideAlgorithm to succeed again on second call."; + EXPECT_EQ(info.algorithm, algoIndex) + << "Expected algorithm to match override after second call."; + }, + {{"RCCL_OVERRIDE_ALGO", "Ring"}} + ); } -TEST(Rcclwrap, UnsupportedArch_GFX90A) { - // Check execution order first - if (ShouldSkipP2pTestDueToExecutionOrder("UnsupportedArch_GFX90A")) { - GTEST_SKIP() << "Skipping due to execution order - another " - "rcclSetP2pNetChunkSize test already ran"; - } +TEST(Rcclwrap, RcclOverrideAlgorithm_InvalidAlgorithm) +{ + RUN_ISOLATED_TEST_WITH_ENV("RcclOverrideAlgorithm_InvalidAlgorithm", + []() { + float 
table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + ncclTaskColl info; - // Check if we should skip this test due to environment variable being set - if (ShouldSkipP2pTest()) { - GTEST_SKIP() - << "Skipping test: NCCL_P2P_NET_CHUNKSIZE environment variable is set, " - << "which would override the static variable behavior. " - << "This test requires clean environment to test architecture logic."; - } + ncclResult_t result = rcclOverrideAlgorithm(ncclAlgoStr, table, &info); - INFO(NCCL_LOG_INFO, - "Testing rcclSetP2pNetChunkSize for unsupported architecture GFX90A"); - - ncclComm_t mockComm = nullptr; - struct ncclTopoSystem mockTopo; - struct ncclTopoNode mockGpuNode; - CreateMockComm(mockComm, mockTopo, mockGpuNode, "gfx90a", 32); - - int chunkSize = RCCL_VALUE_UNSET; - rcclSetP2pNetChunkSize(mockComm, chunkSize); - - // Expected: RCCL_VALUE_INVALID for unsupported architectures - EXPECT_EQ(chunkSize, RCCL_VALUE_INVALID) - << "Unsupported architecture GFX90A should set chunk size to " - "RCCL_VALUE_INVALID"; - - INFO(NCCL_LOG_INFO, - "Unsupported architecture GFX90A test completed - chunk size: %d", - chunkSize); - - CleanupMockComm(mockComm); + EXPECT_EQ(result, ncclInvalidUsage) + << "Expected ncclInvalidUsage when the override algorithm is invalid."; + }, + {{"RCCL_OVERRIDE_ALGO", "InvalidAlgorithm"}} + ); } -// This test specifically tests the environment variable behavior -TEST(Rcclwrap, WithEnvironmentVariable) { - // Check execution order first - if (ShouldSkipP2pTestDueToExecutionOrder("WithEnvironmentVariable")) { - GTEST_SKIP() << "Skipping due to execution order - another " - "rcclSetP2pNetChunkSize test already ran"; - } +TEST(Rcclwrap, RcclOverrideAlgorithm_InvalidOverridePersists) +{ + RUN_ISOLATED_TEST_WITH_ENV("RcclOverrideAlgorithm_InvalidOverridePersists", + []() { + float table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + ncclTaskColl info; - // This test requires environment variable to be set to a specific value - if (ShouldSkipP2pTest("123456")) 
{ - GTEST_SKIP() - << "Skipping test: NCCL_P2P_NET_CHUNKSIZE environment variable is not " - "set to '123456'. " - << "Please set: export NCCL_P2P_NET_CHUNKSIZE=123456 to run this test. " - << "This test verifies that user override via environment variable " - "works correctly."; - } + // First call should fail due to invalid algo string (and set the static flag) + ncclResult_t result1 = rcclOverrideAlgorithm(ncclAlgoStr, table, &info); + EXPECT_EQ(result1, ncclInvalidUsage) << "Expected rcclOverrideAlgorithm to fail with invalid " + "RCCL_OVERRIDE_ALGO."; - INFO(NCCL_LOG_INFO, - "Testing rcclSetP2pNetChunkSize with environment variable set"); - - // Environment variable is confirmed to be set to "123456" - const char *envVar = getenv("NCCL_P2P_NET_CHUNKSIZE"); - INFO(NCCL_LOG_INFO, "Environment variable found: NCCL_P2P_NET_CHUNKSIZE=%s", - envVar); - - ncclComm_t mockComm = nullptr; - struct ncclTopoSystem mockTopo; - struct ncclTopoNode mockGpuNode; - CreateMockComm(mockComm, mockTopo, mockGpuNode, "gfx942", 32); - - int chunkSize = RCCL_VALUE_UNSET; - rcclSetP2pNetChunkSize(mockComm, chunkSize); - - // Expected: RCCL_VALUE_INVALID when environment variable is set (user - // override) - EXPECT_EQ(chunkSize, RCCL_VALUE_INVALID) - << "When env var is set, should return RCCL_VALUE_INVALID"; - - INFO(NCCL_LOG_INFO, "Environment variable test completed - chunk size: %d", - chunkSize); - INFO(NCCL_LOG_INFO, - "User override via NCCL_P2P_NET_CHUNKSIZE=%s was respected", envVar); - - CleanupMockComm(mockComm); + // Second call should also fail due to static validInput=false + ncclResult_t result2 = rcclOverrideAlgorithm(ncclAlgoStr, table, &info); + EXPECT_EQ(result2, ncclInvalidUsage) + << "Expected rcclOverrideAlgorithm to continue returning failure after " + "invalid algo was set."; + }, + {{"RCCL_OVERRIDE_ALGO", "InvalidAlgorithm"}} + ); } -TEST(Rcclwrap, EmptyArchString) { - // Check execution order first - if 
(ShouldSkipP2pTestDueToExecutionOrder("EmptyArchString")) { - GTEST_SKIP() << "Skipping due to execution order - another " - "rcclSetP2pNetChunkSize test already ran"; - } +TEST(Rcclwrap, AllrcclSetP2pNetChunkSizeTests) +{ + INFO( + NCCL_LOG_INFO, + "=== Starting Process-Isolated rcclSetP2pNetChunkSize " + "Tests Execution ===" + ); - // Check if we should skip this test due to environment variable being set - if (ShouldSkipP2pTest()) { - GTEST_SKIP() - << "Skipping test: NCCL_P2P_NET_CHUNKSIZE environment variable is set, " - << "which would override the static variable behavior. " - << "This test requires clean environment to test architecture logic."; - } + // Define test case structure + struct P2PChunkSizeTestCase + { + std::string name; + std::string arch; + int ranks; + int expectedChunkSize; + std::unordered_map extraEnv; + }; - INFO(NCCL_LOG_INFO, - "Testing rcclSetP2pNetChunkSize with empty architecture string"); + // Define all test cases + std::vector testCases = { + // GFX942 tests + { "GFX942_LargeRanks_Isolated","gfx942", 128,1 << 19, {} }, + { "GFX942_BoundaryRank64_Isolated", "gfx942", 64, 1 << 19, {}}, + { "GFX942_BoundaryRank63_Isolated", "gfx942", 63, 1 << 17, {}}, - ncclComm_t mockComm = nullptr; - struct ncclTopoSystem mockTopo; - struct ncclTopoNode mockGpuNode; - CreateMockComm(mockComm, mockTopo, mockGpuNode, "", 32); + // GFX950 tests + { "GFX950_SmallRanks_Isolated", "gfx950", 8, 1 << 17, {}}, + { "GFX950_MediumRanks_Isolated", "gfx950", 24, 1 << 18, {}}, + { "GFX950_LargeRanks_Isolated", "gfx950", 64, 1 << 19, {}}, + { "GFX950_BoundaryRank16_Isolated", "gfx950", 16, 1 << 18, {}}, + { "GFX950_BoundaryRank15_Isolated", "gfx950", 15, 1 << 17, {}}, + { "GFX950_BoundaryRank32_Isolated", "gfx950", 32, 1 << 19, {}}, + { "GFX950_BoundaryRank31_Isolated", "gfx950", 31, 1 << 18, {}}, - int chunkSize = RCCL_VALUE_UNSET; - rcclSetP2pNetChunkSize(mockComm, chunkSize); + // Unsupported architectures + { "UnsupportedArch_GFX908_Isolated", "gfx908", 32, 
RCCL_VALUE_INVALID, {}}, + { "UnsupportedArch_GFX90A_Isolated", "gfx90a", 32, RCCL_VALUE_INVALID, {}}, - // Expected: RCCL_VALUE_INVALID for empty/invalid architecture - EXPECT_EQ(chunkSize, RCCL_VALUE_INVALID) - << "Empty architecture should set chunk size to RCCL_VALUE_INVALID"; + // Edge cases + { "EmptyArchString_Isolated", "", 32, RCCL_VALUE_INVALID, {}}, + { "PartialArchMatch_Isolated", "gfx94", 32, RCCL_VALUE_INVALID, {}}, + { "ZeroRanks_GFX942_Isolated", "gfx942", 0, 1 << 17, {}}, + { "ZeroRanks_GFX950_Isolated", "gfx950", 0, 1 << 17, {}}, + { "LargeRankValues_GFX950_Isolated", "gfx950", 1000, 1 << 19, {}}, + { "CaseInsensitiveArch_Isolated", "GFX942", 32, RCCL_VALUE_INVALID, {}}, - INFO(NCCL_LOG_INFO, "Empty architecture test completed - chunk size: %d", - chunkSize); + // Environment variable test + {"WithEnvironmentVariable_Isolated", + "gfx942", 32, + RCCL_VALUE_UNSET, {{"NCCL_P2P_NET_CHUNKSIZE", "123456"}, {"NCCL_MAX_NCHANNELS", "1"}} } + }; - CleanupMockComm(mockComm); + // Base environment for all tests + std::unordered_map baseEnv = { + { "NCCL_DEBUG", "TRACE"}, + {"NCCL_DEBUG_SUBSYS", "ALL"} + }; + + // Register all tests using a loop + for(const auto& tc : testCases) + { + ProcessIsolatedTestRunner::registerTest( + ProcessIsolatedTestRunner::TestConfig( + tc.name, + [tc]() + { + ncclComm_t mockComm = nullptr; + struct ncclTopoSystem mockTopo; + struct ncclTopoNode mockGpuNode; + CreateMockComm(mockComm, mockTopo, mockGpuNode, tc.arch.c_str(), tc.ranks); + + int chunkSize = RCCL_VALUE_UNSET; + rcclSetP2pNetChunkSize(mockComm, chunkSize); + + // Special handling for environment variable test + if(tc.name == "WithEnvironmentVariable_Isolated") + { + const char* envValue = getenv("NCCL_P2P_NET_CHUNKSIZE"); + EXPECT_STREQ(envValue, "123456") + << "Environment variable should be set to 123456"; + EXPECT_NE(chunkSize, RCCL_VALUE_UNSET) + << "Environment variable should override default logic"; + } + else + { + EXPECT_EQ(chunkSize, tc.expectedChunkSize) + 
<< "Failed for " << tc.arch << " with " << tc.ranks << " ranks"; + } + + CleanupMockComm(mockComm); + } + ) + .withEnvironment( + [&tc, &baseEnv]() + { + auto env = baseEnv; + env.insert(tc.extraEnv.begin(), tc.extraEnv.end()); + return env; + }() + ) + .withTimeout(std::chrono::seconds(60)) + ); + } + + // Configure execution options + ProcessIsolatedTestRunner::ExecutionOptions options; + options.stopOnFirstFailure = false; // Continue running all tests + options.verboseLogging = true; + + // Execute all tests + bool allTestsPassed = ProcessIsolatedTestRunner::executeAllTests(options); + + // Verify that all tests passed + EXPECT_TRUE(allTestsPassed) << "One or more process-isolated GFX tests failed"; + + INFO( + NCCL_LOG_INFO, + "=== Process-Isolated rcclSetP2pNetChunkSize Tests " + "Execution Completed ===" + ); } -TEST(Rcclwrap, PartialArchMatch) { - // Check execution order first - if (ShouldSkipP2pTestDueToExecutionOrder("PartialArchMatch")) { - GTEST_SKIP() << "Skipping due to execution order - another " - "rcclSetP2pNetChunkSize test already ran"; - } - - // Check if we should skip this test due to environment variable being set - if (ShouldSkipP2pTest()) { - GTEST_SKIP() - << "Skipping test: NCCL_P2P_NET_CHUNKSIZE environment variable is set, " - << "which would override the static variable behavior. 
" - << "This test requires clean environment to test architecture logic."; - } - - INFO(NCCL_LOG_INFO, - "Testing rcclSetP2pNetChunkSize with partial architecture match"); - - ncclComm_t mockComm = nullptr; - struct ncclTopoSystem mockTopo; - struct ncclTopoNode mockGpuNode; - CreateMockComm(mockComm, mockTopo, mockGpuNode, "gfx94", 32); - - int chunkSize = RCCL_VALUE_UNSET; - rcclSetP2pNetChunkSize(mockComm, chunkSize); - - // Expected: RCCL_VALUE_INVALID for partial match - EXPECT_EQ(chunkSize, RCCL_VALUE_INVALID) - << "Partial architecture match should set chunk size to " - "RCCL_VALUE_INVALID"; - - INFO(NCCL_LOG_INFO, - "Partial architecture match test completed - chunk size: %d", chunkSize); - - CleanupMockComm(mockComm); -} - -TEST(Rcclwrap, ZeroRanks_GFX942) { - // Check execution order first - if (ShouldSkipP2pTestDueToExecutionOrder("ZeroRanks_GFX942")) { - GTEST_SKIP() << "Skipping due to execution order - another " - "rcclSetP2pNetChunkSize test already ran"; - } - - // Check if we should skip this test due to environment variable being set - if (ShouldSkipP2pTest()) { - GTEST_SKIP() - << "Skipping test: NCCL_P2P_NET_CHUNKSIZE environment variable is set, " - << "which would override the static variable behavior. 
" - << "This test requires clean environment to test architecture logic."; - } - - INFO(NCCL_LOG_INFO, - "Testing rcclSetP2pNetChunkSize with zero ranks for GFX942"); - - ncclComm_t mockComm = nullptr; - struct ncclTopoSystem mockTopo; - struct ncclTopoNode mockGpuNode; - CreateMockComm(mockComm, mockTopo, mockGpuNode, "gfx942", 0); - - int chunkSize = RCCL_VALUE_UNSET; - rcclSetP2pNetChunkSize(mockComm, chunkSize); - - // Expected: 1 << 17 = 131072 (since 0 < 64) - EXPECT_EQ(chunkSize, 1 << 17) - << "Zero ranks should be treated as < 64, setting chunk size to 131072"; - - INFO(NCCL_LOG_INFO, "Zero ranks test completed - chunk size: %d", chunkSize); - - CleanupMockComm(mockComm); -} - -TEST(Rcclwrap, ZeroRanks_GFX950) { - // Check execution order first - if (ShouldSkipP2pTestDueToExecutionOrder("ZeroRanks_GFX950")) { - GTEST_SKIP() << "Skipping due to execution order - another " - "rcclSetP2pNetChunkSize test already ran"; - } - - // Check if we should skip this test due to environment variable being set - if (ShouldSkipP2pTest()) { - GTEST_SKIP() - << "Skipping test: NCCL_P2P_NET_CHUNKSIZE environment variable is set, " - << "which would override the static variable behavior. 
" - << "This test requires clean environment to test architecture logic."; - } - - INFO(NCCL_LOG_INFO, - "Testing rcclSetP2pNetChunkSize with zero ranks for GFX950"); - - ncclComm_t mockComm = nullptr; - struct ncclTopoSystem mockTopo; - struct ncclTopoNode mockGpuNode; - CreateMockComm(mockComm, mockTopo, mockGpuNode, "gfx950", 0); - - int chunkSize = RCCL_VALUE_UNSET; - rcclSetP2pNetChunkSize(mockComm, chunkSize); - - // Expected: 1 << 17 = 131072 (since 0 < 16) - EXPECT_EQ(chunkSize, 1 << 17) - << "Zero ranks should be treated as < 16, setting chunk size to 131072"; - - INFO(NCCL_LOG_INFO, "Zero ranks GFX950 test completed - chunk size: %d", - chunkSize); - - CleanupMockComm(mockComm); -} - -TEST(Rcclwrap, LargeRankValues_GFX950) { - // Check execution order first - if (ShouldSkipP2pTestDueToExecutionOrder("LargeRankValues_GFX950")) { - GTEST_SKIP() << "Skipping due to execution order - another " - "rcclSetP2pNetChunkSize test already ran"; - } - - // Check if we should skip this test due to environment variable being set - if (ShouldSkipP2pTest()) { - GTEST_SKIP() - << "Skipping test: NCCL_P2P_NET_CHUNKSIZE environment variable is set, " - << "which would override the static variable behavior. 
" - << "This test requires clean environment to test architecture logic."; - } - - INFO(NCCL_LOG_INFO, - "Testing rcclSetP2pNetChunkSize with very large rank values for GFX950"); - - ncclComm_t mockComm = nullptr; - struct ncclTopoSystem mockTopo; - struct ncclTopoNode mockGpuNode; - CreateMockComm(mockComm, mockTopo, mockGpuNode, "gfx950", 1000000); - - int chunkSize = RCCL_VALUE_UNSET; - rcclSetP2pNetChunkSize(mockComm, chunkSize); - - // Expected: 1 << 19 = 524288 (since 1000000 >= 32) - EXPECT_EQ(chunkSize, 1 << 19) << "Very large ranks should be treated as >= " - "32, setting chunk size to 524288"; - - INFO(NCCL_LOG_INFO, "Large rank values test completed - chunk size: %d", - chunkSize); - - CleanupMockComm(mockComm); -} - -TEST(Rcclwrap, CaseInsensitiveArch) { - // Check execution order first - if (ShouldSkipP2pTestDueToExecutionOrder("CaseInsensitiveArch")) { - GTEST_SKIP() << "Skipping due to execution order - another " - "rcclSetP2pNetChunkSize test already ran"; - } - - // Check if we should skip this test due to environment variable being set - if (ShouldSkipP2pTest()) { - GTEST_SKIP() - << "Skipping test: NCCL_P2P_NET_CHUNKSIZE environment variable is set, " - << "which would override the static variable behavior. 
" - << "This test requires clean environment to test architecture logic."; - } - - INFO(NCCL_LOG_INFO, - "Testing rcclSetP2pNetChunkSize with case variations in architecture"); - - ncclComm_t mockComm = nullptr; - struct ncclTopoSystem mockTopo; - struct ncclTopoNode mockGpuNode; - CreateMockComm(mockComm, mockTopo, mockGpuNode, "GFX942", 32); // Uppercase - - int chunkSize = RCCL_VALUE_UNSET; - rcclSetP2pNetChunkSize(mockComm, chunkSize); - - // Expected: RCCL_VALUE_INVALID (case sensitive matching expected) - EXPECT_EQ(chunkSize, RCCL_VALUE_INVALID) - << "Uppercase architecture should not match (case sensitive)"; - - INFO(NCCL_LOG_INFO, - "Case insensitive architecture test completed - chunk size: %d", - chunkSize); - - CleanupMockComm(mockComm); -} - -// Add these test cases after the existing rcclSetP2pNetChunkSize tests - -TEST(Rcclwrap, PXN_GFX942_SmallRanks) { - // Check execution order first - if (ShouldSkipPxnTestDueToExecutionOrder("PXN_GFX942_SmallRanks")) { - GTEST_SKIP() << "Skipping due to execution order - another rcclSetPxn test " - "already ran"; - } - - // Check if we should skip this test due to environment variable being set - if (ShouldSkipPxnTest()) { - GTEST_SKIP() - << "Skipping test: NCCL_PXN_DISABLE environment variable is set, " - << "which would override the static variable behavior. 
" - << "This test requires clean environment to test architecture logic."; - } - - INFO(NCCL_LOG_INFO, "Testing rcclSetPxn for GFX942 with small ranks"); - - ncclComm_t mockComm = nullptr; - struct ncclTopoSystem mockTopo; - struct ncclTopoNode mockGpuNode; - CreateMockComm(mockComm, mockTopo, mockGpuNode, "gfx942", 32); - - int pxnDisable = RCCL_VALUE_UNSET; - rcclSetPxn(mockComm, pxnDisable); - - // Expected: 1 (disabled) for ranks < 64 on GFX942 - EXPECT_EQ(pxnDisable, 1) - << "GFX942 with ranks < 64 should disable PXN (pxnDisable = 1)"; - - INFO(NCCL_LOG_INFO, "GFX942 small ranks PXN test completed - pxnDisable: %d", - pxnDisable); - - CleanupMockComm(mockComm); -} - -TEST(Rcclwrap, PXN_GFX942_LargeRanks) { - // Check execution order first - if (ShouldSkipPxnTestDueToExecutionOrder("PXN_GFX942_LargeRanks")) { - GTEST_SKIP() << "Skipping due to execution order - another rcclSetPxn test " - "already ran"; - } - - // Check if we should skip this test due to environment variable being set - if (ShouldSkipPxnTest()) { - GTEST_SKIP() - << "Skipping test: NCCL_PXN_DISABLE environment variable is set, " - << "which would override the static variable behavior. 
" - << "This test requires clean environment to test architecture logic."; - } - - INFO(NCCL_LOG_INFO, "Testing rcclSetPxn for GFX942 with large ranks"); - - ncclComm_t mockComm = nullptr; - struct ncclTopoSystem mockTopo; - struct ncclTopoNode mockGpuNode; - CreateMockComm(mockComm, mockTopo, mockGpuNode, "gfx942", 128); - - int pxnDisable = RCCL_VALUE_UNSET; - rcclSetPxn(mockComm, pxnDisable); - - // Expected: 0 (enabled) for ranks >= 64 on GFX942 - EXPECT_EQ(pxnDisable, 0) - << "GFX942 with ranks >= 64 should enable PXN (pxnDisable = 0)"; - - INFO(NCCL_LOG_INFO, "GFX942 large ranks PXN test completed - pxnDisable: %d", - pxnDisable); - - CleanupMockComm(mockComm); -} - -TEST(Rcclwrap, PXN_GFX942_BoundaryRank64) { - // Check execution order first - if (ShouldSkipPxnTestDueToExecutionOrder("PXN_GFX942_BoundaryRank64")) { - GTEST_SKIP() << "Skipping due to execution order - another rcclSetPxn test " - "already ran"; - } - - // Check if we should skip this test due to environment variable being set - if (ShouldSkipPxnTest()) { - GTEST_SKIP() - << "Skipping test: NCCL_PXN_DISABLE environment variable is set, " - << "which would override the static variable behavior. 
" - << "This test requires clean environment to test architecture logic."; - } - - INFO(NCCL_LOG_INFO, "Testing rcclSetPxn for GFX942 with boundary rank 64"); - - ncclComm_t mockComm = nullptr; - struct ncclTopoSystem mockTopo; - struct ncclTopoNode mockGpuNode; - CreateMockComm(mockComm, mockTopo, mockGpuNode, "gfx942", 64); - - int pxnDisable = RCCL_VALUE_UNSET; - rcclSetPxn(mockComm, pxnDisable); - - // Expected: 0 (enabled) for ranks >= 64 on GFX942 - EXPECT_EQ(pxnDisable, 0) - << "GFX942 with ranks = 64 should enable PXN (pxnDisable = 0)"; - - INFO(NCCL_LOG_INFO, - "GFX942 boundary rank 64 PXN test completed - pxnDisable: %d", - pxnDisable); - - CleanupMockComm(mockComm); -} - -TEST(Rcclwrap, PXN_GFX942_BoundaryRank63) { - // Check execution order first - if (ShouldSkipPxnTestDueToExecutionOrder("PXN_GFX942_BoundaryRank63")) { - GTEST_SKIP() << "Skipping due to execution order - another rcclSetPxn test " - "already ran"; - } - - // Check if we should skip this test due to environment variable being set - if (ShouldSkipPxnTest()) { - GTEST_SKIP() - << "Skipping test: NCCL_PXN_DISABLE environment variable is set, " - << "which would override the static variable behavior. 
" - << "This test requires clean environment to test architecture logic."; - } - - INFO(NCCL_LOG_INFO, "Testing rcclSetPxn for GFX942 with boundary rank 63"); - - ncclComm_t mockComm = nullptr; - struct ncclTopoSystem mockTopo; - struct ncclTopoNode mockGpuNode; - CreateMockComm(mockComm, mockTopo, mockGpuNode, "gfx942", 63); - - int pxnDisable = RCCL_VALUE_UNSET; - rcclSetPxn(mockComm, pxnDisable); - - // Expected: 1 (disabled) for ranks < 64 on GFX942 - EXPECT_EQ(pxnDisable, 1) - << "GFX942 with ranks = 63 should disable PXN (pxnDisable = 1)"; - - INFO(NCCL_LOG_INFO, - "GFX942 boundary rank 63 PXN test completed - pxnDisable: %d", - pxnDisable); - - CleanupMockComm(mockComm); -} - -TEST(Rcclwrap, PXN_GFX950_SmallRanks) { - // Check execution order first - if (ShouldSkipPxnTestDueToExecutionOrder("PXN_GFX950_SmallRanks")) { - GTEST_SKIP() << "Skipping due to execution order - another rcclSetPxn test " - "already ran"; - } - - // Check if we should skip this test due to environment variable being set - if (ShouldSkipPxnTest()) { - GTEST_SKIP() - << "Skipping test: NCCL_PXN_DISABLE environment variable is set, " - << "which would override the static variable behavior. 
" - << "This test requires clean environment to test architecture logic."; - } - - INFO(NCCL_LOG_INFO, "Testing rcclSetPxn for GFX950 with small ranks"); - - ncclComm_t mockComm = nullptr; - struct ncclTopoSystem mockTopo; - struct ncclTopoNode mockGpuNode; - CreateMockComm(mockComm, mockTopo, mockGpuNode, "gfx950", 16); - - int pxnDisable = RCCL_VALUE_UNSET; - rcclSetPxn(mockComm, pxnDisable); - - // Expected: 1 (disabled) for ranks < 32 on GFX950 - EXPECT_EQ(pxnDisable, 1) - << "GFX950 with ranks < 32 should disable PXN (pxnDisable = 1)"; - - INFO(NCCL_LOG_INFO, "GFX950 small ranks PXN test completed - pxnDisable: %d", - pxnDisable); - - CleanupMockComm(mockComm); -} - -TEST(Rcclwrap, PXN_GFX950_LargeRanks) { - // Check execution order first - if (ShouldSkipPxnTestDueToExecutionOrder("PXN_GFX950_LargeRanks")) { - GTEST_SKIP() << "Skipping due to execution order - another rcclSetPxn test " - "already ran"; - } - - // Check if we should skip this test due to environment variable being set - if (ShouldSkipPxnTest()) { - GTEST_SKIP() - << "Skipping test: NCCL_PXN_DISABLE environment variable is set, " - << "which would override the static variable behavior. 
" - << "This test requires clean environment to test architecture logic."; - } - - INFO(NCCL_LOG_INFO, "Testing rcclSetPxn for GFX950 with large ranks"); - - ncclComm_t mockComm = nullptr; - struct ncclTopoSystem mockTopo; - struct ncclTopoNode mockGpuNode; - CreateMockComm(mockComm, mockTopo, mockGpuNode, "gfx950", 64); - - int pxnDisable = RCCL_VALUE_UNSET; - rcclSetPxn(mockComm, pxnDisable); - - // Expected: 0 (enabled) for ranks >= 32 on GFX950 - EXPECT_EQ(pxnDisable, 0) - << "GFX950 with ranks >= 32 should enable PXN (pxnDisable = 0)"; - - INFO(NCCL_LOG_INFO, "GFX950 large ranks PXN test completed - pxnDisable: %d", - pxnDisable); - - CleanupMockComm(mockComm); -} - -TEST(Rcclwrap, PXN_GFX950_BoundaryRank32) { - // Check execution order first - if (ShouldSkipPxnTestDueToExecutionOrder("PXN_GFX950_BoundaryRank32")) { - GTEST_SKIP() << "Skipping due to execution order - another rcclSetPxn test " - "already ran"; - } - - // Check if we should skip this test due to environment variable being set - if (ShouldSkipPxnTest()) { - GTEST_SKIP() - << "Skipping test: NCCL_PXN_DISABLE environment variable is set, " - << "which would override the static variable behavior. 
" - << "This test requires clean environment to test architecture logic."; - } - - INFO(NCCL_LOG_INFO, "Testing rcclSetPxn for GFX950 with boundary rank 32"); - - ncclComm_t mockComm = nullptr; - struct ncclTopoSystem mockTopo; - struct ncclTopoNode mockGpuNode; - CreateMockComm(mockComm, mockTopo, mockGpuNode, "gfx950", 32); - - int pxnDisable = RCCL_VALUE_UNSET; - rcclSetPxn(mockComm, pxnDisable); - - // Expected: 0 (enabled) for ranks >= 32 on GFX950 - EXPECT_EQ(pxnDisable, 0) - << "GFX950 with ranks = 32 should enable PXN (pxnDisable = 0)"; - - INFO(NCCL_LOG_INFO, - "GFX950 boundary rank 32 PXN test completed - pxnDisable: %d", - pxnDisable); - - CleanupMockComm(mockComm); -} - -TEST(Rcclwrap, PXN_GFX950_BoundaryRank31) { - // Check execution order first - if (ShouldSkipPxnTestDueToExecutionOrder("PXN_GFX950_BoundaryRank31")) { - GTEST_SKIP() << "Skipping due to execution order - another rcclSetPxn test " - "already ran"; - } - - // Check if we should skip this test due to environment variable being set - if (ShouldSkipPxnTest()) { - GTEST_SKIP() - << "Skipping test: NCCL_PXN_DISABLE environment variable is set, " - << "which would override the static variable behavior. 
" - << "This test requires clean environment to test architecture logic."; - } - - INFO(NCCL_LOG_INFO, "Testing rcclSetPxn for GFX950 with boundary rank 31"); - - ncclComm_t mockComm = nullptr; - struct ncclTopoSystem mockTopo; - struct ncclTopoNode mockGpuNode; - CreateMockComm(mockComm, mockTopo, mockGpuNode, "gfx950", 31); - - int pxnDisable = RCCL_VALUE_UNSET; - rcclSetPxn(mockComm, pxnDisable); - - // Expected: 1 (disabled) for ranks < 32 on GFX950 - EXPECT_EQ(pxnDisable, 1) - << "GFX950 with ranks = 31 should disable PXN (pxnDisable = 1)"; - - INFO(NCCL_LOG_INFO, - "GFX950 boundary rank 31 PXN test completed - pxnDisable: %d", - pxnDisable); - - CleanupMockComm(mockComm); -} - -TEST(Rcclwrap, PXN_UnsupportedArch_GFX908) { - // Check execution order first - if (ShouldSkipPxnTestDueToExecutionOrder("PXN_UnsupportedArch_GFX908")) { - GTEST_SKIP() << "Skipping due to execution order - another rcclSetPxn test " - "already ran"; - } - - // Check if we should skip this test due to environment variable being set - if (ShouldSkipPxnTest()) { - GTEST_SKIP() - << "Skipping test: NCCL_PXN_DISABLE environment variable is set, " - << "which would override the static variable behavior. 
" - << "This test requires clean environment to test architecture logic."; - } - - INFO(NCCL_LOG_INFO, "Testing rcclSetPxn for unsupported architecture GFX908"); - - ncclComm_t mockComm = nullptr; - struct ncclTopoSystem mockTopo; - struct ncclTopoNode mockGpuNode; - CreateMockComm(mockComm, mockTopo, mockGpuNode, "gfx908", 32); - - int pxnDisable = RCCL_VALUE_UNSET; - rcclSetPxn(mockComm, pxnDisable); - - // Expected: RCCL_VALUE_INVALID for unsupported architectures - EXPECT_EQ(pxnDisable, RCCL_VALUE_INVALID) - << "Unsupported architecture GFX908 should set pxnDisable to " - "RCCL_VALUE_INVALID"; - - INFO(NCCL_LOG_INFO, - "Unsupported architecture GFX908 PXN test completed - pxnDisable: %d", - pxnDisable); - - CleanupMockComm(mockComm); -} - -TEST(Rcclwrap, PXN_UnsupportedArch_GFX90A) { - // Check execution order first - if (ShouldSkipPxnTestDueToExecutionOrder("PXN_UnsupportedArch_GFX90A")) { - GTEST_SKIP() << "Skipping due to execution order - another rcclSetPxn test " - "already ran"; - } - - // Check if we should skip this test due to environment variable being set - if (ShouldSkipPxnTest()) { - GTEST_SKIP() - << "Skipping test: NCCL_PXN_DISABLE environment variable is set, " - << "which would override the static variable behavior. 
" - << "This test requires clean environment to test architecture logic."; - } - - INFO(NCCL_LOG_INFO, "Testing rcclSetPxn for unsupported architecture GFX90A"); - - ncclComm_t mockComm = nullptr; - struct ncclTopoSystem mockTopo; - struct ncclTopoNode mockGpuNode; - CreateMockComm(mockComm, mockTopo, mockGpuNode, "gfx90a", 32); - - int pxnDisable = RCCL_VALUE_UNSET; - rcclSetPxn(mockComm, pxnDisable); - - // Expected: RCCL_VALUE_INVALID for unsupported architectures - EXPECT_EQ(pxnDisable, RCCL_VALUE_INVALID) - << "Unsupported architecture GFX90A should set pxnDisable to " - "RCCL_VALUE_INVALID"; - - INFO(NCCL_LOG_INFO, - "Unsupported architecture GFX90A PXN test completed - pxnDisable: %d", - pxnDisable); - - CleanupMockComm(mockComm); -} - -// This test specifically tests the environment variable behavior -TEST(Rcclwrap, PXN_WithEnvironmentVariable) { - // Check execution order first - if (ShouldSkipPxnTestDueToExecutionOrder("PXN_WithEnvironmentVariable")) { - GTEST_SKIP() << "Skipping due to execution order - another rcclSetPxn test " - "already ran"; - } - - // This test requires environment variable to be set to a specific value - if (ShouldSkipPxnTest("1")) { - GTEST_SKIP() << "Skipping test: NCCL_PXN_DISABLE environment variable is " - "not set to '1'. " - << "Please set: export NCCL_PXN_DISABLE=1 to run this test. 
" - << "This test verifies that user override via environment " - "variable works correctly."; - } - - INFO(NCCL_LOG_INFO, "Testing rcclSetPxn with environment variable set"); - - // Environment variable is confirmed to be set to "1" - const char *envVar = getenv("NCCL_PXN_DISABLE"); - INFO(NCCL_LOG_INFO, "Environment variable found: NCCL_PXN_DISABLE=%s", - envVar); - - ncclComm_t mockComm = nullptr; - struct ncclTopoSystem mockTopo; - struct ncclTopoNode mockGpuNode; - CreateMockComm(mockComm, mockTopo, mockGpuNode, "gfx942", 128); - - int pxnDisable = RCCL_VALUE_UNSET; - rcclSetPxn(mockComm, pxnDisable); - - // Expected: RCCL_VALUE_INVALID when environment variable is set (user - // override) - EXPECT_EQ(pxnDisable, RCCL_VALUE_INVALID) - << "When env var is set, should return RCCL_VALUE_INVALID"; - - INFO(NCCL_LOG_INFO, - "Environment variable PXN test completed - pxnDisable: %d", pxnDisable); - INFO(NCCL_LOG_INFO, "User override via NCCL_PXN_DISABLE=%s was respected", - envVar); - - CleanupMockComm(mockComm); -} - -TEST(Rcclwrap, PXN_ZeroRanks_GFX942) { - // Check execution order first - if (ShouldSkipPxnTestDueToExecutionOrder("PXN_ZeroRanks_GFX942")) { - GTEST_SKIP() << "Skipping due to execution order - another rcclSetPxn test " - "already ran"; - } - - // Check if we should skip this test due to environment variable being set - if (ShouldSkipPxnTest()) { - GTEST_SKIP() - << "Skipping test: NCCL_PXN_DISABLE environment variable is set, " - << "which would override the static variable behavior. 
" - << "This test requires clean environment to test architecture logic."; - } - - INFO(NCCL_LOG_INFO, "Testing rcclSetPxn with zero ranks for GFX942"); - - ncclComm_t mockComm = nullptr; - struct ncclTopoSystem mockTopo; - struct ncclTopoNode mockGpuNode; - CreateMockComm(mockComm, mockTopo, mockGpuNode, "gfx942", 0); - - int pxnDisable = RCCL_VALUE_UNSET; - rcclSetPxn(mockComm, pxnDisable); - - // Expected: 1 (disabled) since 0 < 64 - EXPECT_EQ(pxnDisable, 1) - << "Zero ranks should be treated as < 64, disabling PXN (pxnDisable = 1)"; - - INFO(NCCL_LOG_INFO, "Zero ranks GFX942 PXN test completed - pxnDisable: %d", - pxnDisable); - - CleanupMockComm(mockComm); -} - -TEST(Rcclwrap, PXN_ZeroRanks_GFX950) { - // Check execution order first - if (ShouldSkipPxnTestDueToExecutionOrder("PXN_ZeroRanks_GFX950")) { - GTEST_SKIP() << "Skipping due to execution order - another rcclSetPxn test " - "already ran"; - } - - // Check if we should skip this test due to environment variable being set - if (ShouldSkipPxnTest()) { - GTEST_SKIP() - << "Skipping test: NCCL_PXN_DISABLE environment variable is set, " - << "which would override the static variable behavior. 
" - << "This test requires clean environment to test architecture logic."; - } - - INFO(NCCL_LOG_INFO, "Testing rcclSetPxn with zero ranks for GFX950"); - - ncclComm_t mockComm = nullptr; - struct ncclTopoSystem mockTopo; - struct ncclTopoNode mockGpuNode; - CreateMockComm(mockComm, mockTopo, mockGpuNode, "gfx950", 0); - - int pxnDisable = RCCL_VALUE_UNSET; - rcclSetPxn(mockComm, pxnDisable); - - // Expected: 1 (disabled) since 0 < 32 - EXPECT_EQ(pxnDisable, 1) - << "Zero ranks should be treated as < 32, disabling PXN (pxnDisable = 1)"; - - INFO(NCCL_LOG_INFO, "Zero ranks GFX950 PXN test completed - pxnDisable: %d", - pxnDisable); - - CleanupMockComm(mockComm); -} - -TEST(Rcclwrap, RcclSetPipelining_Invalid_DType) { - // Skip the test if pipelining has been disabled - // (RCCL_DISABLE_REDUCE_COPY_PIPELINING=1) - if (ShouldSkipRcclSetPipeliningTests()) { - GTEST_SKIP() - << "Skipping test: RCCL_DISABLE_REDUCE_COPY_PIPELINING environment " - "variable is set. Unset this variable to enable pipelining."; - } - - // Skip the test if pipelining has been enabled for all data types - // (RCCL_PIPELINE_ALL_DATA_TYPES=1) - const char *allowAllDTypes = getenv("RCCL_PIPELINE_ALL_DATA_TYPES"); - if (allowAllDTypes && strcmp(allowAllDTypes, "0") != 0) { - GTEST_SKIP() << "Skipping test: RCCL_PIPELINE_ALL_DATA_TYPES environment " - "variable is set. 
Unset this variable to enable pipelining " - "only for bf16 data type."; - } - - // Pipeline should not be set for non-bf16 datatypes, unless - // rcclParamPipelineAllDTypes() returns true - ncclComm_t comm = nullptr; - struct ncclTopoSystem topo; - struct ncclTopoNode gpu; - CreateMockComm(comm, topo, gpu, "gfx950", 8); - comm->nNodes = 2; // Multi node - - ncclTaskColl info = {}; - info.func = ncclFuncAllReduce; - info.datatype = ncclFloat32; - - size_t nBytes = 16 * 1024 * 1024; // 16MB - rcclSetPipelining(comm, nBytes, &info); - - EXPECT_EQ(info.pipeline, 0) << "Non-bf16 should not set pipeline by default"; - - CleanupMockComm(comm); -} - -TEST(Rcclwrap, RcclSetPipelining_GFX950_MultiNode_Enable) { - // Skip the test if pipelining has been disabled - // (RCCL_DISABLE_REDUCE_COPY_PIPELINING=1) - if (ShouldSkipRcclSetPipeliningTests()) { - GTEST_SKIP() - << "Skipping test: RCCL_DISABLE_REDUCE_COPY_PIPELINING environment " - "variable is set. Unset this variable to enable pipelining."; - } - - // For multi-node, pipeline is set to 1 for AllReduce with bf16 - ncclComm_t comm = nullptr; - struct ncclTopoSystem topo; - struct ncclTopoNode gpu; - CreateMockComm(comm, topo, gpu, "gfx950", 8); - comm->nNodes = 2; // Multi node - - ncclTaskColl info = {}; - // In rcclSetPipelining(), ncclFuncAllReduce, ncclFuncReduceScatter, and - // ncclFuncReduce share the same case body. Testing any one of them is - // sufficient to validate that code path. 
- info.func = ncclFuncAllReduce; - info.datatype = ncclBfloat16; - - size_t nBytes = 16 * 1024 * 1024; // 16MB - rcclSetPipelining(comm, nBytes, &info); - - EXPECT_EQ(info.pipeline, 1) - << "gfx950 multi-node AllReduce bf16 should enable pipelining"; - - CleanupMockComm(comm); -} - -TEST(Rcclwrap, RcclSetPipelining_GFX950_SingleNode_Disable) { - // Skip the test if pipelining has been disabled - // (RCCL_DISABLE_REDUCE_COPY_PIPELINING=1) - if (ShouldSkipRcclSetPipeliningTests()) { - GTEST_SKIP() - << "Skipping test: RCCL_DISABLE_REDUCE_COPY_PIPELINING environment " - "variable is set. Unset this variable to enable pipelining."; - } - - // For single-node, pipeline remains 0 - ncclComm_t comm = nullptr; - struct ncclTopoSystem topo; - struct ncclTopoNode gpu; - CreateMockComm(comm, topo, gpu, "gfx950", 8); - comm->nNodes = 1; // Single node - - ncclTaskColl info = {}; - // In rcclSetPipelining(), ncclFuncAllReduce, ncclFuncReduceScatter, and - // ncclFuncReduce share the same case body. Testing any one of them is - // sufficient to validate that code path. - info.func = ncclFuncAllReduce; - info.datatype = ncclBfloat16; - - size_t nBytes = 16 * 1024 * 1024; // 16MB - rcclSetPipelining(comm, nBytes, &info); - - EXPECT_EQ(info.pipeline, 0) - << "gfx950 single-node should not enable pipelining"; - - CleanupMockComm(comm); -} - -TEST(Rcclwrap, RcclSetPipelining_GFX942_SingleNode_AllReduce_Enable) { - // Skip the test if pipelining has been disabled - // (RCCL_DISABLE_REDUCE_COPY_PIPELINING=1) - if (ShouldSkipRcclSetPipeliningTests()) { - GTEST_SKIP() - << "Skipping test: RCCL_DISABLE_REDUCE_COPY_PIPELINING environment " - "variable is set. 
Unset this variable to enable pipelining."; - } - - // For single-node, pipeline is set to 1 for AllReduce with bf16 - ncclComm_t comm = nullptr; - struct ncclTopoSystem topo; - struct ncclTopoNode gpu; - CreateMockComm(comm, topo, gpu, "gfx942", 8); - comm->nNodes = 1; // Single node - - ncclTaskColl info = {}; - info.func = ncclFuncAllReduce; - info.datatype = ncclBfloat16; - - size_t nBytes = 16 * 1024 * 1024; // 16MB - rcclSetPipelining(comm, nBytes, &info); - - EXPECT_EQ(info.pipeline, 1) - << "gfx942 single-node AllReduce bf16 should enable pipelining"; - - CleanupMockComm(comm); -} - -TEST(Rcclwrap, RcclSetPipelining_GFX942_MultiNode_AllReduce_Enable) { - // Skip the test if pipelining has been disabled - // (RCCL_DISABLE_REDUCE_COPY_PIPELINING=1) - if (ShouldSkipRcclSetPipeliningTests()) { - GTEST_SKIP() - << "Skipping test: RCCL_DISABLE_REDUCE_COPY_PIPELINING environment " - "variable is set. Unset this variable to enable pipelining."; - } - - // For multi-node AllReduce with bf16, pipelining is enabled if - // nBytes <= 512MB * 2^(log2(nNodes)-1) - // Testing with nNodes = 4 => threshold = 512MB * 2^(2-1) = 1GB - ncclComm_t comm = nullptr; - struct ncclTopoSystem topo; - struct ncclTopoNode gpu; - CreateMockComm(comm, topo, gpu, "gfx942", 8); - comm->nNodes = 4; - - ncclTaskColl info = {}; - info.func = ncclFuncAllReduce; - info.datatype = ncclBfloat16; - - size_t nBytes = (1ULL << 30); // 1GB, exactly at threshold - rcclSetPipelining(comm, nBytes, &info); - - EXPECT_EQ(info.pipeline, 1) - << "gfx942 4-node AllReduce at threshold should enable pipelining"; - - CleanupMockComm(comm); -} - -TEST(Rcclwrap, RcclSetPipelining_GFX942_MultiNode_AllReduce_Disable) { - // Skip the test if pipelining has been disabled - // (RCCL_DISABLE_REDUCE_COPY_PIPELINING=1) - if (ShouldSkipRcclSetPipeliningTests()) { - GTEST_SKIP() - << "Skipping test: RCCL_DISABLE_REDUCE_COPY_PIPELINING environment " - "variable is set. 
Unset this variable to enable pipelining."; - } - - // When nBytes is just above the threshold, pipelining should be disabled - ncclComm_t comm = nullptr; - struct ncclTopoSystem topo; - struct ncclTopoNode gpu; - CreateMockComm(comm, topo, gpu, "gfx942", 8); - comm->nNodes = 4; - - ncclTaskColl info = {}; - info.func = ncclFuncAllReduce; - info.datatype = ncclBfloat16; - - size_t nBytes = (1ULL << 30) + 1024; // 1GB + 1KB, just above threshold - rcclSetPipelining(comm, nBytes, &info); - - EXPECT_EQ(info.pipeline, 0) - << "gfx942 4-node AllReduce above threshold should disable pipelining"; - - CleanupMockComm(comm); -} - -TEST(Rcclwrap, RcclSetPipelining_GFX942_Enable) { - // Skip the test if pipelining has been disabled - // (RCCL_DISABLE_REDUCE_COPY_PIPELINING=1) - if (ShouldSkipRcclSetPipeliningTests()) { - GTEST_SKIP() - << "Skipping test: RCCL_DISABLE_REDUCE_COPY_PIPELINING environment " - "variable is set. Unset this variable to enable pipelining."; - } - - // ReduceScatter & Reduce should enable pipelining regardless of no. of nodes - ncclComm_t comm = nullptr; - struct ncclTopoSystem topo; - struct ncclTopoNode gpu; - CreateMockComm(comm, topo, gpu, "gfx942", 8); - comm->nNodes = 8; - - ncclTaskColl info = {}; - // In rcclSetPipelining(), ncclFuncReduceScatter, and - // ncclFuncReduce share the same case body. Testing any one of them is - // sufficient to validate that code path. - info.func = ncclFuncReduceScatter; - info.datatype = ncclBfloat16; - - size_t nBytes = 16 * 1024 * 1024; // 16MB - rcclSetPipelining(comm, nBytes, &info); - - EXPECT_EQ(info.pipeline, 1) - << "gfx942 ReduceScatter and Reduce should enable " - "pipelining with single or multi-node"; - - CleanupMockComm(comm); -} - -TEST(Rcclwrap, RcclOverrideProtocol_NoOverride) { - const char *protoOverrideEnv = getenv("RCCL_OVERRIDE_PROTO"); - // Skip the test if RCCL_OVERRIDE_PROTO is set - if (protoOverrideEnv) { - GTEST_SKIP() << "Skipping test: Variable RCCL_OVERRIDE_PROTO is set. 
Unset " - "it to run this test."; - } - - float table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; - ncclTaskColl info = {}; - - ncclResult_t result = rcclOverrideProtocol(ncclProtoStr, table, &info); - - EXPECT_EQ(result, ncclSuccess) - << "Expected ncclSuccess when RCCL_OVERRIDE_PROTO is unset, indicating " - "no override should be applied."; -} - -TEST(Rcclwrap, RcclOverrideProtocol_UnsupportedOverride) { - const char *protoOverrideEnv = getenv("RCCL_OVERRIDE_PROTO"); - // Skip the test if RCCL_OVERRIDE_PROTO is not set or if its set to an invalid - // value - if (!isProtoStrValid(protoOverrideEnv)) { - GTEST_SKIP() - << "Skipping test: Variable RCCL_OVERRIDE_PROTO is not set or " - "set to an invalid value. Set it to a valid protocol value to " - "run this test."; - } - - // Mark all combinations as unsupported for the purpose of this test. - float table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; - for (int a = 0; a < NCCL_NUM_ALGORITHMS; ++a) - for (int p = 0; p < NCCL_NUM_PROTOCOLS; ++p) - table[a][p] = NCCL_ALGO_PROTO_IGNORE; - - ncclTaskColl info = {}; - info.func = ncclFuncReduceScatter; - info.datatype = ncclBfloat16; - info.algorithm = NCCL_ALGO_RING; // Set any algorithm - - ncclResult_t result = rcclOverrideProtocol(ncclProtoStr, table, &info); - - EXPECT_EQ(result, ncclInternalError) - << "Expected ncclInternalError when the override protocol is valid, but " - "not enabled for the selected algorithm."; -} - -TEST(Rcclwrap, RcclOverrideProtocol_ValidOverride) { - const char *protoOverrideEnv = getenv("RCCL_OVERRIDE_PROTO"); - // Skip the test if RCCL_OVERRIDE_PROTO is not set or if its set to an invalid - // value - if (!isProtoStrValid(protoOverrideEnv)) { - GTEST_SKIP() << "Skipping test: RCCL_OVERRIDE_PROTO is not set or set to " - "an invalid value. 
Set it to a valid protocol name (e.g., " - "'Simple') to run this test."; - } - - // Get the index of the protocol from the string for later comparison - int protoIndex = NCCL_PROTO_UNDEF; - ncclResult_t idxResult = rcclGetAlgoProtoIndex( - protoOverrideEnv, ncclProtoStr, NCCL_NUM_PROTOCOLS, protoIndex); - ASSERT_EQ(idxResult, ncclSuccess) - << "Failed to get protocol index from string"; - - // Mark all combinations as valid for the purpose of this test. - float table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; - for (int a = 0; a < NCCL_NUM_ALGORITHMS; ++a) - for (int p = 0; p < NCCL_NUM_PROTOCOLS; ++p) - table[a][p] = 0.0; - - ncclTaskColl info = {}; - info.func = ncclFuncAllReduce; - info.datatype = ncclBfloat16; - info.algorithm = NCCL_ALGO_RING; // Set any algorithm - info.protocol = NCCL_PROTO_UNDEF; - - ncclResult_t result = rcclOverrideProtocol(ncclProtoStr, table, &info); - - EXPECT_EQ(result, ncclSuccess) - << "Expected ncclSuccess when override is applied successfully."; - EXPECT_EQ(info.protocol, protoIndex) << "Protocol index should match the " - "override value from environment."; -} - -TEST(Rcclwrap, RcclOverrideProtocol_ValidOverridePersists) { - const char *protoOverrideEnv = getenv("RCCL_OVERRIDE_PROTO"); - // Skip the test if RCCL_OVERRIDE_PROTO is not set or if its set to an invalid - // value - if (!isProtoStrValid(protoOverrideEnv)) { - GTEST_SKIP() - << "Skipping test: RCCL_OVERRIDE_PROTO is not set or set to an invalid " - "value. Set it to a valid protocol name (e.g., 'Simple') to run " - "this test."; - } - - // Get the index of the protocol from the string for later comparison - int protoIndex = NCCL_PROTO_UNDEF; - ncclResult_t idxResult = rcclGetAlgoProtoIndex( - protoOverrideEnv, ncclProtoStr, NCCL_NUM_PROTOCOLS, protoIndex); - ASSERT_EQ(idxResult, ncclSuccess) - << "Failed to get protocol index from string"; - - // Mark all combinations as valid for the purpose of this test. 
- float table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; - for (int a = 0; a < NCCL_NUM_ALGORITHMS; ++a) - for (int p = 0; p < NCCL_NUM_PROTOCOLS; ++p) - table[a][p] = 0.0; - - ncclTaskColl info = {}; - info.func = ncclFuncAllReduce; - info.datatype = ncclFloat16; - info.algorithm = NCCL_ALGO_RING; // Set any algorithm - info.protocol = NCCL_PROTO_UNDEF; - - // First call - ncclResult_t result1 = rcclOverrideProtocol(ncclProtoStr, table, &info); - EXPECT_EQ(result1, ncclSuccess) - << "Expected rcclOverrideProtocol to succeed with valid override"; - EXPECT_EQ(info.protocol, protoIndex) - << "Expected protocol to match override after first call"; - - // Second call - ncclResult_t result2 = rcclOverrideProtocol(ncclProtoStr, table, &info); - EXPECT_EQ(result2, ncclSuccess) - << "Expected rcclOverrideProtocol to succeed again on second call"; - EXPECT_EQ(info.protocol, protoIndex) - << "Expected protocol to match override after second call"; -} - -TEST(Rcclwrap, RcclOverrideProtocol_InvalidProtocol) { - const char *protoOverrideEnv = getenv("RCCL_OVERRIDE_PROTO"); - // Skip the test if RCCL_OVERRIDE_PROTO is not set or if its set to a valid - // value - if (!protoOverrideEnv || isProtoStrValid(protoOverrideEnv)) { - GTEST_SKIP() - << "Skipping test: Variable RCCL_OVERRIDE_PROTO is not set or set to a " - "valid value. Set it to an invalid protocol value to run this test."; - } - - float table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; - ncclTaskColl info = {}; - - ncclResult_t result = rcclOverrideProtocol(ncclProtoStr, table, &info); - - EXPECT_EQ(result, ncclInvalidUsage) << "Expected ncclInvalidUsage when the " - "override protocol is invalid."; -} - -TEST(Rcclwrap, RcclOverrideProtocol_InvalidOverridePersists) { - const char *protoOverrideEnv = getenv("RCCL_OVERRIDE_PROTO"); - if (!protoOverrideEnv || isProtoStrValid(protoOverrideEnv)) { - GTEST_SKIP() - << "Skipping test: Variable RCCL_OVERRIDE_PROTO is not set or set to a " - "valid value. 
Set it to an invalid protocol value to run this test."; - } - - float table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; - ncclTaskColl info = {}; - - // First call should fail due to invalid proto string - ncclResult_t result1 = rcclOverrideProtocol(ncclProtoStr, table, &info); - EXPECT_EQ(result1, ncclInvalidUsage) - << "Expected rcclOverrideProtocol to fail with invalid " - "RCCL_OVERRIDE_PROTO."; - - // Second call should still fail because the static variable disables further - // overrides - ncclResult_t result2 = rcclOverrideProtocol(ncclProtoStr, table, &info); - EXPECT_EQ(result2, ncclInvalidUsage) - << "Expected rcclOverrideProtocol to continue returning failure after " - "invalid proto was set."; -} - -TEST(Rcclwrap, RcclOverrideAlgorithm_NoOverride) { - const char *algoOverrideEnv = getenv("RCCL_OVERRIDE_ALGO"); - // Skip the test if RCCL_OVERRIDE_ALGO is set - if (algoOverrideEnv) { - GTEST_SKIP() << "Skipping test: Variable RCCL_OVERRIDE_ALGO is set. Unset " - "it to run this test."; - } - - float table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; - ncclTaskColl info = {}; - - ncclResult_t result = rcclOverrideAlgorithm(ncclAlgoStr, table, &info); - - // Since no override is set, it should return success and do nothing - EXPECT_EQ(result, ncclSuccess) - << "Expected ncclSuccess when RCCL_OVERRIDE_ALGO is unset, indicating no " - "override should be applied."; -} - -TEST(Rcclwrap, RcclOverrideAlgorithm_UnsupportedOverride) { - const char *algoOverrideEnv = getenv("RCCL_OVERRIDE_ALGO"); - // Skip the test if RCCL_OVERRIDE_ALGO is not set or if its set to an invalid - // value - if (!isAlgoStrValid(algoOverrideEnv)) { - GTEST_SKIP() << "Skipping test: RCCL_OVERRIDE_ALGO is not set or " - "set to an invalid value. 
Set it to a valid algorithm to " - "run this test."; - } - - float table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; - for (int a = 0; a < NCCL_NUM_ALGORITHMS; ++a) - for (int p = 0; p < NCCL_NUM_PROTOCOLS; ++p) - table[a][p] = NCCL_ALGO_PROTO_IGNORE; - - ncclTaskColl info = {}; - info.func = ncclFuncReduceScatter; - info.datatype = ncclBfloat16; - info.protocol = NCCL_PROTO_SIMPLE; // Set any protocol - - ncclResult_t result = rcclOverrideAlgorithm(ncclAlgoStr, table, &info); - - EXPECT_EQ(result, ncclInternalError) - << "Expected ncclInternalError when the override algorithm is valid, but " - "not enabled for the selected protocol."; -} - -TEST(Rcclwrap, RcclOverrideAlgorithm_ValidOverride) { - const char *algoOverrideEnv = getenv("RCCL_OVERRIDE_ALGO"); - // Skip the test if RCCL_OVERRIDE_ALGO is not set or if its set to an invalid - // value - if (!isAlgoStrValid(algoOverrideEnv)) { - GTEST_SKIP() << "Skipping test: RCCL_OVERRIDE_ALGO is not set or set to " - "an invalid value. Set it to a valid algorithm name (e.g., " - "'Ring') to run this test."; - } - - // Get the index of the algorithm from the string for later comparison - int algoIndex = NCCL_ALGO_UNDEF; - ncclResult_t idxResult = rcclGetAlgoProtoIndex( - algoOverrideEnv, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoIndex); - ASSERT_EQ(idxResult, ncclSuccess) - << "Failed to get algorithm index from string"; - - float table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; - // Mark all combinations as valid for the purpose of this test. 
- for (int a = 0; a < NCCL_NUM_ALGORITHMS; ++a) - for (int p = 0; p < NCCL_NUM_PROTOCOLS; ++p) - table[a][p] = 0.0; - - ncclTaskColl info = {}; - info.func = ncclFuncAllReduce; - info.datatype = ncclBfloat16; - info.protocol = NCCL_PROTO_SIMPLE; // Set any protocol - info.algorithm = NCCL_ALGO_UNDEF; - - ncclResult_t result = rcclOverrideAlgorithm(ncclAlgoStr, table, &info); - - EXPECT_EQ(result, ncclSuccess) - << "Expected ncclSuccess when override is applied successfully."; - EXPECT_EQ(info.algorithm, algoIndex) - << "Algorithm index should match the override value from environment."; -} - -TEST(Rcclwrap, RcclOverrideAlgorithm_ValidOverridePersists) { - const char *algoOverrideEnv = getenv("RCCL_OVERRIDE_ALGO"); - // Skip the test if RCCL_OVERRIDE_ALGO is not set or if its set to an invalid - // value - if (!isAlgoStrValid(algoOverrideEnv)) { - GTEST_SKIP() - << "Skipping test: RCCL_OVERRIDE_ALGO is not set or set to an invalid " - "value. Set it to a valid algorithm name (e.g., 'Ring') to run this " - "test."; - } - - // Get the index of the algorithm from the string for later comparison - int algoIndex = NCCL_ALGO_UNDEF; - ncclResult_t idxResult = rcclGetAlgoProtoIndex( - algoOverrideEnv, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoIndex); - ASSERT_EQ(idxResult, ncclSuccess) - << "Failed to get algorithm index from string"; - - // Mark all combinations as valid for the purpose of this test. 
- float table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; - for (int a = 0; a < NCCL_NUM_ALGORITHMS; ++a) - for (int p = 0; p < NCCL_NUM_PROTOCOLS; ++p) - table[a][p] = 0.0; - - ncclTaskColl info = {}; - info.func = ncclFuncAllReduce; - info.datatype = ncclFloat16; - info.protocol = NCCL_PROTO_SIMPLE; // Set any protocol - info.algorithm = NCCL_ALGO_UNDEF; - - // First call - ncclResult_t result1 = rcclOverrideAlgorithm(ncclAlgoStr, table, &info); - EXPECT_EQ(result1, ncclSuccess) - << "Expected rcclOverrideAlgorithm to succeed with valid override."; - EXPECT_EQ(info.algorithm, algoIndex) - << "Expected algorithm to match override after first call."; - - // Second call - ncclResult_t result2 = rcclOverrideAlgorithm(ncclAlgoStr, table, &info); - EXPECT_EQ(result2, ncclSuccess) - << "Expected rcclOverrideAlgorithm to succeed again on second call."; - EXPECT_EQ(info.algorithm, algoIndex) - << "Expected algorithm to match override after second call."; -} - -TEST(Rcclwrap, RcclOverrideAlgorithm_InvalidAlgorithm) { - const char *algoOverrideEnv = getenv("RCCL_OVERRIDE_ALGO"); - // Skip the test if RCCL_OVERRIDE_ALGO is not set or if its set to a valid - // value - if (!algoOverrideEnv || isAlgoStrValid(algoOverrideEnv)) { - GTEST_SKIP() << "Skipping test: RCCL_OVERRIDE_ALGO is not set or set to a " - "valid value. 
Set it to an invalid algorithm value to run " - "this test."; - } - - float table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; - ncclTaskColl info = {}; - - ncclResult_t result = rcclOverrideAlgorithm(ncclAlgoStr, table, &info); - - EXPECT_EQ(result, ncclInvalidUsage) - << "Expected ncclInvalidUsage when the override algorithm is invalid."; -} - -TEST(Rcclwrap, RcclOverrideAlgorithm_InvalidOverridePersists) { - const char *algoOverrideEnv = getenv("RCCL_OVERRIDE_ALGO"); - // Skip the test if RCCL_OVERRIDE_ALGO is not set or if its set to a valid - // value - if (!algoOverrideEnv || isAlgoStrValid(algoOverrideEnv)) { - GTEST_SKIP() - << "Skipping test: RCCL_OVERRIDE_ALGO is not set or set to a valid " - "value. Set it to an invalid algorithm name to run this test."; - } - - float table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; - ncclTaskColl info = {}; - - // First call should fail due to invalid algo string (and set the static flag) - ncclResult_t result1 = rcclOverrideAlgorithm(ncclAlgoStr, table, &info); - EXPECT_EQ(result1, ncclInvalidUsage) - << "Expected rcclOverrideAlgorithm to fail with invalid " - "RCCL_OVERRIDE_ALGO."; - - // Second call should also fail due to static validInput=false - ncclResult_t result2 = rcclOverrideAlgorithm(ncclAlgoStr, table, &info); - EXPECT_EQ(result2, ncclInvalidUsage) - << "Expected rcclOverrideAlgorithm to continue returning failure after " - "invalid algo was set."; +TEST(Rcclwrap, AllPxnTests) +{ + // Define test case structure + struct PxnTestCase + { + std::string name; + std::string arch; + int ranks; + int expectedPxnDisable; + std::unordered_map extraEnv; + bool shouldSkipCheck; // For tests with environment variable set + }; + + // Define all test cases + std::vector testCases = { + // GFX942 tests + { "PXN_GFX942_SmallRanks_Isolated","gfx942", 32, 1, {},true }, + { "PXN_GFX942_LargeRanks_Isolated", "gfx942", 128, 0, {}, true}, + { "PXN_GFX942_BoundaryRank64_Isolated", "gfx942", 64, 0, {}, true}, + { 
"PXN_GFX942_BoundaryRank63_Isolated", "gfx942", 63, 1, {}, true}, + + // GFX950 tests + { "PXN_GFX950_SmallRanks_Isolated", "gfx950", 8, 1, {}, true}, + { "PXN_GFX950_LargeRanks_Isolated", "gfx950", 64, 0, {}, true}, + { "PXN_GFX950_BoundaryRank32_Isolated", "gfx950", 32, 0, {}, true}, + { "PXN_GFX950_BoundaryRank31_Isolated", "gfx950", 31, 1, {}, true}, + + // Unsupported architecture + { "PXN_UnsupportedArch_GFX908_Isolated", "gfx908", 32, RCCL_VALUE_INVALID, {}, true}, + + // Environment variable test (no skip check needed) + {"PXN_WithEnvironmentVariable_Isolated", + "gfx942", 32, + RCCL_VALUE_INVALID, {{"NCCL_PXN_DISABLE", "1"}}, + false } + }; + + // Base environment for all tests + std::unordered_map baseEnv = { + { "NCCL_DEBUG", "TRACE"}, + {"NCCL_DEBUG_SUBSYS", "ALL"} + }; + + // Register all tests using a loop + for(const auto& tc : testCases) + { + ProcessIsolatedTestRunner::registerTest( + ProcessIsolatedTestRunner::TestConfig( + tc.name, + [tc]() + { + // Check if we should skip this test due to environment variable being + // set + if(tc.shouldSkipCheck && ShouldSkipPxnTest()) + { + GTEST_SKIP() + << "Skipping " << tc.name << " due to environment variable being set"; + return; + } + + INFO( + NCCL_LOG_INFO, + "Testing rcclSetPxn for %s with %d ranks", + tc.arch.c_str(), + tc.ranks + ); + + ncclComm_t mockComm = nullptr; + struct ncclTopoSystem mockTopo; + struct ncclTopoNode mockGpuNode; + CreateMockComm(mockComm, mockTopo, mockGpuNode, tc.arch.c_str(), tc.ranks); + + int pxnDisable = RCCL_VALUE_UNSET; + rcclSetPxn(mockComm, pxnDisable); + + EXPECT_EQ(pxnDisable, tc.expectedPxnDisable) + << "Failed for " << tc.arch << " with " << tc.ranks << " ranks"; + + INFO( + NCCL_LOG_INFO, + "%s test completed - pxnDisable: %d", + tc.name.c_str(), + pxnDisable + ); + CleanupMockComm(mockComm); + } + ) + .withEnvironment( + [&tc, &baseEnv]() + { + auto env = baseEnv; + env.insert(tc.extraEnv.begin(), tc.extraEnv.end()); + return env; + }() + ) + ); + } + + // 
Configure execution options for sequential execution with stop on first + // failure + ProcessIsolatedTestRunner::ExecutionOptions options; + options.stopOnFirstFailure = true; + options.verboseLogging = true; + + // Execute all registered tests + bool allTestsPassed = ProcessIsolatedTestRunner::executeAllTests(options); + + EXPECT_TRUE(allTestsPassed) << "One or more PXN process-isolated tests failed"; } } // namespace RcclUnitTesting diff --git a/projects/rccl/test/common/ProcessIsolatedTestRunner.cpp b/projects/rccl/test/common/ProcessIsolatedTestRunner.cpp new file mode 100644 index 0000000000..beb3853c11 --- /dev/null +++ b/projects/rccl/test/common/ProcessIsolatedTestRunner.cpp @@ -0,0 +1,696 @@ +/************************************************************************* + * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ +#include "ProcessIsolatedTestRunner.hpp" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "ErrCode.hpp" + +namespace RcclUnitTesting +{ + +// Exit codes for test process results +enum RcclTestCode +{ + RCCL_TEST_INVALID = -1, + RCCL_TEST_SUCCESS = 0, + RCCL_TEST_FAILURE = 1, + RCCL_TEST_UNKNOWN_EXCEPTION = 2, + RCCL_TEST_TIMEOUT = 3, + RCCL_TEST_SKIPPED = 4 +}; + +// Define static members +std::mutex ProcessIsolatedTestRunner::testConfigsMutex_; +std::vector ProcessIsolatedTestRunner::testConfigs_; +std::mutex ProcessIsolatedTestRunner::resultsMutex_; +std::vector ProcessIsolatedTestRunner::testResults_; + +// TestResult implementation +ProcessIsolatedTestRunner::TestResult::TestResult() + : passed(false), skipped(false), exitCode(-1), processId(-1), duration(0) +{} + +// TestConfig implementation +ProcessIsolatedTestRunner::TestConfig::TestConfig( + const std::string& testName, std::function logic +) + : name(testName), 
testLogic(logic), timeout(30), inheritParentEnv(true) +{} + +ProcessIsolatedTestRunner::TestConfig& ProcessIsolatedTestRunner::TestConfig::withEnvironment( + const std::unordered_map& env +) +{ + environmentVariables = env; + return *this; +} + +ProcessIsolatedTestRunner::TestConfig& + ProcessIsolatedTestRunner::TestConfig::withTimeout(std::chrono::seconds timeoutSeconds) +{ + timeout = timeoutSeconds; + return *this; +} + +ProcessIsolatedTestRunner::TestConfig& + ProcessIsolatedTestRunner::TestConfig::withCleanEnvironment(bool inherit) +{ + inheritParentEnv = inherit; + return *this; +} + +ProcessIsolatedTestRunner::TestConfig& + ProcessIsolatedTestRunner::TestConfig::clearVariable(const std::string& varName) +{ + clearEnvVars.push_back(varName); + return *this; +} + +ProcessIsolatedTestRunner::TestConfig& ProcessIsolatedTestRunner::TestConfig::setVariable( + const std::string& name, const std::string& value +) +{ + environmentVariables[name] = value; + return *this; +} + +// ExecutionOptions implementation +ProcessIsolatedTestRunner::ExecutionOptions::ExecutionOptions() + : stopOnFirstFailure(false), verboseLogging(true) +{} + +// Apply environment variables to current process +void ProcessIsolatedTestRunner::applyEnvironmentVariables(const TestConfig& config) +{ + // Clear specified environment variables first + for(const auto& varName : config.clearEnvVars) + { + unsetenv(varName.c_str()); + } + + // If not inheriting parent environment, clear all environment variables + if(!config.inheritParentEnv) + { + // Clear all existing environment variables + if(clearenv() != 0) + { + std::cerr << "Warning: Failed to clear environment variables" << std::endl; + } + + // Set only the specified variables + for(const auto& [name, value] : config.environmentVariables) + { + setenv(name.c_str(), value.c_str(), 1); + } + } + else + { + // Just set/override the specified variables + for(const auto& [name, value] : config.environmentVariables) + { + setenv(name.c_str(), 
value.c_str(), 1); + } + } +} + +// Execute a single test in a separate process +int ProcessIsolatedTestRunner::runTestInProcess(const TestConfig& config) +{ + pid_t processId = getpid(); + + if(config.name.empty()) + { + std::cerr << "Error: Test name is empty for process " << processId << std::endl; + return RCCL_TEST_FAILURE; + } + + try + { + // Apply environment variables + applyEnvironmentVariables(config); + + // Thread-safe test execution with timeout protection + std::atomic testCompleted{false}; + std::exception_ptr testException = nullptr; + bool testPassed = true; + bool testSkipped = false; + + // Run test in a separate thread to allow timeout handling + std::thread testThread( + [&]() + { + try + { + // Get initial test state + const ::testing::UnitTest* unitTest = ::testing::UnitTest::GetInstance(); + size_t initialFailureCount = unitTest->failed_test_count(); + size_t initialSkippedCount = unitTest->skipped_test_count(); + + // Execute the test logic + config.testLogic(); + + // Check if any new test failures occurred + size_t finalFailureCount = unitTest->failed_test_count(); + size_t finalSkippedCount = unitTest->skipped_test_count(); + + testPassed = (finalFailureCount == initialFailureCount); + testSkipped = (finalSkippedCount > initialSkippedCount); + + testCompleted = true; + } + catch(...) 
+ { + testException = std::current_exception(); + testPassed = false; + testCompleted = true; + } + } + ); + + // Wait for test completion with timeout + auto start = std::chrono::steady_clock::now(); + const auto timeout = config.timeout; + + while(!testCompleted.load()) + { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + if(std::chrono::steady_clock::now() - start > timeout) + { + // Test timed out + INFO( + "Test '%s' TIMED OUT after %ld seconds\n", + config.name.c_str(), + timeout.count() + ); + fflush(NULL); + testThread.detach(); + return RCCL_TEST_TIMEOUT; + } + } + + // Wait for thread completion + if(testThread.joinable()) + { + testThread.join(); + } + + // Check if test threw an exception + if(testException) + { + std::rethrow_exception(testException); + } + + // Flush output before returning (needed before _exit()) + fflush(NULL); + + // Return appropriate exit code based on test result + if(testSkipped) + { + return RCCL_TEST_SKIPPED; + } + else if(testPassed) + { + return RCCL_TEST_SUCCESS; + } + else + { + return RCCL_TEST_FAILURE; + } + } + catch(const std::exception& e) + { + INFO("Test '%s' FAILED with exception: %s\n", config.name.c_str(), e.what()); + std::cerr << "Exception in test '" << config.name << "': " << e.what() << std::endl; + fflush(NULL); + return RCCL_TEST_FAILURE; + } + catch(...) 
+ { + INFO("Test '%s' FAILED with unknown exception\n", config.name.c_str()); + std::cerr << "Unknown exception in test '" << config.name << "'" << std::endl; + fflush(NULL); + return RCCL_TEST_UNKNOWN_EXCEPTION; + } +} + +// Register a test configuration +void ProcessIsolatedTestRunner::registerTest(const TestConfig& config) +{ + std::lock_guard lock(testConfigsMutex_); + testConfigs_.push_back(config); +} + +// Register a simple test with just name and logic +void ProcessIsolatedTestRunner::registerTest( + const std::string& name, std::function testLogic +) +{ + registerTest(TestConfig(name, testLogic)); +} + +// Register a test with environment variables +void ProcessIsolatedTestRunner::registerTest( + const std::string& name, + std::function testLogic, + const std::unordered_map& env +) +{ + registerTest(TestConfig(name, testLogic).withEnvironment(env)); +} + +// Record test result (thread-safe) +void ProcessIsolatedTestRunner::recordTestResult(const TestResult& result) +{ + std::lock_guard lock(resultsMutex_); + testResults_.push_back(result); +} + +// Helper method: Create pipes for capturing process output +bool ProcessIsolatedTestRunner::createOutputPipes(int stdoutPipe[2], int stderrPipe[2]) +{ + // Create pipes for stdout and stderr + // stdoutPipe[0] = read end, stdoutPipe[1] = write end + if(pipe(stdoutPipe) == -1) + { + std::cerr << "Failed to create stdout pipe: " << strerror(errno) << std::endl; + return false; + } + + if(pipe(stderrPipe) == -1) + { + std::cerr << "Failed to create stderr pipe: " << strerror(errno) << std::endl; + close(stdoutPipe[0]); + close(stdoutPipe[1]); + return false; + } + + return true; +} + +// Helper method: Redirect child process output to pipes +void ProcessIsolatedTestRunner::redirectOutputToPipes(int stdoutPipe[2], int stderrPipe[2]) +{ + // Close read ends of pipes in child process (not needed) + close(stdoutPipe[0]); + close(stderrPipe[0]); + + // Redirect stdout and stderr to write ends of pipes + 
dup2(stdoutPipe[1], STDOUT_FILENO); + dup2(stderrPipe[1], STDERR_FILENO); + + // Close the original write end file descriptors after duplication + // The duplicated descriptors (STDOUT_FILENO, STDERR_FILENO) will be closed by _exit() + close(stdoutPipe[1]); + close(stderrPipe[1]); +} + +// Helper method: Capture output from child process pipes +ProcessIsolatedTestRunner::CapturedOutput ProcessIsolatedTestRunner::captureProcessOutput( + int stdoutPipe[2], int stderrPipe[2], pid_t pid, int* status +) +{ + // Close write ends of pipes in parent process (not needed) + close(stdoutPipe[1]); + close(stderrPipe[1]); + + CapturedOutput output; + char buffer[4096]; + ssize_t count; + + // Read from stdout pipe + while((count = read(stdoutPipe[0], buffer, sizeof(buffer) - 1)) > 0) + { + buffer[count] = '\0'; + output.stdoutContent += buffer; + } + close(stdoutPipe[0]); + + // Read from stderr pipe + while((count = read(stderrPipe[0], buffer, sizeof(buffer) - 1)) > 0) + { + buffer[count] = '\0'; + output.stderrContent += buffer; + } + close(stderrPipe[0]); + + // Wait for child to exit (blocking) + waitpid(pid, status, 0); + + return output; +} + +// Helper method: Display captured output +void ProcessIsolatedTestRunner::displayCapturedOutput( + const CapturedOutput& output, const std::string& testName +) +{ + if(!output.stdoutContent.empty()) + { + std::cout << output.stdoutContent; + if(output.stdoutContent.back() != '\n') + std::cout << '\n'; + } + if(!output.stderrContent.empty()) + { + std::cerr << output.stderrContent; + if(output.stderrContent.back() != '\n') + std::cerr << '\n'; + } +} + +// Execute all registered tests (simplified sequential execution only) +bool ProcessIsolatedTestRunner::executeAllTests(const ExecutionOptions& options) +{ + + // Get test configurations to run + std::vector testsToRun; + { + std::lock_guard lock(testConfigsMutex_); + testsToRun = testConfigs_; + } + + // Clear previous results + { + std::lock_guard lock(resultsMutex_); + 
testResults_.clear(); + } + + // Sequential execution + for(const auto& testConfig : testsToRun) + { + auto startTime = std::chrono::steady_clock::now(); + + int stdout_fd[2], stderr_fd[2]; + if(!createOutputPipes(stdout_fd, stderr_fd)) + { + std::cerr << "Failed to create output files for test '" << testConfig.name << "'" + << std::endl; + continue; + } + + pid_t pid = fork(); + + if(pid == 0) + { + redirectOutputToPipes(stdout_fd, stderr_fd); + int result = runTestInProcess(testConfig); + // Use _exit() instead of exit() to avoid atexit handlers + // This prevents GPU runtime cleanup issues after fork + _exit(result); + } + else if(pid > 0) + { + // Log test start with environment variables if any + if(!testConfig.environmentVariables.empty()) + { + std::string envVars; + for(const auto& [name, value] : testConfig.environmentVariables) + { + if(!envVars.empty()) + envVars += ", "; + envVars += name + "=" + value; + } + INFO( + "Running isolated test '%s' (PID: %d) with env: %s\n", + testConfig.name.c_str(), + pid, + envVars.c_str() + ); + } + else + { + INFO("Running isolated test '%s' (PID: %d)\n", testConfig.name.c_str(), pid); + } + int status; + CapturedOutput output = captureProcessOutput(stdout_fd, stderr_fd, pid, &status); + + auto endTime = std::chrono::steady_clock::now(); + auto duration + = std::chrono::duration_cast(endTime - startTime); + + TestResult testResult; + testResult.testName = testConfig.name; + testResult.processId = pid; + testResult.duration = duration; + + if(WIFEXITED(status)) + { + int exitCode = WEXITSTATUS(status); + testResult.exitCode = exitCode; + testResult.passed = (exitCode == RCCL_TEST_SUCCESS); + testResult.skipped = (exitCode == RCCL_TEST_SKIPPED); + + if(exitCode == RCCL_TEST_SUCCESS) + { + INFO("Test '%s' PASSED (%ld ms)\n", testConfig.name.c_str(), duration.count()); + } + else if(exitCode == RCCL_TEST_TIMEOUT) + { + INFO( + "Test '%s' (PID: %d) TIMED OUT after %ld ms\n", + testConfig.name.c_str(), + pid, + 
duration.count() + ); + testResult.errorMessage = "Test timed out"; + } + else if(exitCode == RCCL_TEST_SKIPPED) + { + INFO( + "Test '%s' (PID: %d) SKIPPED in %ld ms\n", + testConfig.name.c_str(), + pid, + duration.count() + ); + testResult.errorMessage = "Test skipped"; + } + else + { + INFO( + "Test '%s' (PID: %d) FAILED with exit code %d after %ld ms\n", + testConfig.name.c_str(), + pid, + exitCode, + duration.count() + ); + testResult.errorMessage + = "Test failed with exit code " + std::to_string(exitCode); + } + } + else if(WIFSIGNALED(status)) + { + int signal = WTERMSIG(status); + + // Check if test reported success before signal termination + bool testPassed = (output.stdoutContent.find("PASSED") != std::string::npos); + + if(testPassed) + { + // Test completed successfully before signal (e.g., GPU runtime cleanup) + testResult.passed = true; + testResult.skipped = false; + testResult.exitCode = RCCL_TEST_SUCCESS; + INFO("Test '%s' PASSED (%ld ms)\n", testConfig.name.c_str(), duration.count()); + } + else + { + // Test terminated by signal before completion (crash) + testResult.passed = false; + testResult.skipped = false; + testResult.exitCode = -signal; + testResult.errorMessage = "Terminated by signal " + std::to_string(signal); + INFO( + "Test '%s' (PID: %d) terminated by signal %d after %ld ms\n", + testConfig.name.c_str(), + pid, + signal, + duration.count() + ); + } + } + else + { + testResult.passed = false; + testResult.skipped = false; + testResult.exitCode = RCCL_TEST_INVALID; + testResult.errorMessage = "Failed to wait for process"; + } + + displayCapturedOutput(output, testConfig.name); + + recordTestResult(testResult); + + // Stop on first failure if requested + if(options.stopOnFirstFailure && !testResult.passed && !testResult.skipped) + { + break; + } + } + else + { + // Fork failed + TestResult testResult; + testResult.testName = testConfig.name; + testResult.passed = false; + testResult.skipped = false; + testResult.exitCode = 
RCCL_TEST_INVALID; + testResult.processId = RCCL_TEST_INVALID; + testResult.duration = std::chrono::milliseconds(0); + testResult.errorMessage = "Failed to fork process"; + + recordTestResult(testResult); + INFO("Failed to fork process for test '%s'\n", testConfig.name.c_str()); + + if(options.stopOnFirstFailure) + { + break; + } + } + } + + bool result = generateReport(options); + + // Automatically clear test configurations and results after execution + // This ensures a clean state for the next test suite without requiring + // explicit clear() calls from test cases + { + std::lock_guard lock(testConfigsMutex_); + testConfigs_.clear(); + } + { + std::lock_guard lock(resultsMutex_); + testResults_.clear(); + } + + return result; +} + +// Generate and display test report +bool ProcessIsolatedTestRunner::generateReport(const ExecutionOptions& options) +{ + int totalTests = 0; + int passedTests = 0; + int failedTests = 0; + int skippedTests = 0; + std::chrono::milliseconds totalDuration{0}; + + { + std::lock_guard lock(resultsMutex_); + totalTests = testResults_.size(); + + for(const auto& result : testResults_) + { + if(result.skipped) + { + skippedTests++; + } + else if(result.passed) + { + passedTests++; + } + else + { + failedTests++; + } + totalDuration += result.duration; + } + } + + // Report summary only if there are failures or multiple tests + if(failedTests > 0 || totalTests > 1) + { + INFO( + "Process-Isolated Tests: %d passed, %d failed, %d skipped (%ld ms total)\n", + passedTests, + failedTests, + skippedTests, + totalDuration.count() + ); + + if(failedTests > 0) + { + std::lock_guard lock(resultsMutex_); + for(const auto& result : testResults_) + { + if(!result.passed && !result.skipped) + { + INFO( + " Failed: %s - %s\n", + result.testName.c_str(), + result.errorMessage.c_str() + ); + } + } + } + } + + return failedTests == 0; +} + +// Get detailed test results (thread-safe) +std::vector ProcessIsolatedTestRunner::getTestResults() +{ + 
std::lock_guard lock(resultsMutex_); + return testResults_; +} + +// Clear test registry and results (thread-safe) +void ProcessIsolatedTestRunner::clear() +{ + size_t registeredCount = 0; + size_t executedCount = 0; + + // Check for unexecuted tests before clearing + { + std::lock_guard lock(testConfigsMutex_); + registeredCount = testConfigs_.size(); + } + { + std::lock_guard lock(resultsMutex_); + executedCount = testResults_.size(); + } + + // Warn if tests were registered but not all executed + if(registeredCount > 0 && executedCount < registeredCount) + { + std::cerr << "\n⚠️ WARNING: ProcessIsolatedTestRunner::clear() called with " + << (registeredCount - executedCount) << " unexecuted test(s)!\n" + << " Registered: " << registeredCount << " test(s)\n" + << " Executed: " << executedCount << " test(s)\n" + << " Did you forget to call executeAllTests()?\n" + << std::endl; + } + + // Clear the registrations and results + { + std::lock_guard lock(testConfigsMutex_); + testConfigs_.clear(); + } + { + std::lock_guard lock(resultsMutex_); + testResults_.clear(); + } +} + +// Get number of registered tests +size_t ProcessIsolatedTestRunner::getTestCount() +{ + std::lock_guard lock(testConfigsMutex_); + return testConfigs_.size(); +} + +} // namespace RcclUnitTesting diff --git a/projects/rccl/test/common/ProcessIsolatedTestRunner.hpp b/projects/rccl/test/common/ProcessIsolatedTestRunner.hpp new file mode 100644 index 0000000000..aaed55f910 --- /dev/null +++ b/projects/rccl/test/common/ProcessIsolatedTestRunner.hpp @@ -0,0 +1,365 @@ +/************************************************************************* + * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace RcclUnitTesting +{ + +/** + * @brief Generic thread-safe process isolated test runner + * + * This class provides a framework for running tests in isolated processes + * with clean environment settings and sequential execution. + * + */ +class ProcessIsolatedTestRunner +{ +public: + /** + * @brief Test execution result structure + */ + struct TestResult + { + std::string testName; ///< Name of the test + bool passed; ///< Whether the test passed + bool skipped; ///< Whether the test skipped + int exitCode; ///< Process exit code + pid_t processId; ///< Process ID that ran the test + std::chrono::milliseconds duration; ///< Test execution duration + std::string errorMessage; ///< Error message if test failed + std::unordered_map environment; ///< Environment variables used + + /** + * @brief Default constructor + */ + TestResult(); + }; + + /** + * @brief Test configuration structure + */ + struct TestConfig + { + std::string name; ///< Test name + std::function testLogic; ///< Test function to execute + std::unordered_map + environmentVariables; ///< Environment variables to set + std::chrono::seconds timeout; ///< Test timeout + bool inheritParentEnv; ///< Whether to inherit parent environment + std::vector clearEnvVars; ///< Environment variables to explicitly clear + + /** + * @brief Constructor + * @param testName Name of the test + * @param logic Test function to execute + */ + TestConfig(const std::string& testName, std::function logic); + + /** + * @brief Set environment variables for this test + * @param env Map of environment variable name-value pairs + * @return Reference to this TestConfig for method chaining + */ + TestConfig& withEnvironment(const std::unordered_map& env); + + /** + * @brief Set timeout for this 
test + * @param timeoutSeconds Timeout in seconds + * @return Reference to this TestConfig for method chaining + */ + TestConfig& withTimeout(std::chrono::seconds timeoutSeconds); + + /** + * @brief Configure environment inheritance + * @param inherit Whether to inherit parent environment variables + * @return Reference to this TestConfig for method chaining + */ + TestConfig& withCleanEnvironment(bool inherit = false); + + /** + * @brief Clear a specific environment variable + * @param varName Name of the variable to clear + * @return Reference to this TestConfig for method chaining + */ + TestConfig& clearVariable(const std::string& varName); + + /** + * @brief Set a specific environment variable + * @param name Variable name + * @param value Variable value + * @return Reference to this TestConfig for method chaining + */ + TestConfig& setVariable(const std::string& name, const std::string& value); + }; + + /** + * @brief Execution options for test runner + */ + struct ExecutionOptions + { + bool stopOnFirstFailure; ///< Stop execution on first test failure + bool verboseLogging; ///< Enable verbose logging + + /** + * @brief Default constructor with sensible defaults + */ + ExecutionOptions(); + }; + +private: + /** + * @brief Structure to hold captured process output + */ + struct CapturedOutput + { + std::string stdoutContent; ///< Captured stdout content + std::string stderrContent; ///< Captured stderr content + }; + + // Thread-safe static members for test management + static std::mutex testConfigsMutex_; + static std::vector testConfigs_; + static std::mutex resultsMutex_; + static std::vector testResults_; + + /** + * @brief Apply environment variables to current process + * @param config Test configuration containing environment settings + */ + static void applyEnvironmentVariables(const TestConfig& config); + + /** + * @brief Execute a single test in the child process + * @param config Test configuration + * @return Exit code (0 for success, non-zero 
for failure) + */ + static int runTestInProcess(const TestConfig& config); + + /** + * @brief Create pipes for capturing process output + * @param stdoutPipe Array to hold stdout pipe file descriptors [read, write] + * @param stderrPipe Array to hold stderr pipe file descriptors [read, write] + * @return True if pipes were created successfully, false otherwise + */ + static bool createOutputPipes(int stdoutPipe[2], int stderrPipe[2]); + + /** + * @brief Redirect child process output to pipes + * @param stdoutPipe Stdout pipe file descriptors [read, write] + * @param stderrPipe Stderr pipe file descriptors [read, write] + */ + static void redirectOutputToPipes(int stdoutPipe[2], int stderrPipe[2]); + + /** + * @brief Capture output from child process via pipes + * @param stdoutPipe Stdout pipe file descriptors [read, write] + * @param stderrPipe Stderr pipe file descriptors [read, write] + * @param pid Child process ID to monitor + * @param status Pointer to status variable for waitpid + * @return Captured output from stdout and stderr + */ + static CapturedOutput + captureProcessOutput(int stdoutPipe[2], int stderrPipe[2], pid_t pid, int* status); + + /** + * @brief Display captured output with formatted delimiters + * @param output Captured output to display + * @param testName Name of the test for context + */ + static void displayCapturedOutput(const CapturedOutput& output, const std::string& testName); + +public: + /** + * @brief Register a test configuration + * @param config Complete test configuration + */ + static void registerTest(const TestConfig& config); + + /** + * @brief Register a simple test with just name and logic + * @param name Test name + * @param testLogic Test function to execute + */ + static void registerTest(const std::string& name, std::function testLogic); + + /** + * @brief Register a test with environment variables + * @param name Test name + * @param testLogic Test function to execute + * @param env Environment variables to set for 
this test + */ + static void registerTest( + const std::string& name, + std::function testLogic, + const std::unordered_map& env + ); + + /** + * @brief Record a test result (thread-safe) + * @param result Test result to record + */ + static void recordTestResult(const TestResult& result); + + /** + * @brief Execute all registered tests sequentially + * @param options Execution options (defaults to continue on failure) + * @return True if all tests passed, false otherwise + * @note This method automatically clears all test registrations and results + * after execution, ensuring a clean state for the next test suite. + */ + static bool executeAllTests(const ExecutionOptions& options = ExecutionOptions()); + + /** + * @brief Generate and display test report + * @param options Execution options used for the test run + * @return True if all tests passed, false otherwise + */ + static bool generateReport(const ExecutionOptions& options); + + /** + * @brief Get detailed test results (thread-safe) + * @return Vector of all test results + */ + static std::vector getTestResults(); + + /** + * @brief Clear test registry and results (thread-safe) + * @note Calling this method manually is typically not necessary, as + * executeAllTests() automatically clears registrations after execution. + * This method is primarily useful for advanced use cases or when tests + * are registered but not executed. + */ + static void clear(); + + /** + * @brief Get number of registered tests + * @return Number of registered tests + */ + static size_t getTestCount(); +}; + +// Macros for Simplified Usage + +/** + * @brief Register and execute a single isolated test with minimal boilerplate + * + * Uses variadic macros to automatically handle commas in lambda bodies + * + * @param test_name Name of the test (string) + * @param ... 
Lambda containing test logic (variadic to handle internal commas) + * + * Example: + * RUN_ISOLATED_TEST("MyTest", []() { + * EXPECT_TRUE(someFunction()); + * }); + */ +#define RUN_ISOLATED_TEST(test_name, ...) \ + do \ + { \ + ::RcclUnitTesting::ProcessIsolatedTestRunner::registerTest(test_name, __VA_ARGS__); \ + bool passed_ = ::RcclUnitTesting::ProcessIsolatedTestRunner::executeAllTests(); \ + EXPECT_TRUE(passed_) << "Isolated test '" << test_name << "' failed"; \ + } \ + while(0) + +/** + * @brief Register and execute a single isolated test with environment variables + * + * Uses variadic macros to automatically handle environment variable initializer lists + * + * @param test_name Name of the test (string) + * @param test_body Lambda containing test logic + * @param ... Environment variables as initializer list + * + * Example: + * RUN_ISOLATED_TEST_WITH_ENV("MyTest", + * []() { EXPECT_TRUE(someFunction()); }, + * {{"VAR1", "value1"}, {"VAR2", "value2"}}); + * + * Note: Uses __VA_ARGS__ to capture environment variables, which automatically + * handles commas in the initializer list without requiring extra parentheses. + */ +#define RUN_ISOLATED_TEST_WITH_ENV(test_name, test_body, ...) \ + do \ + { \ + ::RcclUnitTesting::ProcessIsolatedTestRunner::registerTest( \ + test_name, \ + test_body, \ + __VA_ARGS__ \ + ); \ + bool passed_ = ::RcclUnitTesting::ProcessIsolatedTestRunner::executeAllTests(); \ + EXPECT_TRUE(passed_) << "Isolated test '" << test_name << "' failed"; \ + } \ + while(0) + +/** + * @brief Register and execute multiple isolated tests with default options + * + * This macro takes multiple TestConfig objects and executes them all. + * Tests are automatically cleaned up after execution. + * + * Example: + * RUN_ISOLATED_TESTS( + * ProcessIsolatedTestRunner::TestConfig("Test1", []() { ... }), + * ProcessIsolatedTestRunner::TestConfig("Test2", []() { ... 
}) + * .withEnvironment({{"VAR", "value"}}), + * ProcessIsolatedTestRunner::TestConfig("Test3", []() { ... }) + * .withTimeout(std::chrono::seconds(60)) + * ); + */ +#define RUN_ISOLATED_TESTS(...) \ + do \ + { \ + ::RcclUnitTesting::ProcessIsolatedTestRunner::TestConfig configs_[] = {__VA_ARGS__}; \ + for(const auto& config_ : configs_) \ + { \ + ::RcclUnitTesting::ProcessIsolatedTestRunner::registerTest(config_); \ + } \ + bool passed_ = ::RcclUnitTesting::ProcessIsolatedTestRunner::executeAllTests(); \ + EXPECT_TRUE(passed_) << "One or more isolated tests failed"; \ + } \ + while(0) + +/** + * @brief Register and execute multiple isolated tests with custom options + * + * This macro takes execution options and multiple TestConfig objects. + * + * Example: + * ProcessIsolatedTestRunner::ExecutionOptions opts; + * opts.stopOnFirstFailure = true; + * opts.verboseLogging = true; + * + * RUN_ISOLATED_TESTS_WITH_OPTIONS(opts, + * ProcessIsolatedTestRunner::TestConfig("Test1", []() { ... }), + * ProcessIsolatedTestRunner::TestConfig("Test2", []() { ... }) + * ); + */ +#define RUN_ISOLATED_TESTS_WITH_OPTIONS(options, ...) \ + do \ + { \ + ::RcclUnitTesting::ProcessIsolatedTestRunner::TestConfig configs_[] = {__VA_ARGS__}; \ + for(const auto& config_ : configs_) \ + { \ + ::RcclUnitTesting::ProcessIsolatedTestRunner::registerTest(config_); \ + } \ + bool passed_ = ::RcclUnitTesting::ProcessIsolatedTestRunner::executeAllTests(options); \ + EXPECT_TRUE(passed_) << "One or more isolated tests failed"; \ + } \ + while(0) + +} // namespace RcclUnitTesting diff --git a/projects/rccl/test/common/ProcessIsolatedTestRunner.md b/projects/rccl/test/common/ProcessIsolatedTestRunner.md new file mode 100644 index 0000000000..63d1fabe91 --- /dev/null +++ b/projects/rccl/test/common/ProcessIsolatedTestRunner.md @@ -0,0 +1,1130 @@ +# Process Isolated Test Runner + +A lightweight C++ testing framework for running Google Test cases in isolated processes with clean environment settings. 
+ +## Table of Contents +- [Overview](#overview) +- [Why Use Process Isolation?](#why-use-process-isolation) +- [Quick Start](#quick-start) +- [Core Concepts](#core-concepts) +- [API Reference](#api-reference) +- [Examples](#examples) +- [Best Practices](#best-practices) +- [Troubleshooting](#troubleshooting) + +--- + +## Overview + +`ProcessIsolatedTestRunner` is a framework that executes tests in separate processes using `fork()`. This ensures complete isolation between tests, particularly useful when testing code with static variables or environment-dependent behavior. + +**Key Features:** +- ✅ Process-based test isolation (each test runs in its own process) +- ✅ Per-test environment variable management +- ✅ Configurable timeouts +- ✅ Sequential or stop-on-failure execution +- ✅ Thread-safe test registration +- ✅ Detailed test result reporting + +**Location:** `test/common/ProcessIsolatedTestRunner.hpp` + +--- + +## Why use Process Isolation? + +### Problem: Static Variable Pollution + +Consider this RCCL code with static variables: + +```cpp +void rcclSetP2pNetChunkSize(struct ncclComm* comm, int& chunkSize) { + static int p2pNetChunkSize = RCCL_VALUE_UNSET; // ← Static variable! + + if (p2pNetChunkSize == RCCL_VALUE_UNSET) { + const char* inputStr = getenv("NCCL_P2P_NET_CHUNKSIZE"); + if (inputStr) { + // Parse the environment variable value + p2pNetChunkSize = parseValue(inputStr); // e.g., "12345" → 12345 + } else { + // No env var set, calculate value based on architecture... + p2pNetChunkSize = calculateValue(); + } + } + chunkSize = p2pNetChunkSize; +} +``` + +**How the static variable gets set:** +1. First time called: `p2pNetChunkSize == RCCL_VALUE_UNSET` is true +2. Code reads environment variable with `getenv("NCCL_P2P_NET_CHUNKSIZE")` +3. If env var exists → parse its value (e.g., "12345" string) and assign to static variable +4. If env var doesn't exist → calculate default value and assign to static variable +5. 
Static variable is now set and **persists for the lifetime of the process** + +**Without Process Isolation:** +```cpp +TEST(MyTest, FirstTest) { + setenv("NCCL_P2P_NET_CHUNKSIZE", "12345", 1); + rcclSetP2pNetChunkSize(comm, chunkSize); + // ✓ getenv() returns "12345" + // ✓ Static variable p2pNetChunkSize gets set to 12345 + // ✓ chunkSize is now 12345 +} + +TEST(MyTest, SecondTest) { + unsetenv("NCCL_P2P_NET_CHUNKSIZE"); + rcclSetP2pNetChunkSize(comm, chunkSize); + // ❌ getenv() returns nullptr (env var cleared) + // ❌ BUT: p2pNetChunkSize != RCCL_VALUE_UNSET (still 12345 from FirstTest!) + // ❌ Code skips the if-block, never reads env var or recalculates + // ❌ chunkSize is STILL 12345 from previous test! + // This test will fail or produce incorrect results +} +``` + +**The Problem:** Static variables are initialized once per process and persist across multiple tests. Even if you change or clear environment variables, the static variable retains its old value. + +**With Process Isolation:** +```cpp +// Each test runs in a separate process +// Static variables are reset for each test +// ✅ Tests are truly independent +``` + +### Common Use Cases + +1. **Testing environment variable behavior** - When code reads env vars into static variables +2. **Testing architecture-specific logic** - Different GPU architectures with cached state +3. **Testing initialization code** - One-time initialization patterns +4. **Testing configuration changes** - When config is cached statically + +--- + +## Quick Start + +### Basic Example (Using Macros) + +The simplest way to use ProcessIsolatedTestRunner is with the macros: + +```cpp +#include "common/ProcessIsolatedTestRunner.hpp" + +TEST(Rcclwrap, MyIsolatedTest) { + // Single test with environment variables - all in one call! 
+ RUN_ISOLATED_TEST_WITH_ENV("TestWithCleanEnvironment", + []() { + // This runs in a separate process + const char* value = getenv("MY_VARIABLE"); + EXPECT_STREQ(value, "test_value"); + EXPECT_TRUE(someFunction()); + }, + {{"MY_VARIABLE", "test_value"}} + ); +} + +TEST(Rcclwrap, MyIsolatedTests) { + // Multiple tests with different configurations + RUN_ISOLATED_TESTS( + ProcessIsolatedTestRunner::TestConfig("Test1", []() { + EXPECT_TRUE(checkCondition1()); + }), + ProcessIsolatedTestRunner::TestConfig("Test2", []() { + EXPECT_TRUE(checkCondition2()); + }).withEnvironment({{"VAR", "value"}}), + ProcessIsolatedTestRunner::TestConfig("Test3", []() { + EXPECT_TRUE(checkCondition3()); + }).withTimeout(std::chrono::seconds(60)) + ); +} +``` + +### Manual API (For Advanced Use Cases) + +You can also use the API directly for more control: + +```cpp +#include "common/ProcessIsolatedTestRunner.hpp" + +TEST(Rcclwrap, MyIsolatedTests) { + // Register a test with environment variables + ProcessIsolatedTestRunner::registerTest( + ProcessIsolatedTestRunner::TestConfig( + "TestWithCleanEnvironment", + []() { + // This runs in a separate process + const char* value = getenv("MY_VARIABLE"); + EXPECT_STREQ(value, "test_value"); + + // Your test logic here + EXPECT_TRUE(someFunction()); + }) + .withEnvironment({{"MY_VARIABLE", "test_value"}}) + ); + + // Execute all registered tests + bool allTestsPassed = ProcessIsolatedTestRunner::executeAllTests(); + EXPECT_TRUE(allTestsPassed); +} +``` + +--- + +## Core Concepts + +### 1. Test Configuration (`TestConfig`) + +Defines how a test should be executed: + +```cpp +TestConfig config( + "TestName", // Test name (for reporting) + []() { /* logic */ } // Test function (lambda or function pointer) +); + +// Optional configurations +config.withEnvironment({{"VAR1", "value1"}, {"VAR2", "value2"}}) + .withTimeout(std::chrono::seconds(60)) + .withCleanEnvironment(false); // false = do NOT inherit the parent environment (the default is to inherit) +``` + +### 2. 
Test Registration + +Tests must be registered before execution: + +```cpp +// Method 1: Full configuration +ProcessIsolatedTestRunner::registerTest(config); + +// Method 2: Simple (name + logic only) +ProcessIsolatedTestRunner::registerTest("SimpleTest", []() { + EXPECT_TRUE(true); +}); + +// Method 3: With environment +ProcessIsolatedTestRunner::registerTest( + "EnvTest", + []() { /* logic */ }, + {{"ENV_VAR", "value"}} +); +``` + +### 3. Test Execution + +**⚠️ IMPORTANT:** Tests do NOT run automatically after registration. You **MUST** explicitly call `executeAllTests()` to run them. + +Execute all registered tests: + +```cpp +// Default options (continue on failure, verbose logging enabled) +bool passed = ProcessIsolatedTestRunner::executeAllTests(); + +// Custom options +ProcessIsolatedTestRunner::ExecutionOptions options; +options.stopOnFirstFailure = true; // Stop after first failure +options.verboseLogging = true; // Print detailed logs + +bool passed = ProcessIsolatedTestRunner::executeAllTests(options); +``` + +**Common Mistake:** +```cpp +// ❌ BAD: Tests registered but never executed! +TEST(MyTest, IsolatedTests) { + ProcessIsolatedTestRunner::registerTest("Test1", []() { /* ... */ }); + ProcessIsolatedTestRunner::registerTest("Test2", []() { /* ... */ }); + // Missing executeAllTests() - tests will NOT run! +} + +// ✅ GOOD: Tests registered and executed +TEST(MyTest, IsolatedTests) { + ProcessIsolatedTestRunner::registerTest("Test1", []() { /* ... */ }); + ProcessIsolatedTestRunner::registerTest("Test2", []() { /* ... */ }); + bool passed = ProcessIsolatedTestRunner::executeAllTests(); + EXPECT_TRUE(passed); +} +``` + +### 4. 
Test Results + +Each test produces a `TestResult`: + +```cpp +struct TestResult { + std::string testName; // Name of the test + bool passed; // Whether the test passed + bool skipped; // Whether the test was skipped + int exitCode; // Process exit code + pid_t processId; // Process ID that ran the test + std::chrono::milliseconds duration; // Execution duration + std::string errorMessage; // Error message if failed + std::unordered_map environment; // Env used +}; +``` + +--- + +## API Reference + +### Macros (Recommended) + +These macros provide the simplest way to use ProcessIsolatedTestRunner with minimal boilerplate. + +#### `RUN_ISOLATED_TEST(test_name, test_body)` +Register and execute a single isolated test. + +```cpp +RUN_ISOLATED_TEST("MySimpleTest", []() { + EXPECT_TRUE(someFunction()); +}); +``` + +#### `RUN_ISOLATED_TEST_WITH_ENV(test_name, test_body, ...)` +Register and execute a single isolated test with environment variables. + +**Uses variadic macros** (`...` and `__VA_ARGS__`) to automatically handle commas in initializer lists without requiring extra parentheses. + +```cpp +RUN_ISOLATED_TEST_WITH_ENV("MyEnvTest", + []() { + const char* value = getenv("MY_VAR"); + EXPECT_STREQ(value, "expected_value"); + }, + {{"MY_VAR", "expected_value"}} +); + +// Multiple environment variables work naturally: +RUN_ISOLATED_TEST_WITH_ENV("MultiEnvTest", + []() { /* test code */ }, + {{"VAR1", "val1"}, {"VAR2", "val2"}, {"VAR3", "val3"}} // Commas handled automatically +); +``` + +**Note:** The macro uses `__VA_ARGS__` internally, which automatically handles commas in the environment variable initializer list. Users don't need to worry about preprocessor comma issues. + +#### `RUN_ISOLATED_TESTS(...)` +Register and execute multiple isolated tests with various configurations. + +```cpp +RUN_ISOLATED_TESTS( + ProcessIsolatedTestRunner::TestConfig("Test1", []() { ... }), + ProcessIsolatedTestRunner::TestConfig("Test2", []() { ... 
}) + .withEnvironment({{"VAR", "value"}}), + ProcessIsolatedTestRunner::TestConfig("Test3", []() { ... }) + .withTimeout(std::chrono::seconds(60)) +); +``` + +#### `RUN_ISOLATED_TESTS_WITH_OPTIONS(options, ...)` +Register and execute multiple isolated tests with custom execution options. + +```cpp +ProcessIsolatedTestRunner::ExecutionOptions opts; +opts.stopOnFirstFailure = true; +opts.verboseLogging = true; + +RUN_ISOLATED_TESTS_WITH_OPTIONS(opts, + ProcessIsolatedTestRunner::TestConfig("Test1", []() { ... }), + ProcessIsolatedTestRunner::TestConfig("Test2", []() { ... }) +); +``` + +### Main Methods (For Manual Use) + +#### `registerTest()` +Register a test for later execution. + +```cpp +// Variant 1: Full configuration +static void registerTest(const TestConfig& config); + +// Variant 2: Simple registration +static void registerTest( + const std::string& name, + std::function testLogic +); + +// Variant 3: With environment +static void registerTest( + const std::string& name, + std::function testLogic, + const std::unordered_map& env +); +``` + +#### `executeAllTests()` +Execute all registered tests sequentially. + +```cpp +static bool executeAllTests( + const ExecutionOptions& options = ExecutionOptions() +); +``` + +**Returns:** `true` if all tests passed, `false` if any failed. + +**Note:** This method automatically clears all test registrations and results after execution, ensuring a clean state for the next test suite. Users do not need to call `clear()` manually. + +#### `getTestResults()` +Retrieve detailed results from the last execution. + +```cpp +static std::vector getTestResults(); +``` + +#### `clear()` +Clear all registered tests and results. + +```cpp +static void clear(); +``` + +**Note:** Calling this method manually is typically not necessary, as `executeAllTests()` automatically clears registrations after execution. This method is primarily useful for advanced use cases or when tests are registered but not executed. 
+ +**⚠️ Automatic Warning:** If `clear()` is called when tests have been registered but not fully executed, it will automatically print a warning to stderr: + +``` +⚠️ WARNING: ProcessIsolatedTestRunner::clear() called with 2 unexecuted test(s)! + Registered: 2 test(s) + Executed: 0 test(s) + Did you forget to call executeAllTests()? +``` + +#### `getTestCount()` +Get the number of currently registered tests (before execution). + +```cpp +static size_t getTestCount(); +``` + +**Use case:** Verify that tests were actually registered and executed. + +```cpp +TEST(MyTest, VerifyExecution) { + ProcessIsolatedTestRunner::clear(); + + // Register tests + ProcessIsolatedTestRunner::registerTest("Test1", []() { /* ... */ }); + ProcessIsolatedTestRunner::registerTest("Test2", []() { /* ... */ }); + + // Check registration count + size_t registeredCount = ProcessIsolatedTestRunner::getTestCount(); + EXPECT_EQ(registeredCount, 2) << "Expected 2 tests to be registered"; + + // Execute + bool passed = ProcessIsolatedTestRunner::executeAllTests(); + EXPECT_TRUE(passed); + + // Verify execution count + auto results = ProcessIsolatedTestRunner::getTestResults(); + EXPECT_EQ(results.size(), registeredCount) + << "Registered " << registeredCount << " tests but only " + << results.size() << " executed"; +} +``` + +### TestConfig Methods + +#### `withEnvironment()` +Set environment variables for the test. + +```cpp +TestConfig& withEnvironment( + const std::unordered_map& env +); +``` + +**Note:** Variables are set in the child process only. + +#### `withTimeout()` +Set a timeout for test execution. + +```cpp +TestConfig& withTimeout(std::chrono::seconds timeoutSeconds); +``` + +**Default:** 30 seconds + +#### `withCleanEnvironment()` +Control whether to inherit parent process environment. 
+ +```cpp +TestConfig& withCleanEnvironment(bool inherit = true); +``` + +**Default:** `true` (inherits parent environment) + +--- + +## Examples + +**Note:** The examples below use helper functions from `RcclWrapTests.cpp`: + +```cpp +// Helper to create a mock NCCL communicator with specified architecture and ranks +static void CreateMockComm(ncclComm_t &mockComm, + struct ncclTopoSystem &mockTopo, + struct ncclTopoNode &mockGpuNode, + const char *arch, + int nRanks); + +// Helper to cleanup a mock communicator +static void CleanupMockComm(ncclComm_t &mockComm); +``` + +### Example 1: Testing Environment Variable Behavior + +```cpp +TEST(Rcclwrap, EnvironmentVariableTests) { + // Test 1: With environment variable set + ProcessIsolatedTestRunner::registerTest( + ProcessIsolatedTestRunner::TestConfig( + "WithEnvVarSet", + []() { + ncclComm_t mockComm = nullptr; + struct ncclTopoSystem mockTopo; + struct ncclTopoNode mockGpuNode; + CreateMockComm(mockComm, mockTopo, mockGpuNode, "gfx942", 128); + + int chunkSize = RCCL_VALUE_UNSET; + rcclSetP2pNetChunkSize(mockComm, chunkSize); + + // Should use default architecture-based value + EXPECT_EQ(chunkSize, 1 << 19); + + CleanupMockComm(mockComm); + }) + .withEnvironment({{"NCCL_P2P_NET_CHUNKSIZE", "999999"}}) + ); + + // Test 2: Without environment variable (clean state) + ProcessIsolatedTestRunner::registerTest( + ProcessIsolatedTestRunner::TestConfig( + "WithoutEnvVar", + []() { + // Verify environment is clean + const char* value = getenv("NCCL_P2P_NET_CHUNKSIZE"); + EXPECT_EQ(value, nullptr); + + // Test default behavior + ncclComm_t mockComm = nullptr; + struct ncclTopoSystem mockTopo; + struct ncclTopoNode mockGpuNode; + CreateMockComm(mockComm, mockTopo, mockGpuNode, "gfx942", 32); + + int chunkSize = RCCL_VALUE_UNSET; + rcclSetP2pNetChunkSize(mockComm, chunkSize); + EXPECT_EQ(chunkSize, 1 << 17); // Default for < 64 ranks + + CleanupMockComm(mockComm); + }) + ); + + // Execute both tests in isolated processes + 
bool passed = ProcessIsolatedTestRunner::executeAllTests(); + EXPECT_TRUE(passed); +} +``` + +### Example 2: Testing Multiple Architectures + +```cpp +TEST(Rcclwrap, ArchitectureTests) { + struct TestCase { + std::string name; + std::string arch; + int ranks; + int expectedChunkSize; + }; + + std::vector testCases = { + {"GFX942_SmallRanks", "gfx942", 32, 1 << 17}, + {"GFX942_LargeRanks", "gfx942", 128, 1 << 19}, + {"GFX950_SmallRanks", "gfx950", 8, 1 << 17}, + {"GFX950_MediumRanks", "gfx950", 24, 1 << 18}, + {"GFX950_LargeRanks", "gfx950", 64, 1 << 19}, + }; + + for (const auto& tc : testCases) { + ProcessIsolatedTestRunner::registerTest( + ProcessIsolatedTestRunner::TestConfig( + tc.name, + [tc]() { + ncclComm_t mockComm = nullptr; + struct ncclTopoSystem mockTopo; + struct ncclTopoNode mockGpuNode; + CreateMockComm(mockComm, mockTopo, mockGpuNode, tc.arch.c_str(), tc.ranks); + + int chunkSize = RCCL_VALUE_UNSET; + rcclSetP2pNetChunkSize(mockComm, chunkSize); + + EXPECT_EQ(chunkSize, tc.expectedChunkSize) + << "Failed for " << tc.arch << " with " << tc.ranks << " ranks"; + + CleanupMockComm(mockComm); + }) + ); + } + + ProcessIsolatedTestRunner::ExecutionOptions options; + options.verboseLogging = true; + options.stopOnFirstFailure = false; // Run all tests even if one fails + + bool passed = ProcessIsolatedTestRunner::executeAllTests(options); + EXPECT_TRUE(passed); +} +``` + +### Example 3: Testing with Timeouts + +```cpp +TEST(Rcclwrap, TimeoutHandling) { + // Test that completes quickly + ProcessIsolatedTestRunner::registerTest( + ProcessIsolatedTestRunner::TestConfig( + "FastTest", + []() { + EXPECT_TRUE(true); + }) + .withTimeout(std::chrono::seconds(5)) + ); + + // Test with longer timeout for complex operations + ProcessIsolatedTestRunner::registerTest( + ProcessIsolatedTestRunner::TestConfig( + "SlowTest", + []() { + // Simulate slow operation + std::this_thread::sleep_for(std::chrono::seconds(2)); + EXPECT_TRUE(true); + }) + 
.withTimeout(std::chrono::seconds(10)) + ); + + bool passed = ProcessIsolatedTestRunner::executeAllTests(); + EXPECT_TRUE(passed); +} +``` + +### Example 4: Stop on First Failure + +```cpp +TEST(Rcclwrap, CriticalTests) { + // Register multiple critical tests + ProcessIsolatedTestRunner::registerTest( + "CriticalTest1", []() { EXPECT_TRUE(checkCriticalCondition1()); }); + + ProcessIsolatedTestRunner::registerTest( + "CriticalTest2", []() { EXPECT_TRUE(checkCriticalCondition2()); }); + + ProcessIsolatedTestRunner::registerTest( + "CriticalTest3", []() { EXPECT_TRUE(checkCriticalCondition3()); }); + + // Stop on first failure - don't waste time if critical tests fail + ProcessIsolatedTestRunner::ExecutionOptions options; + options.stopOnFirstFailure = true; + + bool passed = ProcessIsolatedTestRunner::executeAllTests(options); + EXPECT_TRUE(passed) << "Critical test suite failed"; +} +``` + +--- + +## Best Practices + +### 1. Use Macros for Simple Cases + +```cpp +// ✅ GOOD: Simple and clean using macros +TEST(MyTest, SimpleIsolatedTest) { + RUN_ISOLATED_TEST("CheckSomething", []() { + EXPECT_TRUE(checkSomething()); + }); +} + +// ❌ MORE VERBOSE: Manual registration (still valid for complex cases) +TEST(MyTest, SimpleIsolatedTest) { + ProcessIsolatedTestRunner::registerTest("CheckSomething", []() { + EXPECT_TRUE(checkSomething()); + }); + bool passed = ProcessIsolatedTestRunner::executeAllTests(); + EXPECT_TRUE(passed); +} +``` + +### 2. Always Execute Registered Tests (When Using Manual API) + +```cpp +TEST(MyTest, IsolatedTests) { + // Register tests + ProcessIsolatedTestRunner::registerTest(/* ... */); + + // ✅ IMPORTANT: Don't forget to execute! 
+ bool passed = ProcessIsolatedTestRunner::executeAllTests(); + EXPECT_TRUE(passed); +} +``` + +**When Using Manual API (Optional Verification):** + +You can verify that tests were registered and executed: + +```cpp +TEST(MyTest, IsolatedTests) { + // Register tests + ProcessIsolatedTestRunner::registerTest("Test1", []() { /* ... */ }); + ProcessIsolatedTestRunner::registerTest("Test2", []() { /* ... */ }); + + // Get count of registered tests + size_t registeredCount = ProcessIsolatedTestRunner::getTestCount(); + EXPECT_EQ(registeredCount, 2) << "Expected 2 tests to be registered"; + + // Execute all tests (automatically clears after execution) + bool passed = ProcessIsolatedTestRunner::executeAllTests(); + EXPECT_TRUE(passed); + + // Optional: Verify execution count matches registration count + auto results = ProcessIsolatedTestRunner::getTestResults(); + EXPECT_EQ(results.size(), registeredCount) + << "Registered " << registeredCount << " but executed " << results.size(); +} +``` + +### 3. Use Descriptive Test Names + +```cpp +// ❌ BAD: Vague name +RUN_ISOLATED_TEST("Test1", []() { /* ... */ }); + +// ✅ GOOD: Descriptive name +RUN_ISOLATED_TEST("GFX942_LargeRanks_P2PChunkSize_ExpectHighValue", + []() { /* ... */ } +); +``` + +### 4. Group Related Tests + +```cpp +TEST(Rcclwrap, AllP2PChunkSizeTests) { + // Using macros to group related tests + RUN_ISOLATED_TESTS( + ProcessIsolatedTestRunner::TestConfig("GFX942_Test1", []() { ... }), + ProcessIsolatedTestRunner::TestConfig("GFX942_Test2", []() { ... }), + ProcessIsolatedTestRunner::TestConfig("GFX950_Test1", []() { ... }), + ProcessIsolatedTestRunner::TestConfig("GFX950_Test2", []() { ... }) + ); +} +``` + +### 5. 
Use Options for Better Control + +```cpp +// For debugging: verbose + stop on failure +ProcessIsolatedTestRunner::ExecutionOptions debugOptions; +debugOptions.stopOnFirstFailure = true; +debugOptions.verboseLogging = true; + +RUN_ISOLATED_TESTS_WITH_OPTIONS(debugOptions, + ProcessIsolatedTestRunner::TestConfig("Test1", []() { ... }), + ProcessIsolatedTestRunner::TestConfig("Test2", []() { ... }) +); + +// For CI: run all tests, collect all failures +ProcessIsolatedTestRunner::ExecutionOptions ciOptions; +ciOptions.stopOnFirstFailure = false; +ciOptions.verboseLogging = false; + +RUN_ISOLATED_TESTS_WITH_OPTIONS(ciOptions, + ProcessIsolatedTestRunner::TestConfig("Test1", []() { ... }), + ProcessIsolatedTestRunner::TestConfig("Test2", []() { ... }) +); +``` + +### 6. Set Appropriate Timeouts + +```cpp +// ✅ GOOD: Different timeouts for different test types +RUN_ISOLATED_TESTS( + ProcessIsolatedTestRunner::TestConfig("QuickTest", []() { ... }) + .withTimeout(std::chrono::seconds(5)), + ProcessIsolatedTestRunner::TestConfig("NormalTest", []() { ... }) + .withTimeout(std::chrono::seconds(30)), + ProcessIsolatedTestRunner::TestConfig("SlowTest", []() { ... }) + .withTimeout(std::chrono::seconds(120)) +); + +// ❌ BAD: Same long timeout for everything +RUN_ISOLATED_TESTS( + ProcessIsolatedTestRunner::TestConfig("Test1", []() { ... }) + .withTimeout(std::chrono::seconds(300)), + ProcessIsolatedTestRunner::TestConfig("Test2", []() { ... }) + .withTimeout(std::chrono::seconds(300)) +); +``` + +### 7. Clean Up Resources in Tests + +```cpp +RUN_ISOLATED_TEST("ResourceTest", []() { + ncclComm_t comm = nullptr; + struct ncclTopoSystem topo; + struct ncclTopoNode gpuNode; + CreateMockComm(comm, topo, gpuNode, "gfx942", 32); + + try { + // Your test logic + EXPECT_TRUE(someTest(comm)); + + // ✅ GOOD: Clean up in all paths + CleanupMockComm(comm); + } catch (...) { + CleanupMockComm(comm); + throw; + } +}); +``` + +### 8. 
Use RAII for GPU Resource Management + +When tests allocate GPU memory, use RAII wrappers to ensure cleanup: + +```cpp +// ✅ GOOD: RAII ensures cleanup even on failure +struct GPUBuffer { + void* ptr = nullptr; + size_t size; + + GPUBuffer(size_t s) : size(s) { + hipError_t err = hipMalloc(&ptr, size); + // Fatal ASSERT_* macros cannot be used in constructors; use a non-fatal check + EXPECT_EQ(err, hipSuccess); + } + + ~GPUBuffer() { + if (ptr) { + hipFree(ptr); + ptr = nullptr; + } + } + + // Prevent copying + GPUBuffer(const GPUBuffer&) = delete; + GPUBuffer& operator=(const GPUBuffer&) = delete; +}; + +RUN_ISOLATED_TEST("GPUTest", []() { + GPUBuffer buffer(1024); // Automatically cleaned up + // ... test logic ... + // No manual cleanup needed - destructor handles it +}); + +// ❌ BAD: Manual cleanup can be forgotten +RUN_ISOLATED_TEST("GPUTest", []() { + void* buffer; + hipMalloc(&buffer, 1024); + // ... test logic ... + // If test fails before this line, buffer leaks! + hipFree(buffer); +}); +``` + +### 9. Avoid GPU Initialization in Test Fixtures + +When using process isolation, avoid initializing GPU resources in test fixture `SetUp()` methods: + +```cpp +// ❌ BAD: GPU initialization in fixture (runs in parent process) +class GPUTests : public ::testing::Test { +protected: + void SetUp() override { + hipMalloc(&gpuBuffer, 1024); // Parent process - will pollute fork()! + } + void* gpuBuffer; +}; + +// ✅ GOOD: GPU initialization inside isolated test +class GPUTests : public ::testing::Test { + // Empty fixture or only CPU resources in SetUp() +}; + +TEST_F(GPUTests, MyTest) { + RUN_ISOLATED_TEST("GPUOperation", []() { + void* gpuBuffer; + hipMalloc(&gpuBuffer, 1024); // Child process only - safe! + // ... test logic ... 
+ hipFree(gpuBuffer); + }); +} + +// ✅ EVEN BETTER: Use RAII + helper structure +struct GPUTestEnvironment { + void* buffer = nullptr; + void setup() { hipMalloc(&buffer, 1024); } + void cleanup() { if (buffer) { hipFree(buffer); buffer = nullptr; } } // Safe to call more than once + ~GPUTestEnvironment() { cleanup(); } +}; + +TEST_F(GPUTests, MyTest) { + RUN_ISOLATED_TEST("GPUOperation", []() { + GPUTestEnvironment env; + env.setup(); + // ... test logic ... + env.cleanup(); // Explicit + destructor cleanup + }); +} +``` + +--- + +## Troubleshooting + +### Test Hangs / Times Out + +**Symptom:** Test never completes, eventually times out. + +**Solutions:** +1. Increase timeout: `.withTimeout(std::chrono::seconds(120))` +2. Check for deadlocks in test logic +3. Enable verbose logging to see where it hangs: + ```cpp + options.verboseLogging = true; + ``` + +### Environment Variables Not Being Set + +**Symptom:** `getenv()` returns `nullptr` in test. + +**Solutions:** +1. Verify environment variable name is correct +2. Check that you're calling `withEnvironment()`: + ```cpp + config.withEnvironment({{"VAR_NAME", "value"}}) + ``` +3. Verify the test is actually executing (check test name) + +### Tests Pass Individually but Fail Together + +**Symptom:** Individual tests pass, but fail when run in a suite. + +**Cause:** This is the **exact problem** that ProcessIsolatedTestRunner solves! + +**Solution:** Already solved - each test runs in an isolated process. If you're still seeing this, check: +1. Are you using `executeAllTests()` correctly? +2. Are there shared external resources (files, network, etc.)? + +### Fork Failures + +**Symptom:** Error messages about fork() failing. + +**Solutions:** +1. Check system resource limits: `ulimit -u` (max processes) +2. Reduce number of tests or run in smaller batches +3. Check for resource leaks in parent process + +### Test Results Not Available + +**Symptom:** `getTestResults()` returns empty vector. 
+ +**Solution:** +```cpp +// Call executeAllTests() first +ProcessIsolatedTestRunner::executeAllTests(); + +// Then get results +auto results = ProcessIsolatedTestRunner::getTestResults(); +``` + +### Tests Registered but Never Executed + +**Symptom:** Tests pass but you suspect they didn't actually run. + +**Cause:** Forgot to call `executeAllTests()` after registration. + +**Detection:** +```cpp +TEST(MyTest, IsolatedTests) { + // Register tests + ProcessIsolatedTestRunner::registerTest("Test1", []() { EXPECT_TRUE(true); }); + ProcessIsolatedTestRunner::registerTest("Test2", []() { EXPECT_TRUE(true); }); + + // ❌ FORGOT TO CALL executeAllTests()! + + // Later, when the test ends, registered tests are lost +} +``` + +**Solution:** +```cpp +TEST(MyTest, IsolatedTests) { + // Register tests + ProcessIsolatedTestRunner::registerTest("Test1", []() { EXPECT_TRUE(true); }); + ProcessIsolatedTestRunner::registerTest("Test2", []() { EXPECT_TRUE(true); }); + + // ✅ ALWAYS execute registered tests + bool passed = ProcessIsolatedTestRunner::executeAllTests(); + EXPECT_TRUE(passed); + + // ✅ Optionally verify execution count + auto results = ProcessIsolatedTestRunner::getTestResults(); + EXPECT_EQ(results.size(), 2) << "Expected 2 tests to execute"; +} +``` + +**Prevention:** Always verify that `getTestResults().size()` matches your expected number of tests: +```cpp +// After execution +auto results = ProcessIsolatedTestRunner::getTestResults(); +EXPECT_EQ(results.size(), expectedTestCount) + << "Test count mismatch - some tests may not have executed"; +``` + +--- + +## Implementation Details + +### How It Works + +1. **Registration Phase:** + - Tests are registered into a static vector + - Each test gets a `TestConfig` with name, logic, and environment + +2. 
**Execution Phase:** + - Parent process iterates through registered tests + - For each test: + - `fork()` creates a child process + - Child applies environment variables + - Child executes test logic + - Parent waits for child to complete + - Result is collected and stored + +3. **Result Collection:** + - Exit codes are captured from child processes + - Timing information is recorded + - All results stored in static vector + +4. **Automatic Cleanup:** + - After execution completes, `executeAllTests()` automatically clears all test registrations and results + - This ensures a clean state for the next test suite without manual intervention + +### Exit Codes + +```cpp +enum RcclTestCode { + RCCL_TEST_SUCCESS = 0, // Test passed + RCCL_TEST_FAILURE = 1, // Test failed (assertion) + RCCL_TEST_UNKNOWN_EXCEPTION = 2, // Uncaught exception + RCCL_TEST_TIMEOUT = 3, // Test timed out + RCCL_TEST_SKIPPED = 4 // Test was skipped +}; +``` + +### Thread Safety + +The framework uses mutexes for thread-safe operations: +- Test registration (write) +- Result recording (write) +- Result retrieval (read) + +--- + +## Limitations + +1. **Process Overhead:** Each test creates a new process (fork overhead) +2. **Sequential Execution:** Tests run one at a time (not parallel) +3. **Linux/Unix Only:** Uses `fork()` - not available on Windows +4. **Memory Duplication:** Each forked process duplicates memory +5. **No Shared State:** Tests cannot share data between processes + +--- + +## FAQ + +**Q: When should I use ProcessIsolatedTestRunner vs regular Google Test?** + +A: Use ProcessIsolatedTestRunner when: +- Testing code with static variables +- Testing environment variable behavior +- Testing one-time initialization +- Need guaranteed clean state between tests + +Use regular Google Test when: +- Tests are truly independent +- No static state concerns +- Need parallel execution +- Testing simple units + +**Q: Can I use this with MPI tests?** + +A: Not directly. 
ProcessIsolatedTestRunner is for single-process tests. For MPI tests, use `MPI Test Runner` instead. ProcessIsolatedTestRunner is currently hooked into the `rccl-UnitTestsFixtures` binary, and the MPI test runner is hooked into the `rccl-UnitTestsMPI` binary. These are two independent implementations. + +**Q: How do I debug a test that's running in an isolated process?** + +A: +1. Enable verbose logging +2. Add print statements in your test lambda +3. Temporarily run the test logic outside the framework +4. Use GDB (`set follow-fork-mode child` makes it follow the forked test process) + +**Q: Can I run tests in parallel?** + +A: No, the current implementation only supports sequential execution. + +**Q: Does this work with CTest/CMake?** + +A: Yes! The tests are still Google Test cases, so they work with standard test runners. + +**Q: Should I use the macros or the manual API?** + +A: Use the macros (`RUN_ISOLATED_TEST`, `RUN_ISOLATED_TESTS`, etc.) for most cases - they're simpler and less error-prone. Use the manual API (`registerTest()` + `executeAllTests()`) only when you need more control over the registration/execution flow, such as: +- Dynamically generating test configurations at runtime +- Sharing test registration logic across multiple TEST blocks +- Advanced control flow scenarios + +**Q: Do tests run automatically after registration, or do I need to call executeAllTests()?** + +A: **You MUST call `executeAllTests()` explicitly.** Tests do NOT run automatically. If you forget to call it, your tests will be silently ignored. Always follow this pattern: + +```cpp +TEST(MyTest, IsolatedTests) { + ProcessIsolatedTestRunner::registerTest("MyTest", []() { /* ... 
*/ }); + + // ✅ REQUIRED: Execute the tests + bool passed = ProcessIsolatedTestRunner::executeAllTests(); + EXPECT_TRUE(passed); +} +``` + +**Q: How can I detect if I forgot to execute registered tests?** + +A: After `executeAllTests()`, verify that `getTestResults().size()` matches your expected test count: + +```cpp +// Register N tests +ProcessIsolatedTestRunner::registerTest("Test1", []() { /* ... */ }); +ProcessIsolatedTestRunner::registerTest("Test2", []() { /* ... */ }); + +// Execute +bool passed = ProcessIsolatedTestRunner::executeAllTests(); + +// Verify count +auto results = ProcessIsolatedTestRunner::getTestResults(); +EXPECT_EQ(results.size(), 2) << "Expected 2 tests to run"; +``` + +**Q: Do I need to call clear() manually?** + +A: No. The `clear()` method is only useful for advanced use cases where you need to clear tests that were registered but never executed. If you manually call `clear()` when tests were registered but not executed, it will warn you: + +``` +⚠️ WARNING: ProcessIsolatedTestRunner::clear() called with 2 unexecuted test(s)! + Registered: 2 test(s) + Executed: 0 test(s) + Did you forget to call executeAllTests()? +``` + +--- + +## See Also + +- **ProcessIsolatedTestRunner.hpp** - Full API documentation +- **ProcessIsolatedTestRunner.cpp** - Implementation details +- **RcclWrapTests.cpp** - Usage examples