/************************************************************************* * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include #include #include #include "comm.h" #include "common/ProcessIsolatedTestRunner.hpp" #include "enqueue.h" #include "info.h" #include "utils.h" namespace RcclUnitTesting { // Simple test kernel for validating ncclInitKernelsForDevice __global__ void simpleTestKernel(int* data) { int tid = threadIdx.x + blockIdx.x * blockDim.x; if(data) data[tid] = tid; } // Helper function to test ncclInitKernelsForDevice with a real kernel ncclResult_t testKernelAttributes(void* kernelFn, size_t* maxStackSize) { if(!kernelFn || !maxStackSize) return ncclInvalidArgument; *maxStackSize = 0; hipFuncAttributes attr = {0}; hipError_t errcode = hipFuncGetAttributes(&attr, kernelFn); if(errcode != hipSuccess) return ncclSystemError; *maxStackSize = attr.localSizeBytes; return ncclSuccess; // ncclSuccess } // Helper function to test shared memory limit checking with a real kernel // ncclMaxSharedMem: For gfx906 (cudaArch 906) with WarpSize 64, this is typically 32832 bytes ncclResult_t testKernelSharedMemoryLimit( void* kernelFn, int cudaArch, int maxSharedMem, size_t* maxStackSize, int ncclMaxSharedMem ) { if(!kernelFn) return ncclInvalidArgument; ncclResult_t result = ncclSuccess; if(maxStackSize) *maxStackSize = 0; hipFuncAttributes attr = {0}; hipError_t errcode = hipFuncGetAttributes(&attr, kernelFn); if(errcode != hipSuccess) { return ncclSystemError; } if(maxStackSize) { *maxStackSize = attr.localSizeBytes; } // Test the shared memory limit check (mimics enqueue.cc lines 135-146) if(ncclMaxSharedMem != 0) { int sharedMemSize = ncclMaxSharedMem; if(sharedMemSize > (maxSharedMem - attr.sharedSizeBytes)) { WARN( "cudaArch %d ncclMaxSharedMem %d exceeds device/fn maxSharedMem %zu", cudaArch, sharedMemSize, maxSharedMem - attr.sharedSizeBytes ); return ncclSystemError; } } return result; } // Helper structure to hold test environment struct EnqueueTestEnvironment { ncclComm* comm; ncclInfo* info; void* sendbuff; void* recvbuff; uint32_t abortFlag0; uint32_t abortFlag1; int abortFlagRefCount; EnqueueTestEnvironment() : comm(nullptr) , info(nullptr) , sendbuff(nullptr) , recvbuff(nullptr) , abortFlag0(0) , abortFlag1(0) , abortFlagRefCount(0) {} ~EnqueueTestEnvironment() { cleanup(); } void setup() { // Allocate GPU memory for buffers size_t bufferSize = 1024 * sizeof(float); hipError_t hipErr = hipMalloc(&sendbuff, bufferSize); ASSERT_EQ(hipErr, hipSuccess) << "Failed to allocate sendbuff"; hipErr = hipMalloc(&recvbuff, bufferSize); ASSERT_EQ(hipErr, hipSuccess) << "Failed to allocate recvbuff"; // Initialize communicator comm = new ncclComm(); memset(comm, 0, sizeof(ncclComm)); comm->startMagic = NCCL_MAGIC; // 0x0280028002800280 // Initialize critical fields comm->rank = 0; comm->nRanks = 2; comm->cudaDev = 0; comm->localRank = 0; // Initialize abort flags comm->abortFlag = &abortFlag0; comm->childAbortFlag = &abortFlag1; comm->abortFlagRefCount = &abortFlagRefCount; // Initialize memory stack ncclMemoryStackConstruct(&comm->memScoped); ncclMemoryStackConstruct(&comm->memPermanent); // Initialize intra-communication pointers comm->intraComm0 = nullptr; comm->intraNext = nullptr; // Initialize work FIFO structures comm->workFifoBytes = 1024; // Power of 2 comm->workFifoBuf = nullptr; comm->workFifoBufDev = nullptr; comm->workFifoConsumed = 0; comm->workFifoProducedLastRecorded = 0; comm->workFifoProduced = 0; // Initialize planner memset(&comm->planner, 0, sizeof(comm->planner)); // Initialize config memset(&comm->config, 0, sizeof(comm->config)); comm->config.blocking = 1; comm->checkPointers = 0; // Disable pointer validation for easier testing // Initialize peer info arrays comm->peerInfo = new ncclPeerInfo[comm->nRanks]; memset(comm->peerInfo, 0, comm->nRanks * sizeof(ncclPeerInfo)); comm->localRankToRank = new int[comm->nRanks]; for(int i = 0; i < comm->nRanks; i++) { comm->localRankToRank[i] = i; } comm->endMagic = NCCL_MAGIC; // 0x0280028002800280 // Initialize operation info with valid GPU buffers info = new ncclInfo(); memset(info, 0, sizeof(ncclInfo)); info->comm = comm; info->opName = "AllReduce"; info->count = 1024; info->datatype = ncclFloat; info->op = ncclSum; info->root = 0; info->sendbuff = sendbuff; // Use allocated GPU memory info->recvbuff = recvbuff; // Use allocated GPU memory info->stream = nullptr; } void cleanup() { // Clean up info first (it references comm) if(info) { delete info; info = nullptr; } // Clean up comm and its allocated resources if(comm) { // Clean up memory stacks ncclMemoryStackDestruct(&comm->memScoped); ncclMemoryStackDestruct(&comm->memPermanent); // Clean up peer info arrays if(comm->peerInfo) { delete[] comm->peerInfo; comm->peerInfo = nullptr; } if(comm->localRankToRank) { delete[] comm->localRankToRank; comm->localRankToRank = nullptr; } delete comm; comm = nullptr; } // Clean up GPU buffers last if(sendbuff) { hipError_t err = hipFree(sendbuff); if(err != hipSuccess) { // Log error but don't throw in cleanup fprintf(stderr, "Warning: hipFree(sendbuff) failed with error %d\n", err); } sendbuff = nullptr; } if(recvbuff) { hipError_t err = hipFree(recvbuff); if(err != hipSuccess) { // Log error but don't throw in cleanup fprintf(stderr, "Warning: hipFree(recvbuff) failed with error %d\n", err); } recvbuff = nullptr; } } }; // Empty test fixture for test organization class EnqueueTests : public ::testing::Test { // No setup/teardown - all tests use process isolation }; // Test ncclInitKernelsForDevice function TEST_F(EnqueueTests, ncclInitKernelsForDevice_ValidInput) { ProcessIsolatedTestRunner::ExecutionOptions options; options.stopOnFirstFailure = false; // Continue running all tests options.verboseLogging = true; RUN_ISOLATED_TESTS_WITH_OPTIONS( options, ProcessIsolatedTestRunner::TestConfig( "ncclInitKernelsForDevice_ValidInput", [this]() { size_t maxStackSize = 0; ncclResult_t result = ncclInitKernelsForDevice(906, 65536, &maxStackSize); EXPECT_TRUE(result == ncclSuccess); // maxStackSize should be set to a reasonable value (> 0) EXPECT_GT(maxStackSize, 0) << "Expected maxStackSize to be computed and set to a positive value"; } ).withEnvironment({{"NCCL_DEBUG", "INFO"}, {"NCCL_DEBUG_SUBSYS", "ALL"}}), ProcessIsolatedTestRunner::TestConfig( "ncclInitKernelsForDevice_ValidInputCarveout", [this]() { size_t maxStackSize = 0; ncclResult_t result = ncclInitKernelsForDevice(906, 65536, &maxStackSize); EXPECT_TRUE(result == ncclSuccess); // maxStackSize should be set to a reasonable value (> 0) EXPECT_GT(maxStackSize, 0) << "Expected maxStackSize to be computed and set to a positive value"; } ) .withEnvironment( {{"NCCL_L1_SHARED_MEMORY_CARVEOUT", "1"}, {"NCCL_DEBUG", "INFO"}, {"NCCL_DEBUG_SUBSYS", "ALL"}} ) ); } TEST_F(EnqueueTests, ncclInitKernelsForDevice_NullStackSize) { ProcessIsolatedTestRunner::ExecutionOptions options; options.stopOnFirstFailure = false; options.verboseLogging = true; RUN_ISOLATED_TESTS_WITH_OPTIONS( options, ProcessIsolatedTestRunner::TestConfig( "ncclInitKernelsForDevice_NullStackSize", []() { ncclResult_t result = ncclInitKernelsForDevice(906, 65536, nullptr); EXPECT_EQ(result, ncclSuccess); } ) ); } // Test with a real compiled kernel to verify attribute retrieval works correctly TEST_F(EnqueueTests, KernelAttributes_WithRealKernel) { ProcessIsolatedTestRunner::ExecutionOptions options; options.stopOnFirstFailure = false; options.verboseLogging = true; RUN_ISOLATED_TESTS_WITH_OPTIONS( options, ProcessIsolatedTestRunner::TestConfig( "KernelAttributes_WithRealKernel", []() { size_t maxStackSize = 0; ncclResult_t result = testKernelAttributes((void*)simpleTestKernel, &maxStackSize); EXPECT_EQ(result, ncclSuccess) << "Expected successful kernel attribute retrieval with a real compiled kernel"; } ).withEnvironment({{"NCCL_DEBUG", "INFO"}}) ); } TEST_F(EnqueueTests, ncclInitKernelsForDevice_InvalidArch) { ProcessIsolatedTestRunner::ExecutionOptions options; options.stopOnFirstFailure = false; options.verboseLogging = true; RUN_ISOLATED_TESTS_WITH_OPTIONS( options, ProcessIsolatedTestRunner::TestConfig( "ncclInitKernelsForDevice_InvalidArch", []() { size_t maxStackSize = 0; ncclResult_t result = ncclInitKernelsForDevice(-1, 65536, &maxStackSize); EXPECT_EQ(result, ncclSuccess); } ) ); } TEST_F(EnqueueTests, ncclInitKernelsForDevice_ExceedsSharedMemory) { ProcessIsolatedTestRunner::ExecutionOptions options; options.stopOnFirstFailure = false; options.verboseLogging = true; RUN_ISOLATED_TESTS_WITH_OPTIONS( options, ProcessIsolatedTestRunner::TestConfig( "ncclInitKernelsForDevice_ExceedsSharedMemory", []() { size_t maxStackSize = 0; // For gfx906, ncclMaxSharedMem is 32832 (as shown in test output) // Use a very small maxSharedMem (16000 bytes) to trigger the exceeds check ncclResult_t result = testKernelSharedMemoryLimit( (void*)simpleTestKernel, // Use our real compiled kernel 906, // cudaArch 16000, // maxSharedMem (intentionally too small) &maxStackSize, 32832 // ncclMaxSharedMem for gfx906 ); EXPECT_EQ(result, ncclSystemError) << "Expected ncclSystemError when ncclMaxSharedMem exceeds maxSharedMem"; } ).withEnvironment({{"NCCL_DEBUG", "WARN"}}) ); } // Test ncclEnqueueCheck function TEST_F(EnqueueTests, ncclEnqueueCheck_ValidInput) { ProcessIsolatedTestRunner::ExecutionOptions options; options.stopOnFirstFailure = false; options.verboseLogging = true; RUN_ISOLATED_TESTS_WITH_OPTIONS( options, ProcessIsolatedTestRunner::TestConfig( "ncclEnqueueCheck_ValidInput", []() { EnqueueTestEnvironment env; env.setup(); ncclResult_t result = ncclEnqueueCheck(env.info); EXPECT_TRUE(result == ncclSuccess); env.cleanup(); } ) ); } TEST_F(EnqueueTests, ncclEnqueueCheck_InvalidComm) { ProcessIsolatedTestRunner::ExecutionOptions options; options.stopOnFirstFailure = false; options.verboseLogging = true; RUN_ISOLATED_TESTS_WITH_OPTIONS( options, ProcessIsolatedTestRunner::TestConfig( "ncclEnqueueCheck_InvalidComm", []() { EnqueueTestEnvironment env; env.setup(); env.info->comm = nullptr; ncclResult_t result = ncclEnqueueCheck(env.info); EXPECT_EQ(result, ncclInvalidArgument); env.cleanup(); } ) ); } TEST_F(EnqueueTests, ncclEnqueueCheck_InvalidBuffers) { ProcessIsolatedTestRunner::ExecutionOptions options; options.stopOnFirstFailure = false; options.verboseLogging = true; RUN_ISOLATED_TESTS_WITH_OPTIONS( options, ProcessIsolatedTestRunner::TestConfig( "ncclEnqueueCheck_InvalidBuffers", []() { EnqueueTestEnvironment env; env.setup(); // Test with null sendbuff env.comm->checkPointers = 1; env.info->sendbuff = nullptr; ncclResult_t result = ncclEnqueueCheck(env.info); EXPECT_EQ(result, ncclInvalidArgument); // Reset sendbuff and test with null recvbuff env.info->sendbuff = env.sendbuff; env.info->recvbuff = nullptr; result = ncclEnqueueCheck(env.info); EXPECT_EQ(result, ncclInvalidArgument); env.cleanup(); } ) ); } // Test ncclFuncSendCount function TEST_F(EnqueueTests, ncclFuncSendCount_AllTests) { ProcessIsolatedTestRunner::ExecutionOptions options; options.stopOnFirstFailure = false; options.verboseLogging = true; RUN_ISOLATED_TESTS_WITH_OPTIONS( options, ProcessIsolatedTestRunner::TestConfig( "ncclFuncSendCount_AllReduce", []() { size_t count = 1000; int nRanks = 4; size_t result = ncclFuncSendCount(ncclFuncAllReduce, nRanks, count); EXPECT_EQ(result, count); } ), ProcessIsolatedTestRunner::TestConfig( "ncclFuncSendCount_Broadcast", []() { size_t count = 1000; int nRanks = 4; size_t result = ncclFuncSendCount(ncclFuncBroadcast, nRanks, count); EXPECT_EQ(result, count); } ), ProcessIsolatedTestRunner::TestConfig( "ncclFuncSendCount_Reduce", []() { size_t count = 1000; int nRanks = 4; size_t result = ncclFuncSendCount(ncclFuncReduce, nRanks, count); EXPECT_EQ(result, count); } ), ProcessIsolatedTestRunner::TestConfig( "ncclFuncSendCount_AllGather", []() { size_t count = 1000; int nRanks = 4; size_t result = ncclFuncSendCount(ncclFuncAllGather, nRanks, count); EXPECT_EQ(result, count); } ), ProcessIsolatedTestRunner::TestConfig( "ncclFuncSendCount_ReduceScatter", []() { size_t count = 1000; int nRanks = 4; size_t result = ncclFuncSendCount(ncclFuncReduceScatter, nRanks, count); EXPECT_EQ(result, count * nRanks); } ), ProcessIsolatedTestRunner::TestConfig( "ncclFuncSendCount_ZeroCount", []() { size_t result = ncclFuncSendCount(ncclFuncAllReduce, 4, 0); EXPECT_EQ(result, 0); } ) ); } // Test ncclFuncRecvCount function TEST_F(EnqueueTests, ncclFuncRecvCount_AllTests) { ProcessIsolatedTestRunner::ExecutionOptions options; options.stopOnFirstFailure = false; options.verboseLogging = true; RUN_ISOLATED_TESTS_WITH_OPTIONS( options, ProcessIsolatedTestRunner::TestConfig( "ncclFuncRecvCount_AllReduce", []() { size_t count = 1000; int nRanks = 4; size_t result = ncclFuncRecvCount(ncclFuncAllReduce, nRanks, count); EXPECT_EQ(result, count); } ), ProcessIsolatedTestRunner::TestConfig( "ncclFuncRecvCount_Broadcast", []() { size_t count = 1000; int nRanks = 4; size_t result = ncclFuncRecvCount(ncclFuncBroadcast, nRanks, count); EXPECT_EQ(result, count); } ), ProcessIsolatedTestRunner::TestConfig( "ncclFuncRecvCount_Reduce", []() { size_t count = 1000; int nRanks = 4; size_t result = ncclFuncRecvCount(ncclFuncReduce, nRanks, count); EXPECT_EQ(result, count); } ), ProcessIsolatedTestRunner::TestConfig( "ncclFuncRecvCount_AllGather", []() { size_t count = 1000; int nRanks = 4; size_t result = ncclFuncRecvCount(ncclFuncAllGather, nRanks, count); EXPECT_EQ(result, count * nRanks); } ), ProcessIsolatedTestRunner::TestConfig( "ncclFuncRecvCount_ReduceScatter", []() { size_t count = 1000; int nRanks = 4; size_t result = ncclFuncRecvCount(ncclFuncReduceScatter, nRanks, count); EXPECT_EQ(result, count); } ), ProcessIsolatedTestRunner::TestConfig( "ncclFuncRecvCount_ZeroCount", []() { size_t result = ncclFuncRecvCount(ncclFuncAllReduce, 4, 0); EXPECT_EQ(result, 0); } ) ); } // Test ncclFuncMaxSendRecvCount function TEST_F(EnqueueTests, ncclFuncMaxSendRecvCount_AllTests) { ProcessIsolatedTestRunner::ExecutionOptions options; options.stopOnFirstFailure = false; options.verboseLogging = true; RUN_ISOLATED_TESTS_WITH_OPTIONS( options, ProcessIsolatedTestRunner::TestConfig( "ncclFuncMaxSendRecvCount_AllReduce", []() { size_t count = 1000; int nRanks = 4; size_t result = ncclFuncMaxSendRecvCount(ncclFuncAllReduce, nRanks, count); EXPECT_EQ(result, count); } ), ProcessIsolatedTestRunner::TestConfig( "ncclFuncMaxSendRecvCount_AllGather", []() { size_t count = 1000; int nRanks = 4; size_t result = ncclFuncMaxSendRecvCount(ncclFuncAllGather, nRanks, count); // For AllGather, receive count (count * nRanks) is larger than send count (count) EXPECT_EQ(result, count * nRanks); } ), ProcessIsolatedTestRunner::TestConfig( "ncclFuncMaxSendRecvCount_ReduceScatter", []() { size_t count = 1000; int nRanks = 4; size_t result = ncclFuncMaxSendRecvCount(ncclFuncReduceScatter, nRanks, count); // For ReduceScatter, send count (count) is larger than receive count (count/nRanks) EXPECT_EQ(result, count * nRanks); } ), ProcessIsolatedTestRunner::TestConfig( "ncclFuncMaxSendRecvCount_ZeroCount", []() { size_t result = ncclFuncMaxSendRecvCount(ncclFuncAllReduce, 4, 0); EXPECT_EQ(result, 0); } ) ); } // Edge case tests TEST_F(EnqueueTests, ncclFuncCounts_EdgeCases) { ProcessIsolatedTestRunner::ExecutionOptions options; options.stopOnFirstFailure = false; options.verboseLogging = true; RUN_ISOLATED_TESTS_WITH_OPTIONS( options, ProcessIsolatedTestRunner::TestConfig( "ncclFuncCounts_SingleRank", []() { size_t count = 1000; int nRanks = 1; // Test with single rank EXPECT_EQ(ncclFuncSendCount(ncclFuncAllReduce, nRanks, count), count); EXPECT_EQ(ncclFuncRecvCount(ncclFuncAllReduce, nRanks, count), count); EXPECT_EQ(ncclFuncMaxSendRecvCount(ncclFuncAllReduce, nRanks, count), count); } ), ProcessIsolatedTestRunner::TestConfig( "ncclFuncCounts_LargeRankCount", []() { size_t count = 1000; int nRanks = 1024; // Test with large number of ranks EXPECT_EQ(ncclFuncSendCount(ncclFuncAllGather, nRanks, count), count); EXPECT_EQ(ncclFuncRecvCount(ncclFuncAllGather, nRanks, count), count * nRanks); EXPECT_EQ( ncclFuncMaxSendRecvCount(ncclFuncAllGather, nRanks, count), count * nRanks ); } ) ); } } // namespace RcclUnitTesting