Added a Process Isolated Test Runner (#1993)

* Added single process isolation support to execute tests

* Address review comments

* Update README

* Removed requirement of explicit call to clear method

* Added macros for simplified usage

* Updated tests to use process isolation framework

* Adjust summary output format for isolated tests

* Updated rccl_wrap tests

* Used process isolation in AllocTests

* Used process isolation and fixed failing tests

* Modified test output, added signal handling

Updated macros to handle lambdas

* Convert argcheck tests to isolated tests

* Convert proxy tests to isolated tests

* Remove non-supported test

* Fixed file descriptor handling and clearing env vars for tests

[ROCm/rccl commit: 7e10267dfd]
Этот коммит содержится в:
Atul Kulkarni
2025-12-08 10:36:05 -06:00
коммит произвёл GitHub
родитель 142860442a
Коммит 11ffeda52f
11 изменённых файлов: 5275 добавлений и 3210 удалений
+149 -112
Просмотреть файл
@@ -4,150 +4,187 @@
* See LICENSE.txt for license information
************************************************************************/
#include <alloc.h>
#include <gtest/gtest.h>
#include <rccl/rccl.h>
#include <alloc.h>
#include "TestBed.hpp"
#include "common/ErrCode.hpp"
#include "common/ProcessIsolatedTestRunner.hpp"
template ncclResult_t ncclCudaMemcpy<float>(float*, float*, size_t);
namespace RcclUnitTesting
{
TEST(Alloc, ncclIbMallocDebugNonZero) {
void* ptr = nullptr;
size_t size = 4096;
TEST(Alloc, ncclIbMallocDebugNonZero)
{
void* ptr = nullptr;
size_t size = 4096;
ncclResult_t result = ncclIbMalloc(&ptr, size);
ncclResult_t result = ncclIbMalloc(&ptr, size);
EXPECT_EQ(result, ncclSuccess);
ASSERT_NE(ptr, nullptr);
EXPECT_EQ(result, ncclSuccess);
ASSERT_NE(ptr, nullptr);
char* char_ptr = static_cast<char*>(ptr);
for (size_t i = 0; i < size; ++i) {
ASSERT_EQ(char_ptr[i], 0);
}
free(ptr);
}
TEST(Alloc, ncclIbMallocDebugZeroSize) {
void* ptr = (void*)0xdeadbeef;
ncclResult_t result = ncclIbMalloc(&ptr, 0);
EXPECT_EQ(result, ncclSuccess);
EXPECT_EQ(ptr, nullptr);
}
TEST(Alloc, ncclCuMemHostAlloc) {
void* ptr = NULL;
void* handle = NULL;
size_t size = 1024;
ncclResult_t result = ncclCuMemHostAlloc(&ptr, handle, size);
ASSERT_EQ(result, ncclInternalError);
}
TEST(Alloc, ncclCuMemHostFree)
char* char_ptr = static_cast<char*>(ptr);
for(size_t i = 0; i < size; ++i)
{
void* dummyPtr = reinterpret_cast<void*>(0x1234); // any dummy address
ncclResult_t result = ncclCuMemHostFree(dummyPtr);
ASSERT_EQ(result, ncclInternalError);
ASSERT_EQ(char_ptr[i], 0);
}
free(ptr);
}
TEST(Alloc, ncclIbMallocDebugZeroSize)
{
void* ptr = (void*)0xdeadbeef;
ncclResult_t result = ncclIbMalloc(&ptr, 0);
EXPECT_EQ(result, ncclSuccess);
EXPECT_EQ(ptr, nullptr);
}
TEST(Alloc, ncclCuMemHostAlloc)
{
void* ptr = NULL;
void* handle = NULL;
size_t size = 1024;
ncclResult_t result = ncclCuMemHostAlloc(&ptr, handle, size);
ASSERT_EQ(result, ncclInternalError);
}
TEST(Alloc, ncclCuMemHostFree)
{
void* dummyPtr = reinterpret_cast<void*>(0x1234); // any dummy address
ncclResult_t result = ncclCuMemHostFree(dummyPtr);
ASSERT_EQ(result, ncclInternalError);
}
#if ROCM_VERSION < 70000
// This test is only valid for ROCm versions < 7.0.0
// In ROCm 7.0.0+, the ncclCuMemAlloc signature changed
TEST(Alloc, ncclCuMemAlloc)
{
void* ptr = reinterpret_cast<void*>(0x1234); // dummy non-null input
void* handle = reinterpret_cast<void*>(0x5678); // dummy non-null input
size_t size = 1024;
hipMemAllocationHandleType type = hipMemHandleTypeNone;
ncclResult_t result = ncclCuMemAlloc(&ptr, &handle, type, size);
EXPECT_EQ(result, ncclInternalError);
}
// This test is only valid for ROCm versions < 7.0.0
// In ROCm 7.0.0+, the ncclCuMemAlloc signature changed
TEST(Alloc, ncclCuMemAlloc)
{
void* ptr = reinterpret_cast<void*>(0x1234); // dummy non-null input
void* handle = reinterpret_cast<void*>(0x5678); // dummy non-null input
size_t size = 1024;
hipMemAllocationHandleType type = hipMemHandleTypeNone;
ncclResult_t result = ncclCuMemAlloc(&ptr, &handle, type, size);
EXPECT_EQ(result, ncclInternalError);
}
TEST(Alloc, ncclCuMemFree)
{
void* dummyPtr = reinterpret_cast<void*>(0xdeadbeef); // arbitrary non-null
ncclResult_t result = ncclCuMemFree(dummyPtr);
EXPECT_EQ(result, ncclInternalError);
}
TEST(Alloc, ncclCuMemFree)
{
void* dummyPtr = reinterpret_cast<void*>(0xdeadbeef); // arbitrary non-null
ncclResult_t result = ncclCuMemFree(dummyPtr);
EXPECT_EQ(result, ncclInternalError);
}
TEST(Alloc, ncclCuMemAllocAddr)
{
void* ptr = reinterpret_cast<void*>(0x1111); // Dummy non-null input
hipMemGenericAllocationHandle_t handle = reinterpret_cast<hipMemGenericAllocationHandle_t>(0x1234);
size_t size = 4096;
ncclResult_t result = ncclCuMemAllocAddr(&ptr, &handle, size);
ASSERT_EQ(result, ncclInternalError);
}
TEST(Alloc, ncclCuMemAllocAddr)
{
void* ptr = reinterpret_cast<void*>(0x1111); // Dummy non-null input
hipMemGenericAllocationHandle_t handle
= reinterpret_cast<hipMemGenericAllocationHandle_t>(0x1234);
size_t size = 4096;
ncclResult_t result = ncclCuMemAllocAddr(&ptr, &handle, size);
ASSERT_EQ(result, ncclInternalError);
}
TEST(Alloc, ncclCuMemFreeAddr)
{
void* testPtr = reinterpret_cast<void*>(0xbeefcafe); // Arbitrary non-null pointer
ncclResult_t result = ncclCuMemFreeAddr(testPtr);
ASSERT_EQ(result, ncclInternalError);
}
TEST(Alloc, ncclCuMemFreeAddr)
{
void* testPtr = reinterpret_cast<void*>(0xbeefcafe); // Arbitrary non-null pointer
ncclResult_t result = ncclCuMemFreeAddr(testPtr);
ASSERT_EQ(result, ncclInternalError);
}
#endif // ROCM_VERSION < 70000
TEST(Alloc, NcclCudaMemcpy) {
constexpr size_t N = 128;
float *d_src = nullptr, *d_dst = nullptr;
float h_src[N], h_dst[N];
TEST(Alloc, NcclCudaMemcpy)
{
RUN_ISOLATED_TEST(
"NcclCudaMemcpy",
[]()
{
constexpr size_t N = 128;
float * d_src = nullptr, *d_dst = nullptr;
float h_src[N], h_dst[N];
for (size_t i = 0; i < N; ++i) h_src[i] = static_cast<float>(i + 1);
// Allocate device memory
for(size_t i = 0; i < N; ++i)
h_src[i] = static_cast<float>(i + 1);
// Allocate device memory
ASSERT_EQ(hipMalloc(&d_src, N * sizeof(float)), hipSuccess);
ASSERT_EQ(hipMalloc(&d_dst, N * sizeof(float)), hipSuccess);
ASSERT_EQ(hipMalloc(&d_src, N * sizeof(float)), hipSuccess);
ASSERT_EQ(hipMalloc(&d_dst, N * sizeof(float)), hipSuccess);
// Copy from host to device (source buffer)
ASSERT_EQ(hipMemcpy(d_src, h_src, N * sizeof(float), hipMemcpyHostToDevice), hipSuccess);
// Copy from host to device (source buffer)
ASSERT_EQ(
hipMemcpy(d_src, h_src, N * sizeof(float), hipMemcpyHostToDevice),
hipSuccess
);
// Perform the tested function
ncclResult_t result = ncclCudaMemcpy<float>(d_dst, d_src, N);
// Perform the tested function
ncclResult_t result = ncclCudaMemcpy<float>(d_dst, d_src, N);
ASSERT_EQ(result, ncclSuccess); // Fixed typo: was ncclSsuccess
ASSERT_EQ(result, ncclSuccess);
// Copy result back to host
ASSERT_EQ(hipMemcpy(h_dst, d_dst, N * sizeof(float), hipMemcpyDeviceToHost), hipSuccess);
// Copy result back to host
ASSERT_EQ(
hipMemcpy(h_dst, d_dst, N * sizeof(float), hipMemcpyDeviceToHost),
hipSuccess
);
// Check correctness
for (size_t i = 0; i < N; ++i) {
EXPECT_EQ(h_src[i], h_dst[i]) << "Mismatch at index " << i;
// Check correctness
for(size_t i = 0; i < N; ++i)
{
EXPECT_EQ(h_src[i], h_dst[i]) << "Mismatch at index " << i;
}
// Free memory
hipFree(d_src);
hipFree(d_dst);
}
// Free memory
hipFree(d_src);
hipFree(d_dst);
);
}
}
TEST(Alloc, ZeroElementMemcpy)
{
RUN_ISOLATED_TEST(
"ZeroElementMemcpy",
[]()
{
float *d_src = nullptr, *d_dst = nullptr;
ASSERT_EQ(hipMalloc(&d_src, sizeof(float)), hipSuccess);
ASSERT_EQ(hipMalloc(&d_dst, sizeof(float)), hipSuccess);
TEST(Alloc, ZeroElementMemcpy) {
float *d_src = nullptr, *d_dst = nullptr;
ASSERT_EQ(hipMalloc(&d_src, sizeof(float)), hipSuccess);
ASSERT_EQ(hipMalloc(&d_dst, sizeof(float)), hipSuccess);
ncclResult_t result = ncclCudaMemcpy<float>(d_dst, d_src, 0);
EXPECT_EQ(result, ncclSuccess) << "Zero-element copy should succeed (no-op)";
ncclResult_t result = ncclCudaMemcpy<float>(d_dst, d_src, 0);
EXPECT_EQ(result, ncclSuccess) << "Zero-element copy should succeed (no-op)";
hipFree(d_src);
hipFree(d_dst);
}
);
}
hipFree(d_src);
hipFree(d_dst);
}
TEST(Alloc, MemcpyNullSrcOrDstPointer)
{
RUN_ISOLATED_TEST(
"MemcpyNullSrcOrDstPointer",
[]()
{
constexpr size_t N = 16;
float* d_valid = nullptr;
ASSERT_EQ(hipMalloc(&d_valid, N * sizeof(float)), hipSuccess);
TEST(Alloc, MemcpyNullSrcOrDstPointer) {
constexpr size_t N = 16;
float* d_valid = nullptr;
ASSERT_EQ(hipMalloc(&d_valid, N * sizeof(float)), hipSuccess);
// Case 1: src is nullptr
ncclResult_t result = ncclCudaMemcpy<float>(d_valid, nullptr, N);
EXPECT_EQ(result, ncclUnhandledCudaError)
<< "Expected ncclUnhandledCudaError when src is nullptr";
// Case 1: src is nullptr
ncclResult_t result = ncclCudaMemcpy<float>(d_valid, nullptr, N);
EXPECT_EQ(result, ncclUnhandledCudaError) << "Expected ncclUnhandledCudaError when src is nullptr";
// Case 2: dst is nullptr
result = ncclCudaMemcpy<float>(nullptr, d_valid, N);
EXPECT_EQ(result, ncclUnhandledCudaError)
<< "Expected ncclUnhandledCudaError when dst is nullptr";
// Case 2: dst is nullptr
result = ncclCudaMemcpy<float>(nullptr, d_valid, N);
EXPECT_EQ(result, ncclUnhandledCudaError) << "Expected ncclUnhandledCudaError when dst is nullptr";
hipFree(d_valid);
}
} //namespace rccl
hipFree(d_valid);
}
);
}
} // namespace RcclUnitTesting
+537 -235
Просмотреть файл
@@ -4,324 +4,626 @@
* See LICENSE.txt for license information
************************************************************************/
#include <gtest/gtest.h>
#include <hip/hip_runtime.h>
#include "argcheck.h"
#include "comm.h"
#include <hip/hip_runtime.h>
#include "common/ErrCode.hpp"
#include "common/ProcessIsolatedTestRunner.hpp"
class ArgCheckTest : public ::testing::Test {
protected:
ncclComm_t comm;
struct ncclInfo *info;
int *sendDevicePtr = nullptr;
int *recvDevicePtr = nullptr;
// Helper struct for ArgCheck tests (NOT a fixture - used inside isolated tests)
struct ArgCheckTestEnvironment
{
ncclComm_t comm;
struct ncclInfo* info;
int* sendDevicePtr = nullptr;
int* recvDevicePtr = nullptr;
// Helper function to set up valid ncclInfo for boundary testing
void SetupValidInfo() {
// Set up valid info structure
info->comm = comm;
info->root = 0; // Valid root
info->datatype = (ncclDataType_t)0; // Valid datatype
info->op = (ncclRedOp_t)0; // Valid reduction operation
info->coll = ncclFuncBroadcast; // Valid collective operation
info->sendbuff = nullptr; // Will be set per test if needed
info->recvbuff = nullptr; // Will be set per test if needed
info->count = 10; // Valid count
info->opName = "TestOp"; // Valid operation name
}
// Helper function for tests requiring device memory
void SetupValidBufferWithDeviceMemory() {
// Set the active device to match comm->cudaDev
hipError_t errSetDevice = hipSetDevice(comm->cudaDev);
ASSERT_EQ(errSetDevice, hipSuccess);
// Allocate device memory
hipError_t errSend = hipMalloc(&sendDevicePtr, sizeof(int));
ASSERT_EQ(errSend, hipSuccess);
hipError_t errRecv = hipMalloc(&recvDevicePtr, sizeof(int));
ASSERT_EQ(errRecv, hipSuccess);
// Set device pointers
info->sendbuff = sendDevicePtr;
info->recvbuff = recvDevicePtr;
}
// Helper to clean up device memory
void CleanupDeviceMemory() {
if (sendDevicePtr) {
hipFree(sendDevicePtr);
sendDevicePtr = nullptr;
// Helper function to set up valid ncclInfo for boundary testing
void SetupValidInfo()
{
// Set up valid info structure
info->comm = comm;
info->root = 0; // Valid root
info->datatype = (ncclDataType_t)0; // Valid datatype
info->op = (ncclRedOp_t)0; // Valid reduction operation
info->coll = ncclFuncBroadcast; // Valid collective operation
info->sendbuff = nullptr; // Will be set per test if needed
info->recvbuff = nullptr; // Will be set per test if needed
info->count = 10; // Valid count
info->opName = "TestOp"; // Valid operation name
}
if (recvDevicePtr) {
hipFree(recvDevicePtr);
recvDevicePtr = nullptr;
// Helper function for tests requiring device memory
void SetupValidBufferWithDeviceMemory()
{
// Set the active device to match comm->cudaDev
hipError_t errSetDevice = hipSetDevice(comm->cudaDev);
ASSERT_EQ(errSetDevice, hipSuccess);
// Allocate device memory
hipError_t errSend = hipMalloc(&sendDevicePtr, sizeof(int));
ASSERT_EQ(errSend, hipSuccess);
hipError_t errRecv = hipMalloc(&recvDevicePtr, sizeof(int));
ASSERT_EQ(errRecv, hipSuccess);
// Set device pointers
info->sendbuff = sendDevicePtr;
info->recvbuff = recvDevicePtr;
}
}
void SetUp() override {
// Allocate and zero-initialize ncclComm as a pointer
comm = (struct ncclComm *)calloc(1, sizeof(struct ncclComm));
ASSERT_NE(comm, nullptr) << "Failed to allocate ncclComm";
// Initialize the communicator with required fields
comm->cudaDev = 0;
comm->nRanks = 4;
comm->checkPointers = true;
comm->rank = 0;
comm->startMagic = NCCL_MAGIC;
comm->endMagic = NCCL_MAGIC;
// Verify the magic values were set correctly
ASSERT_EQ(comm->startMagic, NCCL_MAGIC) << "startMagic not set correctly";
ASSERT_EQ(comm->endMagic, NCCL_MAGIC) << "endMagic not set correctly";
// Allocate and zero-initialize ncclInfo as a pointer
info = (ncclInfo *)calloc(1, sizeof(ncclInfo));
ASSERT_NE(info, nullptr) << "Failed to allocate ncclInfo";
SetupValidInfo();
SetupValidBufferWithDeviceMemory();
}
void TearDown() override {
// Free the allocated memory
CleanupDeviceMemory();
if (info) {
free(info);
info = nullptr;
// Helper to clean up device memory
void CleanupDeviceMemory()
{
if(sendDevicePtr)
{
hipFree(sendDevicePtr);
sendDevicePtr = nullptr;
}
if(recvDevicePtr)
{
hipFree(recvDevicePtr);
recvDevicePtr = nullptr;
}
}
if (comm) {
free(comm);
comm = nullptr;
void setup()
{
// Allocate and zero-initialize ncclComm as a pointer
comm = (struct ncclComm*)calloc(1, sizeof(struct ncclComm));
ASSERT_NE(comm, nullptr) << "Failed to allocate ncclComm";
// Initialize the communicator with required fields
comm->cudaDev = 0;
comm->nRanks = 4;
comm->checkPointers = true;
comm->rank = 0;
comm->startMagic = NCCL_MAGIC;
comm->endMagic = NCCL_MAGIC;
// Verify the magic values were set correctly
ASSERT_EQ(comm->startMagic, NCCL_MAGIC) << "startMagic not set correctly";
ASSERT_EQ(comm->endMagic, NCCL_MAGIC) << "endMagic not set correctly";
// Allocate and zero-initialize ncclInfo as a pointer
info = (ncclInfo*)calloc(1, sizeof(ncclInfo));
ASSERT_NE(info, nullptr) << "Failed to allocate ncclInfo";
SetupValidInfo();
SetupValidBufferWithDeviceMemory();
}
void cleanup()
{
// Free the allocated memory
CleanupDeviceMemory();
if(info)
{
free(info);
info = nullptr;
}
if(comm)
{
free(comm);
comm = nullptr;
}
}
}
};
TEST_F(ArgCheckTest, CudaPtrCheck_ValidPointer) {
int *devicePtr = nullptr;
hipError_t err = hipMalloc(&devicePtr, sizeof(int));
ASSERT_EQ(err, hipSuccess);
TEST(ArgCheckTest, CudaPtrCheck_ValidPointer)
{
RUN_ISOLATED_TEST(
"CudaPtrCheck_ValidPointer",
[]()
{
ArgCheckTestEnvironment env;
env.setup();
ncclResult_t result = CudaPtrCheck(devicePtr, comm, "devicePtr", "TestOp");
EXPECT_EQ(result, ncclSuccess);
int* devicePtr = nullptr;
hipError_t err = hipMalloc(&devicePtr, sizeof(int));
ASSERT_EQ(err, hipSuccess);
hipFree(devicePtr);
ncclResult_t result = CudaPtrCheck(devicePtr, env.comm, "devicePtr", "TestOp");
EXPECT_EQ(result, ncclSuccess);
hipFree(devicePtr);
env.cleanup();
INFO("Test 'CudaPtrCheck_ValidPointer' PASSED\n");
}
);
}
TEST_F(ArgCheckTest, CudaPtrCheck_NullPointer) {
ncclResult_t result = CudaPtrCheck(nullptr, comm, "invalidPtr", "TestOp");
EXPECT_EQ(result, ncclInvalidArgument);
TEST(ArgCheckTest, CudaPtrCheck_NullPointer)
{
RUN_ISOLATED_TEST(
"CudaPtrCheck_NullPointer",
[]()
{
ArgCheckTestEnvironment env;
env.setup();
ncclResult_t result = CudaPtrCheck(nullptr, env.comm, "invalidPtr", "TestOp");
EXPECT_EQ(result, ncclInvalidArgument);
env.cleanup();
INFO("Test 'CudaPtrCheck_NullPointer' PASSED\n");
}
);
}
TEST_F(ArgCheckTest, CudaPtrCheck_DifferentDevicePointer) {
int *devicePtr = nullptr;
hipSetDevice(1);
hipError_t err = hipMalloc(&devicePtr, sizeof(int));
ASSERT_EQ(err, hipSuccess);
TEST(ArgCheckTest, CudaPtrCheck_DifferentDevicePointer)
{
RUN_ISOLATED_TEST(
"CudaPtrCheck_DifferentDevicePointer",
[]()
{
ArgCheckTestEnvironment env;
env.setup();
ncclResult_t result = CudaPtrCheck(devicePtr, comm, "devicePtr", "TestOp");
EXPECT_EQ(result, ncclInvalidArgument);
int* devicePtr = nullptr;
hipSetDevice(1);
hipError_t err = hipMalloc(&devicePtr, sizeof(int));
ASSERT_EQ(err, hipSuccess);
hipFree(devicePtr);
hipSetDevice(comm->cudaDev);
ncclResult_t result = CudaPtrCheck(devicePtr, env.comm, "devicePtr", "TestOp");
EXPECT_EQ(result, ncclInvalidArgument);
hipFree(devicePtr);
hipSetDevice(env.comm->cudaDev);
env.cleanup();
INFO("Test 'CudaPtrCheck_DifferentDevicePointer' PASSED\n");
}
);
}
TEST_F(ArgCheckTest, CudaPtrCheck_HostMemoryPointer) {
// Test with host memory instead of device memory
int *hostPtr = (int *)malloc(sizeof(int));
ASSERT_NE(hostPtr, nullptr) << "Failed to allocate host memory";
TEST(ArgCheckTest, CudaPtrCheck_HostMemoryPointer)
{
RUN_ISOLATED_TEST(
"CudaPtrCheck_HostMemoryPointer",
[]()
{
ArgCheckTestEnvironment env;
env.setup();
*hostPtr = 42; // Initialize the memory
// Test with host memory instead of device memory
int* hostPtr = (int*)malloc(sizeof(int));
ASSERT_NE(hostPtr, nullptr) << "Failed to allocate host memory";
// This should fail because host memory is not device memory
ncclResult_t result = CudaPtrCheck(hostPtr, comm, "hostPtr", "TestOp");
*hostPtr = 42; // Initialize the memory
// Host memory should be rejected by CudaPtrCheck
EXPECT_EQ(result, ncclInvalidArgument)
<< "Host memory should be rejected by CudaPtrCheck";
// This should fail because host memory is not device memory
ncclResult_t result = CudaPtrCheck(hostPtr, env.comm, "hostPtr", "TestOp");
free(hostPtr);
// Host memory should be rejected by CudaPtrCheck
EXPECT_EQ(result, ncclInvalidArgument)
<< "Host memory should be rejected by CudaPtrCheck";
free(hostPtr);
env.cleanup();
INFO("Test 'CudaPtrCheck_HostMemoryPointer' PASSED\n");
}
);
}
TEST_F(ArgCheckTest, PtrCheck_ValidPointer) {
int value = 42;
ncclResult_t result = PtrCheck(&value, "TestOp", "value");
ASSERT_EQ(result, ncclSuccess);
TEST(ArgCheckTest, PtrCheck_ValidPointer)
{
RUN_ISOLATED_TEST(
"PtrCheck_ValidPointer",
[]()
{
int value = 42;
ncclResult_t result = PtrCheck(&value, "TestOp", "value");
ASSERT_EQ(result, ncclSuccess);
INFO("Test 'PtrCheck_ValidPointer' PASSED\n");
}
);
}
TEST_F(ArgCheckTest, PtrCheck_NullPointer) {
ncclResult_t result = PtrCheck(nullptr, "TestOp", "value");
ASSERT_EQ(result, ncclInvalidArgument);
TEST(ArgCheckTest, PtrCheck_NullPointer)
{
RUN_ISOLATED_TEST(
"PtrCheck_NullPointer",
[]()
{
ncclResult_t result = PtrCheck(nullptr, "TestOp", "value");
ASSERT_EQ(result, ncclInvalidArgument);
INFO("Test 'PtrCheck_NullPointer' PASSED\n");
}
);
}
TEST_F(ArgCheckTest, CommCheck_ValidComm) {
comm->startMagic = NCCL_MAGIC;
comm->endMagic = NCCL_MAGIC;
TEST(ArgCheckTest, CommCheck_ValidComm)
{
RUN_ISOLATED_TEST(
"CommCheck_ValidComm",
[]()
{
ArgCheckTestEnvironment env;
env.setup();
// Verify magic values are still correct (should be set in SetUp())
ASSERT_EQ(comm->startMagic, NCCL_MAGIC) << "startMagic was corrupted";
ASSERT_EQ(comm->endMagic, NCCL_MAGIC) << "endMagic was corrupted";
env.comm->startMagic = NCCL_MAGIC;
env.comm->endMagic = NCCL_MAGIC;
// Call CommCheck and verify the result
ncclResult_t result = CommCheck(comm, "TestOp", "testComm");
EXPECT_EQ(result, ncclSuccess) << "Failed for valid communicator";
// Verify magic values are still correct (should be set in setup())
ASSERT_EQ(env.comm->startMagic, NCCL_MAGIC) << "startMagic was corrupted";
ASSERT_EQ(env.comm->endMagic, NCCL_MAGIC) << "endMagic was corrupted";
// Call CommCheck and verify the result
ncclResult_t result = CommCheck(env.comm, "TestOp", "testComm");
EXPECT_EQ(result, ncclSuccess) << "Failed for valid communicator";
env.cleanup();
INFO("Test 'CommCheck_ValidComm' PASSED\n");
}
);
}
TEST_F(ArgCheckTest, CommCheck_NullComm) {
ncclResult_t result = CommCheck(nullptr, "TestOp", "comm");
ASSERT_EQ(result, ncclInvalidArgument);
TEST(ArgCheckTest, CommCheck_NullComm)
{
RUN_ISOLATED_TEST(
"CommCheck_NullComm",
[]()
{
ncclResult_t result = CommCheck(nullptr, "TestOp", "comm");
ASSERT_EQ(result, ncclInvalidArgument);
INFO("Test 'CommCheck_NullComm' PASSED\n");
}
);
}
TEST_F(ArgCheckTest, CommCheck_CorruptedStartMagic) {
// Corrupt only startMagic, keep endMagic valid
comm->startMagic = 1; // Corrupt startMagic
comm->endMagic = NCCL_MAGIC; // Keep endMagic valid
TEST(ArgCheckTest, CommCheck_CorruptedStartMagic)
{
RUN_ISOLATED_TEST(
"CommCheck_CorruptedStartMagic",
[]()
{
ArgCheckTestEnvironment env;
env.setup();
// Call CommCheck and verify the result
ncclResult_t result = CommCheck(comm, "TestOp", "comm");
EXPECT_EQ(result, ncclInvalidArgument) << "Failed for corrupted startMagic";
// Corrupt only startMagic, keep endMagic valid
env.comm->startMagic = 1; // Corrupt startMagic
env.comm->endMagic = NCCL_MAGIC; // Keep endMagic valid
// Call CommCheck and verify the result
ncclResult_t result = CommCheck(env.comm, "TestOp", "comm");
EXPECT_EQ(result, ncclInvalidArgument) << "Failed for corrupted startMagic";
env.cleanup();
INFO("Test 'CommCheck_CorruptedStartMagic' PASSED\n");
}
);
}
TEST_F(ArgCheckTest, CommCheck_CorruptedEndMagic) {
// Keep startMagic valid, corrupt only endMagic
comm->startMagic = NCCL_MAGIC; // Keep startMagic valid
comm->endMagic = 1; // Corrupt endMagic
TEST(ArgCheckTest, CommCheck_CorruptedEndMagic)
{
RUN_ISOLATED_TEST(
"CommCheck_CorruptedEndMagic",
[]()
{
ArgCheckTestEnvironment env;
env.setup();
// Call CommCheck and verify the result
ncclResult_t result = CommCheck(comm, "TestOp", "comm");
EXPECT_EQ(result, ncclInvalidArgument) << "Failed for corrupted endMagic";
// Keep startMagic valid, corrupt only endMagic
env.comm->startMagic = NCCL_MAGIC; // Keep startMagic valid
env.comm->endMagic = 1; // Corrupt endMagic
// Call CommCheck and verify the result
ncclResult_t result = CommCheck(env.comm, "TestOp", "comm");
EXPECT_EQ(result, ncclInvalidArgument) << "Failed for corrupted endMagic";
env.cleanup();
INFO("Test 'CommCheck_CorruptedEndMagic' PASSED\n");
}
);
}
TEST_F(ArgCheckTest, CommCheck_CorruptedBothMagics) {
// Corrupt both startMagic and endMagic
comm->startMagic = 1; // Corrupt startMagic
comm->endMagic = 1; // Corrupt endMagic
TEST(ArgCheckTest, CommCheck_CorruptedBothMagics)
{
RUN_ISOLATED_TEST(
"CommCheck_CorruptedBothMagics",
[]()
{
ArgCheckTestEnvironment env;
env.setup();
// Call CommCheck and verify the result
ncclResult_t result = CommCheck(comm, "TestOp", "comm");
EXPECT_EQ(result, ncclInvalidArgument)
<< "Failed for corrupted both magic values";
// Corrupt both startMagic and endMagic
env.comm->startMagic = 1; // Corrupt startMagic
env.comm->endMagic = 1; // Corrupt endMagic
// Call CommCheck and verify the result
ncclResult_t result = CommCheck(env.comm, "TestOp", "comm");
EXPECT_EQ(result, ncclInvalidArgument) << "Failed for corrupted both magic values";
env.cleanup();
INFO("Test 'CommCheck_CorruptedBothMagics' PASSED\n");
}
);
}
TEST_F(ArgCheckTest, ArgsCheck_InvalidRoot_NegativeValue) {
info->root = -1; // Invalid root (< 0)
TEST(ArgCheckTest, ArgsCheck_InvalidRoot_NegativeValue)
{
RUN_ISOLATED_TEST(
"ArgsCheck_InvalidRoot_NegativeValue",
[]()
{
ArgCheckTestEnvironment env;
env.setup();
ncclResult_t result = ArgsCheck(info);
EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid root < 0";
env.info->root = -1; // Invalid root (< 0)
ncclResult_t result = ArgsCheck(env.info);
EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid root < 0";
env.cleanup();
INFO("Test 'ArgsCheck_InvalidRoot_NegativeValue' PASSED\n");
}
);
}
TEST_F(ArgCheckTest, ArgsCheck_InvalidRoot_ExceedsNRanks) {
info->root = comm->nRanks; // Invalid root (>= nRanks)
TEST(ArgCheckTest, ArgsCheck_InvalidRoot_ExceedsNRanks)
{
RUN_ISOLATED_TEST(
"ArgsCheck_InvalidRoot_ExceedsNRanks",
[]()
{
ArgCheckTestEnvironment env;
env.setup();
ncclResult_t result = ArgsCheck(info);
EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid root >= nRanks";
env.info->root = env.comm->nRanks; // Invalid root (>= nRanks)
ncclResult_t result = ArgsCheck(env.info);
EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid root >= nRanks";
env.cleanup();
INFO("Test 'ArgsCheck_InvalidRoot_ExceedsNRanks' PASSED\n");
}
);
}
TEST_F(ArgCheckTest, ArgsCheck_InvalidDatatype_NegativeValue) {
info->datatype = (ncclDataType_t)-1; // Invalid datatype (< 0)
TEST(ArgCheckTest, ArgsCheck_InvalidDatatype_NegativeValue)
{
RUN_ISOLATED_TEST(
"ArgsCheck_InvalidDatatype_NegativeValue",
[]()
{
ArgCheckTestEnvironment env;
env.setup();
ncclResult_t result = ArgsCheck(info);
EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid datatype < 0";
env.info->datatype = (ncclDataType_t)-1; // Invalid datatype (< 0)
ncclResult_t result = ArgsCheck(env.info);
EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid datatype < 0";
env.cleanup();
INFO("Test 'ArgsCheck_InvalidDatatype_NegativeValue' PASSED\n");
}
);
}
TEST_F(ArgCheckTest, ArgsCheck_InvalidDatatype_ExceedsMaxValue) {
info->datatype =
(ncclDataType_t)ncclNumTypes; // Invalid datatype (>= ncclNumTypes)
TEST(ArgCheckTest, ArgsCheck_InvalidDatatype_ExceedsMaxValue)
{
RUN_ISOLATED_TEST(
"ArgsCheck_InvalidDatatype_ExceedsMaxValue",
[]()
{
ArgCheckTestEnvironment env;
env.setup();
ncclResult_t result = ArgsCheck(info);
EXPECT_EQ(result, ncclInvalidArgument)
<< "Failed for invalid datatype >= ncclNumTypes";
env.info->datatype = (ncclDataType_t)ncclNumTypes; // Invalid datatype (>= ncclNumTypes)
ncclResult_t result = ArgsCheck(env.info);
EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid datatype >= ncclNumTypes";
env.cleanup();
INFO("Test 'ArgsCheck_InvalidDatatype_ExceedsMaxValue' PASSED\n");
}
);
}
TEST_F(ArgCheckTest, ArgsCheck_InvalidReductionOperation_NegativeValue) {
info->op = (ncclRedOp_t)-1; // Invalid reduction operation (< 0)
TEST(ArgCheckTest, ArgsCheck_InvalidReductionOperation_NegativeValue)
{
RUN_ISOLATED_TEST(
"ArgsCheck_InvalidReductionOperation_NegativeValue",
[]()
{
ArgCheckTestEnvironment env;
env.setup();
ncclResult_t result = ArgsCheck(info);
EXPECT_EQ(result, ncclInvalidArgument)
<< "Failed for invalid reduction operation < 0";
env.info->op = (ncclRedOp_t)-1; // Invalid reduction operation (< 0)
ncclResult_t result = ArgsCheck(env.info);
EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid reduction operation < 0";
env.cleanup();
INFO("Test 'ArgsCheck_InvalidReductionOperation_NegativeValue' PASSED\n");
}
);
}
TEST_F(ArgCheckTest, ArgsCheck_InvalidReductionOperation_ExceedsMaxValue) {
info->op =
(ncclRedOp_t)ncclNumOps; // Invalid reduction operation (>= ncclNumOps)
TEST(ArgCheckTest, ArgsCheck_InvalidReductionOperation_ExceedsMaxValue)
{
RUN_ISOLATED_TEST(
"ArgsCheck_InvalidReductionOperation_ExceedsMaxValue",
[]()
{
ArgCheckTestEnvironment env;
env.setup();
ncclResult_t result = ArgsCheck(info);
EXPECT_EQ(result, ncclInvalidArgument)
<< "Failed for invalid reduction operation >= ncclNumOps";
env.info->op = (ncclRedOp_t)ncclNumOps; // Invalid reduction operation (>= ncclNumOps)
ncclResult_t result = ArgsCheck(env.info);
EXPECT_EQ(result, ncclInvalidArgument)
<< "Failed for invalid reduction operation >= ncclNumOps";
env.cleanup();
INFO("Test 'ArgsCheck_InvalidReductionOperation_ExceedsMaxValue' PASSED\n");
}
);
}
TEST_F(ArgCheckTest, ArgsCheck_InvalidCommunicatorPointers) {
info->op = (ncclRedOp_t)0; // Valid reduction operation
if (info->sendbuff) {
hipFree((void *)info->sendbuff);
info->sendbuff = nullptr; // Invalid send buffer
}
if (info->recvbuff) {
hipFree((void *)info->recvbuff);
info->recvbuff = nullptr; // Invalid receive buffer
}
TEST(ArgCheckTest, ArgsCheck_InvalidCommunicatorPointers)
{
RUN_ISOLATED_TEST(
"ArgsCheck_InvalidCommunicatorPointers",
[]()
{
ArgCheckTestEnvironment env;
env.setup();
ncclResult_t result = ArgsCheck(info);
EXPECT_EQ(result, ncclInvalidArgument)
<< "Failed for invalid communicator pointers";
env.info->op = (ncclRedOp_t)0; // Valid reduction operation
if(env.info->sendbuff)
{
hipFree((void*)env.info->sendbuff);
env.info->sendbuff = nullptr; // Invalid send buffer
}
if(env.info->recvbuff)
{
hipFree((void*)env.info->recvbuff);
env.info->recvbuff = nullptr; // Invalid receive buffer
}
ncclResult_t result = ArgsCheck(env.info);
EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid communicator pointers";
env.cleanup();
INFO("Test 'ArgsCheck_InvalidCommunicatorPointers' PASSED\n");
}
);
}
TEST_F(ArgCheckTest, ArgsCheck_InvalidReductionOperationOutOfRange) {
info->op = (ncclRedOp_t)5; // Invalid reduction operation (out of range)
TEST(ArgCheckTest, ArgsCheck_InvalidReductionOperationOutOfRange)
{
RUN_ISOLATED_TEST(
"ArgsCheck_InvalidReductionOperationOutOfRange",
[]()
{
ArgCheckTestEnvironment env;
env.setup();
ncclResult_t result = ArgsCheck(info);
EXPECT_EQ(result, ncclInvalidArgument)
<< "Failed for invalid reduction operation";
env.info->op = (ncclRedOp_t)5; // Invalid reduction operation (out of range)
ncclResult_t result = ArgsCheck(env.info);
EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid reduction operation";
env.cleanup();
INFO("Test 'ArgsCheck_InvalidReductionOperationOutOfRange' PASSED\n");
}
);
}
TEST_F(ArgCheckTest, ArgsCheck_UserDefinedReductionOperationInvalid) {
// Test case: User-defined reduction operation with freeNext != -1
info->op = (ncclRedOp_t)(ncclNumOps +
1); // Set op to a user-defined reduction operation
TEST(ArgCheckTest, ArgsCheck_UserDefinedReductionOperationInvalid)
{
RUN_ISOLATED_TEST(
"ArgsCheck_UserDefinedReductionOperationInvalid",
[]()
{
ArgCheckTestEnvironment env;
env.setup();
ncclResult_t result = ArgsCheck(info);
EXPECT_EQ(result, ncclInvalidArgument)
<< "Failed for user-defined reduction operation with freeNext != -1";
// Test case: User-defined reduction operation with freeNext != -1
env.info->op
= (ncclRedOp_t)(ncclNumOps + 1); // Set op to a user-defined reduction operation
ncclResult_t result = ArgsCheck(env.info);
EXPECT_EQ(result, ncclInvalidArgument)
<< "Failed for user-defined reduction operation with freeNext != -1";
env.cleanup();
INFO("Test 'ArgsCheck_UserDefinedReductionOperationInvalid' PASSED\n");
}
);
}
TEST_F(ArgCheckTest, ArgsCheck_SendAndRecvFunction) {
info->recvbuff =
recvDevicePtr; // Use allocated device pointer for receive buffer
TEST(ArgCheckTest, ArgsCheck_SendAndRecvFunction)
{
RUN_ISOLATED_TEST(
"ArgsCheck_SendAndRecvFunction",
[]()
{
ArgCheckTestEnvironment env;
env.setup();
// Test both ncclFuncSend and ncclFuncRecv
for (auto coll : {ncclFuncSend, ncclFuncRecv}) {
info->coll = coll; // Set the collective operation
env.info->recvbuff
= env.recvDevicePtr; // Use allocated device pointer for receive buffer
// Call ArgsCheck and verify the result
ncclResult_t result = ArgsCheck(info);
ASSERT_EQ(result, ncclSuccess) << "Failed for coll = " << coll;
}
// Test both ncclFuncSend and ncclFuncRecv
for(auto coll : {ncclFuncSend, ncclFuncRecv})
{
env.info->coll = coll; // Set the collective operation
// Call ArgsCheck and verify the result
ncclResult_t result = ArgsCheck(env.info);
ASSERT_EQ(result, ncclSuccess) << "Failed for coll = " << coll;
}
env.cleanup();
INFO("Test 'ArgsCheck_SendAndRecvFunction' PASSED\n");
}
);
}
TEST_F(ArgCheckTest, ArgsCheck_CollNotReduce) {
// Case: info->coll != ncclFuncReduce
info->coll = ncclFuncBroadcast; // Set coll to ncclFuncBroadcast
TEST(ArgCheckTest, ArgsCheck_CollNotReduce)
{
RUN_ISOLATED_TEST(
"ArgsCheck_CollNotReduce",
[]()
{
ArgCheckTestEnvironment env;
env.setup();
ncclResult_t result = ArgsCheck(info);
EXPECT_EQ(result, ncclSuccess) << "Failed for coll != ncclFuncReduce";
// Case: env.info->coll != ncclFuncReduce
env.info->coll = ncclFuncBroadcast; // Set coll to ncclFuncBroadcast
ncclResult_t result = ArgsCheck(env.info);
EXPECT_EQ(result, ncclSuccess) << "Failed for coll != ncclFuncReduce";
env.cleanup();
INFO("Test 'ArgsCheck_CollNotReduce' PASSED\n");
}
);
}
TEST_F(ArgCheckTest, ArgsCheck_ReduceCollWithRootRank) {
// Case: info->coll == ncclFuncReduce and info->comm->rank == info->root
info->coll = ncclFuncReduce; // Set coll to ncclFuncReduce
TEST(ArgCheckTest, ArgsCheck_ReduceCollWithRootRank)
{
RUN_ISOLATED_TEST(
"ArgsCheck_ReduceCollWithRootRank",
[]()
{
ArgCheckTestEnvironment env;
env.setup();
ncclResult_t result = ArgsCheck(info);
EXPECT_EQ(result, ncclSuccess)
<< "Failed for coll == ncclFuncReduce and rank == root";
// Case: env.info->coll == ncclFuncReduce and env.info->env.comm->rank == env.info->root
env.info->coll = ncclFuncReduce; // Set coll to ncclFuncReduce
ncclResult_t result = ArgsCheck(env.info);
EXPECT_EQ(result, ncclSuccess) << "Failed for coll == ncclFuncReduce and rank == root";
env.cleanup();
INFO("Test 'ArgsCheck_ReduceCollWithRootRank' PASSED\n");
}
);
}
TEST_F(ArgCheckTest, ArgsCheck_ReduceCollWithNonRootRank) {
comm->rank = 1; // Set rank to 1 (non-root)
TEST(ArgCheckTest, ArgsCheck_ReduceCollWithNonRootRank)
{
RUN_ISOLATED_TEST(
"ArgsCheck_ReduceCollWithNonRootRank",
[]()
{
ArgCheckTestEnvironment env;
env.setup();
ncclResult_t result = ArgsCheck(info);
EXPECT_EQ(result, ncclSuccess)
<< "Failed for coll == ncclFuncReduce and rank != root";
env.comm->rank = 1; // Set rank to 1 (non-root)
ncclResult_t result = ArgsCheck(env.info);
EXPECT_EQ(result, ncclSuccess) << "Failed for coll == ncclFuncReduce and rank != root";
env.cleanup();
INFO("Test 'ArgsCheck_ReduceCollWithNonRootRank' PASSED\n");
}
);
}
+1
Просмотреть файл
@@ -207,6 +207,7 @@ if(BUILD_TESTS)
TransportTests.cpp
common/main_fixtures.cpp
common/EnvVars.cpp
common/ProcessIsolatedTestRunner.cpp
graph/XmlTests.cpp
)
+593 -198
Просмотреть файл
@@ -4,27 +4,120 @@
* See LICENSE.txt for license information
************************************************************************/
#include <gtest/gtest.h>
#include <cstring>
#include <hip/hip_runtime.h>
#include <cstring>
#include "comm.h"
#include "info.h"
#include "common/ProcessIsolatedTestRunner.hpp"
#include "enqueue.h"
#include "info.h"
#include "utils.h"
class EnqueueTests : public ::testing::Test {
protected:
namespace RcclUnitTesting
{
// Simple test kernel for validating ncclInitKernelsForDevice
__global__ void simpleTestKernel(int* data)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if(data)
data[tid] = tid;
}
// Helper function to test ncclInitKernelsForDevice with a real kernel
ncclResult_t testKernelAttributes(void* kernelFn, size_t* maxStackSize)
{
if(!kernelFn || !maxStackSize)
return ncclInvalidArgument;
*maxStackSize = 0;
hipFuncAttributes attr = {0};
hipError_t errcode = hipFuncGetAttributes(&attr, kernelFn);
if(errcode != hipSuccess)
return ncclSystemError;
*maxStackSize = attr.localSizeBytes;
return ncclSuccess; // ncclSuccess
}
// Helper function to test shared memory limit checking with a real kernel
// ncclMaxSharedMem: For gfx906 (cudaArch 906) with WarpSize 64, this is typically 32832 bytes
ncclResult_t testKernelSharedMemoryLimit(
void* kernelFn, int cudaArch, int maxSharedMem, size_t* maxStackSize, int ncclMaxSharedMem
)
{
if(!kernelFn)
return ncclInvalidArgument;
ncclResult_t result = ncclSuccess;
if(maxStackSize)
*maxStackSize = 0;
hipFuncAttributes attr = {0};
hipError_t errcode = hipFuncGetAttributes(&attr, kernelFn);
if(errcode != hipSuccess)
{
return ncclSystemError;
}
if(maxStackSize)
{
*maxStackSize = attr.localSizeBytes;
}
// Test the shared memory limit check (mimics enqueue.cc lines 135-146)
if(ncclMaxSharedMem != 0)
{
int sharedMemSize = ncclMaxSharedMem;
if(sharedMemSize > (maxSharedMem - attr.sharedSizeBytes))
{
WARN(
"cudaArch %d ncclMaxSharedMem %d exceeds device/fn maxSharedMem %zu",
cudaArch,
sharedMemSize,
maxSharedMem - attr.sharedSizeBytes
);
return ncclSystemError;
}
}
return result;
}
// Helper structure to hold test environment
struct EnqueueTestEnvironment
{
ncclComm* comm;
ncclInfo* info;
void* sendbuff;
void* recvbuff;
static uint32_t abortFlag0, abortFlag1;
static int abortFlagRefCount;
void* sendbuff;
void* recvbuff;
uint32_t abortFlag0;
uint32_t abortFlag1;
int abortFlagRefCount;
void SetUp() override {
EnqueueTestEnvironment()
: comm(nullptr)
, info(nullptr)
, sendbuff(nullptr)
, recvbuff(nullptr)
, abortFlag0(0)
, abortFlag1(0)
, abortFlagRefCount(0)
{}
~EnqueueTestEnvironment()
{
cleanup();
}
void setup()
{
// Allocate GPU memory for buffers
size_t bufferSize = 1024 * sizeof(float);
hipError_t hipErr = hipMalloc(&sendbuff, bufferSize);
size_t bufferSize = 1024 * sizeof(float);
hipError_t hipErr = hipMalloc(&sendbuff, bufferSize);
ASSERT_EQ(hipErr, hipSuccess) << "Failed to allocate sendbuff";
hipErr = hipMalloc(&recvbuff, bufferSize);
@@ -34,17 +127,17 @@ protected:
comm = new ncclComm();
memset(comm, 0, sizeof(ncclComm));
comm->startMagic = NCCL_MAGIC; // 0x0280028002800280
comm->startMagic = NCCL_MAGIC; // 0x0280028002800280
// Initialize critical fields
comm->rank = 0;
comm->nRanks = 2;
comm->cudaDev = 0;
comm->rank = 0;
comm->nRanks = 2;
comm->cudaDev = 0;
comm->localRank = 0;
// Initialize abort flags
comm->abortFlag = &abortFlag0;
comm->childAbortFlag = &abortFlag1;
comm->abortFlag = &abortFlag0;
comm->childAbortFlag = &abortFlag1;
comm->abortFlagRefCount = &abortFlagRefCount;
// Initialize memory stack
@@ -53,15 +146,15 @@ protected:
// Initialize intra-communication pointers
comm->intraComm0 = nullptr;
comm->intraNext = nullptr;
comm->intraNext = nullptr;
// Initialize work FIFO structures
comm->workFifoBytes = 1024; // Power of 2
comm->workFifoBuf = nullptr;
comm->workFifoBufDev = nullptr;
comm->workFifoConsumed = 0;
comm->workFifoBytes = 1024; // Power of 2
comm->workFifoBuf = nullptr;
comm->workFifoBufDev = nullptr;
comm->workFifoConsumed = 0;
comm->workFifoProducedLastRecorded = 0;
comm->workFifoProduced = 0;
comm->workFifoProduced = 0;
// Initialize planner
memset(&comm->planner, 0, sizeof(comm->planner));
@@ -69,254 +162,556 @@ protected:
// Initialize config
memset(&comm->config, 0, sizeof(comm->config));
comm->config.blocking = 1;
comm->checkPointers = 0; // Disable pointer validation for easier testing
comm->checkPointers = 0; // Disable pointer validation for easier testing
// Initialize peer info arrays
comm->peerInfo = new ncclPeerInfo[comm->nRanks];
memset(comm->peerInfo, 0, comm->nRanks * sizeof(ncclPeerInfo));
comm->localRankToRank = new int[comm->nRanks];
for (int i = 0; i < comm->nRanks; i++) {
for(int i = 0; i < comm->nRanks; i++)
{
comm->localRankToRank[i] = i;
}
comm->endMagic = NCCL_MAGIC; // 0x0280028002800280
comm->endMagic = NCCL_MAGIC; // 0x0280028002800280
// Initialize operation info with valid GPU buffers
info = new ncclInfo();
memset(info, 0, sizeof(ncclInfo));
info->comm = comm;
info->opName = "AllReduce";
info->count = 1024;
info->comm = comm;
info->opName = "AllReduce";
info->count = 1024;
info->datatype = ncclFloat;
info->op = ncclSum;
info->root = 0;
info->sendbuff = sendbuff; // Use allocated GPU memory
info->recvbuff = recvbuff; // Use allocated GPU memory
info->stream = nullptr;
info->op = ncclSum;
info->root = 0;
info->sendbuff = sendbuff; // Use allocated GPU memory
info->recvbuff = recvbuff; // Use allocated GPU memory
info->stream = nullptr;
}
void TearDown() override {
if (sendbuff) {
hipFree(sendbuff);
void cleanup()
{
// Clean up info first (it references comm)
if(info)
{
delete info;
info = nullptr;
}
if (recvbuff) {
hipFree(recvbuff);
}
if (comm) {
// Clean up comm and its allocated resources
if(comm)
{
// Clean up memory stacks
ncclMemoryStackDestruct(&comm->memScoped);
ncclMemoryStackDestruct(&comm->memPermanent);
delete[] comm->peerInfo;
delete[] comm->localRankToRank;
// Clean up peer info arrays
if(comm->peerInfo)
{
delete[] comm->peerInfo;
comm->peerInfo = nullptr;
}
if(comm->localRankToRank)
{
delete[] comm->localRankToRank;
comm->localRankToRank = nullptr;
}
delete comm;
comm = nullptr;
}
if (info) {
delete info;
// Clean up GPU buffers last
if(sendbuff)
{
hipError_t err = hipFree(sendbuff);
if(err != hipSuccess)
{
// Log error but don't throw in cleanup
fprintf(stderr, "Warning: hipFree(sendbuff) failed with error %d\n", err);
}
sendbuff = nullptr;
}
if(recvbuff)
{
hipError_t err = hipFree(recvbuff);
if(err != hipSuccess)
{
// Log error but don't throw in cleanup
fprintf(stderr, "Warning: hipFree(recvbuff) failed with error %d\n", err);
}
recvbuff = nullptr;
}
}
};
// Static member definitions
uint32_t EnqueueTests::abortFlag0 = 0;
uint32_t EnqueueTests::abortFlag1 = 0;
int EnqueueTests::abortFlagRefCount = 0;
// Empty test fixture for test organization
class EnqueueTests : public ::testing::Test
{
// No setup/teardown - all tests use process isolation
};
// Test ncclInitKernelsForDevice function
TEST_F(EnqueueTests, ncclInitKernelsForDevice_ValidInput) {
size_t maxStackSize = 0;
ncclResult_t result = ncclInitKernelsForDevice(906, 65536, &maxStackSize);
TEST_F(EnqueueTests, ncclInitKernelsForDevice_ValidInput)
{
ProcessIsolatedTestRunner::ExecutionOptions options;
options.stopOnFirstFailure = false; // Continue running all tests
options.verboseLogging = true;
EXPECT_TRUE(result == ncclSuccess);
EXPECT_GT(maxStackSize, 0);
RUN_ISOLATED_TESTS_WITH_OPTIONS(
options,
ProcessIsolatedTestRunner::TestConfig(
"ncclInitKernelsForDevice_ValidInput",
[this]()
{
size_t maxStackSize = 0;
ncclResult_t result = ncclInitKernelsForDevice(906, 65536, &maxStackSize);
EXPECT_TRUE(result == ncclSuccess);
// maxStackSize should be set to a reasonable value (> 0)
EXPECT_GT(maxStackSize, 0)
<< "Expected maxStackSize to be computed and set to a positive value";
}
).withEnvironment({{"NCCL_DEBUG", "INFO"}, {"NCCL_DEBUG_SUBSYS", "ALL"}}),
ProcessIsolatedTestRunner::TestConfig(
"ncclInitKernelsForDevice_ValidInputCarveout",
[this]()
{
size_t maxStackSize = 0;
ncclResult_t result = ncclInitKernelsForDevice(906, 65536, &maxStackSize);
EXPECT_TRUE(result == ncclSuccess);
// maxStackSize should be set to a reasonable value (> 0)
EXPECT_GT(maxStackSize, 0)
<< "Expected maxStackSize to be computed and set to a positive value";
}
)
.withEnvironment(
{{"NCCL_L1_SHARED_MEMORY_CARVEOUT", "1"},
{"NCCL_DEBUG", "INFO"},
{"NCCL_DEBUG_SUBSYS", "ALL"}}
)
);
}
TEST_F(EnqueueTests, ncclInitKernelsForDevice_NullStackSize) {
ncclResult_t result = ncclInitKernelsForDevice(906, 65536, nullptr);
TEST_F(EnqueueTests, ncclInitKernelsForDevice_NullStackSize)
{
ProcessIsolatedTestRunner::ExecutionOptions options;
options.stopOnFirstFailure = false;
options.verboseLogging = true;
EXPECT_EQ(result, ncclSuccess);
RUN_ISOLATED_TESTS_WITH_OPTIONS(
options,
ProcessIsolatedTestRunner::TestConfig(
"ncclInitKernelsForDevice_NullStackSize",
[]()
{
ncclResult_t result = ncclInitKernelsForDevice(906, 65536, nullptr);
EXPECT_EQ(result, ncclSuccess);
}
)
);
}
TEST_F(EnqueueTests, ncclInitKernelsForDevice_InvalidArch) {
size_t maxStackSize = 0;
ncclResult_t result = ncclInitKernelsForDevice(-1, 65536, &maxStackSize);
EXPECT_EQ(result, ncclSuccess);
// Test with a real compiled kernel to verify attribute retrieval works correctly
TEST_F(EnqueueTests, KernelAttributes_WithRealKernel)
{
ProcessIsolatedTestRunner::ExecutionOptions options;
options.stopOnFirstFailure = false;
options.verboseLogging = true;
RUN_ISOLATED_TESTS_WITH_OPTIONS(
options,
ProcessIsolatedTestRunner::TestConfig(
"KernelAttributes_WithRealKernel",
[]()
{
size_t maxStackSize = 0;
ncclResult_t result = testKernelAttributes((void*)simpleTestKernel, &maxStackSize);
EXPECT_EQ(result, ncclSuccess)
<< "Expected successful kernel attribute retrieval with a real compiled kernel";
}
).withEnvironment({{"NCCL_DEBUG", "INFO"}})
);
}
TEST_F(EnqueueTests, ncclInitKernelsForDevice_ExceedsSharedMemory) {
size_t maxStackSize = 0;
TEST_F(EnqueueTests, ncclInitKernelsForDevice_InvalidArch)
{
ProcessIsolatedTestRunner::ExecutionOptions options;
options.stopOnFirstFailure = false;
options.verboseLogging = true;
ncclResult_t result = ncclInitKernelsForDevice(906, 32832, &maxStackSize);
EXPECT_TRUE(result == ncclSystemError);
RUN_ISOLATED_TESTS_WITH_OPTIONS(
options,
ProcessIsolatedTestRunner::TestConfig(
"ncclInitKernelsForDevice_InvalidArch",
[]()
{
size_t maxStackSize = 0;
ncclResult_t result = ncclInitKernelsForDevice(-1, 65536, &maxStackSize);
EXPECT_EQ(result, ncclSuccess);
}
)
);
}
TEST_F(EnqueueTests, ncclInitKernelsForDevice_ExceedsSharedMemory)
{
ProcessIsolatedTestRunner::ExecutionOptions options;
options.stopOnFirstFailure = false;
options.verboseLogging = true;
RUN_ISOLATED_TESTS_WITH_OPTIONS(
options,
ProcessIsolatedTestRunner::TestConfig(
"ncclInitKernelsForDevice_ExceedsSharedMemory",
[]()
{
size_t maxStackSize = 0;
// For gfx906, ncclMaxSharedMem is 32832 (as shown in test output)
// Use a very small maxSharedMem (16000 bytes) to trigger the exceeds check
ncclResult_t result = testKernelSharedMemoryLimit(
(void*)simpleTestKernel, // Use our real compiled kernel
906, // cudaArch
16000, // maxSharedMem (intentionally too small)
&maxStackSize,
32832 // ncclMaxSharedMem for gfx906
);
EXPECT_EQ(result, ncclSystemError)
<< "Expected ncclSystemError when ncclMaxSharedMem exceeds maxSharedMem";
}
).withEnvironment({{"NCCL_DEBUG", "WARN"}})
);
}
// Test ncclEnqueueCheck function
TEST_F(EnqueueTests, ncclEnqueueCheck_ValidInput) {
ncclResult_t result = ncclEnqueueCheck(info);
EXPECT_TRUE(result == ncclSuccess);
TEST_F(EnqueueTests, ncclEnqueueCheck_ValidInput)
{
ProcessIsolatedTestRunner::ExecutionOptions options;
options.stopOnFirstFailure = false;
options.verboseLogging = true;
RUN_ISOLATED_TESTS_WITH_OPTIONS(
options,
ProcessIsolatedTestRunner::TestConfig(
"ncclEnqueueCheck_ValidInput",
[]()
{
EnqueueTestEnvironment env;
env.setup();
ncclResult_t result = ncclEnqueueCheck(env.info);
EXPECT_TRUE(result == ncclSuccess);
env.cleanup();
}
)
);
}
TEST_F(EnqueueTests, ncclEnqueueCheck_InvalidComm) {
info->comm = nullptr;
ncclResult_t result = ncclEnqueueCheck(info);
EXPECT_EQ(result, ncclInvalidArgument);
TEST_F(EnqueueTests, ncclEnqueueCheck_InvalidComm)
{
ProcessIsolatedTestRunner::ExecutionOptions options;
options.stopOnFirstFailure = false;
options.verboseLogging = true;
RUN_ISOLATED_TESTS_WITH_OPTIONS(
options,
ProcessIsolatedTestRunner::TestConfig(
"ncclEnqueueCheck_InvalidComm",
[]()
{
EnqueueTestEnvironment env;
env.setup();
env.info->comm = nullptr;
ncclResult_t result = ncclEnqueueCheck(env.info);
EXPECT_EQ(result, ncclInvalidArgument);
env.cleanup();
}
)
);
}
TEST_F(EnqueueTests, ncclEnqueueCheck_InvalidBuffers) {
// Test with null sendbuff
comm->checkPointers = 1;
info->sendbuff = nullptr;
ncclResult_t result = ncclEnqueueCheck(info);
EXPECT_EQ(result, ncclInvalidArgument);
TEST_F(EnqueueTests, ncclEnqueueCheck_InvalidBuffers)
{
ProcessIsolatedTestRunner::ExecutionOptions options;
options.stopOnFirstFailure = false;
options.verboseLogging = true;
// Reset sendbuff and test with null recvbuff
info->sendbuff = sendbuff;
info->recvbuff = nullptr;
result = ncclEnqueueCheck(info);
EXPECT_EQ(result, ncclInvalidArgument);
RUN_ISOLATED_TESTS_WITH_OPTIONS(
options,
ProcessIsolatedTestRunner::TestConfig(
"ncclEnqueueCheck_InvalidBuffers",
[]()
{
EnqueueTestEnvironment env;
env.setup();
// Test with null sendbuff
env.comm->checkPointers = 1;
env.info->sendbuff = nullptr;
ncclResult_t result = ncclEnqueueCheck(env.info);
EXPECT_EQ(result, ncclInvalidArgument);
// Reset sendbuff and test with null recvbuff
env.info->sendbuff = env.sendbuff;
env.info->recvbuff = nullptr;
result = ncclEnqueueCheck(env.info);
EXPECT_EQ(result, ncclInvalidArgument);
env.cleanup();
}
)
);
}
// Test ncclFuncSendCount function
TEST_F(EnqueueTests, ncclFuncSendCount_AllReduce) {
size_t count = 1000;
int nRanks = 4;
TEST_F(EnqueueTests, ncclFuncSendCount_AllTests)
{
ProcessIsolatedTestRunner::ExecutionOptions options;
options.stopOnFirstFailure = false;
options.verboseLogging = true;
size_t result = ncclFuncSendCount(ncclFuncAllReduce, nRanks, count);
EXPECT_EQ(result, count);
}
RUN_ISOLATED_TESTS_WITH_OPTIONS(
options,
ProcessIsolatedTestRunner::TestConfig(
"ncclFuncSendCount_AllReduce",
[]()
{
size_t count = 1000;
int nRanks = 4;
size_t result = ncclFuncSendCount(ncclFuncAllReduce, nRanks, count);
EXPECT_EQ(result, count);
}
),
TEST_F(EnqueueTests, ncclFuncSendCount_Broadcast) {
size_t count = 1000;
int nRanks = 4;
ProcessIsolatedTestRunner::TestConfig(
"ncclFuncSendCount_Broadcast",
[]()
{
size_t count = 1000;
int nRanks = 4;
size_t result = ncclFuncSendCount(ncclFuncBroadcast, nRanks, count);
EXPECT_EQ(result, count);
}
),
size_t result = ncclFuncSendCount(ncclFuncBroadcast, nRanks, count);
EXPECT_EQ(result, count);
}
ProcessIsolatedTestRunner::TestConfig(
"ncclFuncSendCount_Reduce",
[]()
{
size_t count = 1000;
int nRanks = 4;
size_t result = ncclFuncSendCount(ncclFuncReduce, nRanks, count);
EXPECT_EQ(result, count);
}
),
TEST_F(EnqueueTests, ncclFuncSendCount_Reduce) {
size_t count = 1000;
int nRanks = 4;
ProcessIsolatedTestRunner::TestConfig(
"ncclFuncSendCount_AllGather",
[]()
{
size_t count = 1000;
int nRanks = 4;
size_t result = ncclFuncSendCount(ncclFuncAllGather, nRanks, count);
EXPECT_EQ(result, count);
}
),
size_t result = ncclFuncSendCount(ncclFuncReduce, nRanks, count);
EXPECT_EQ(result, count);
}
ProcessIsolatedTestRunner::TestConfig(
"ncclFuncSendCount_ReduceScatter",
[]()
{
size_t count = 1000;
int nRanks = 4;
size_t result = ncclFuncSendCount(ncclFuncReduceScatter, nRanks, count);
EXPECT_EQ(result, count * nRanks);
}
),
TEST_F(EnqueueTests, ncclFuncSendCount_AllGather) {
size_t count = 1000;
int nRanks = 4;
size_t result = ncclFuncSendCount(ncclFuncAllGather, nRanks, count);
EXPECT_EQ(result, count);
}
TEST_F(EnqueueTests, ncclFuncSendCount_ReduceScatter) {
size_t count = 1000;
int nRanks = 4;
size_t result = ncclFuncSendCount(ncclFuncReduceScatter, nRanks, count);
EXPECT_EQ(result, count * nRanks);
}
TEST_F(EnqueueTests, ncclFuncSendCount_ZeroCount) {
size_t result = ncclFuncSendCount(ncclFuncAllReduce, 4, 0);
EXPECT_EQ(result, 0);
ProcessIsolatedTestRunner::TestConfig(
"ncclFuncSendCount_ZeroCount",
[]()
{
size_t result = ncclFuncSendCount(ncclFuncAllReduce, 4, 0);
EXPECT_EQ(result, 0);
}
)
);
}
// Test ncclFuncRecvCount function
TEST_F(EnqueueTests, ncclFuncRecvCount_AllReduce) {
size_t count = 1000;
int nRanks = 4;
TEST_F(EnqueueTests, ncclFuncRecvCount_AllTests)
{
ProcessIsolatedTestRunner::ExecutionOptions options;
options.stopOnFirstFailure = false;
options.verboseLogging = true;
size_t result = ncclFuncRecvCount(ncclFuncAllReduce, nRanks, count);
EXPECT_EQ(result, count);
}
RUN_ISOLATED_TESTS_WITH_OPTIONS(
options,
ProcessIsolatedTestRunner::TestConfig(
"ncclFuncRecvCount_AllReduce",
[]()
{
size_t count = 1000;
int nRanks = 4;
size_t result = ncclFuncRecvCount(ncclFuncAllReduce, nRanks, count);
EXPECT_EQ(result, count);
}
),
TEST_F(EnqueueTests, ncclFuncRecvCount_Broadcast) {
size_t count = 1000;
int nRanks = 4;
ProcessIsolatedTestRunner::TestConfig(
"ncclFuncRecvCount_Broadcast",
[]()
{
size_t count = 1000;
int nRanks = 4;
size_t result = ncclFuncRecvCount(ncclFuncBroadcast, nRanks, count);
EXPECT_EQ(result, count);
}
),
size_t result = ncclFuncRecvCount(ncclFuncBroadcast, nRanks, count);
EXPECT_EQ(result, count);
}
ProcessIsolatedTestRunner::TestConfig(
"ncclFuncRecvCount_Reduce",
[]()
{
size_t count = 1000;
int nRanks = 4;
size_t result = ncclFuncRecvCount(ncclFuncReduce, nRanks, count);
EXPECT_EQ(result, count);
}
),
TEST_F(EnqueueTests, ncclFuncRecvCount_Reduce) {
size_t count = 1000;
int nRanks = 4;
ProcessIsolatedTestRunner::TestConfig(
"ncclFuncRecvCount_AllGather",
[]()
{
size_t count = 1000;
int nRanks = 4;
size_t result = ncclFuncRecvCount(ncclFuncAllGather, nRanks, count);
EXPECT_EQ(result, count * nRanks);
}
),
size_t result = ncclFuncRecvCount(ncclFuncReduce, nRanks, count);
EXPECT_EQ(result, count);
}
ProcessIsolatedTestRunner::TestConfig(
"ncclFuncRecvCount_ReduceScatter",
[]()
{
size_t count = 1000;
int nRanks = 4;
size_t result = ncclFuncRecvCount(ncclFuncReduceScatter, nRanks, count);
EXPECT_EQ(result, count);
}
),
TEST_F(EnqueueTests, ncclFuncRecvCount_AllGather) {
size_t count = 1000;
int nRanks = 4;
size_t result = ncclFuncRecvCount(ncclFuncAllGather, nRanks, count);
EXPECT_EQ(result, count * nRanks);
}
TEST_F(EnqueueTests, ncclFuncRecvCount_ReduceScatter) {
size_t count = 1000;
int nRanks = 4;
size_t result = ncclFuncRecvCount(ncclFuncReduceScatter, nRanks, count);
EXPECT_EQ(result, count);
}
TEST_F(EnqueueTests, ncclFuncRecvCount_ZeroCount) {
size_t result = ncclFuncRecvCount(ncclFuncAllReduce, 4, 0);
EXPECT_EQ(result, 0);
ProcessIsolatedTestRunner::TestConfig(
"ncclFuncRecvCount_ZeroCount",
[]()
{
size_t result = ncclFuncRecvCount(ncclFuncAllReduce, 4, 0);
EXPECT_EQ(result, 0);
}
)
);
}
// Test ncclFuncMaxSendRecvCount function
TEST_F(EnqueueTests, ncclFuncMaxSendRecvCount_AllReduce) {
size_t count = 1000;
int nRanks = 4;
TEST_F(EnqueueTests, ncclFuncMaxSendRecvCount_AllTests)
{
ProcessIsolatedTestRunner::ExecutionOptions options;
options.stopOnFirstFailure = false;
options.verboseLogging = true;
size_t result = ncclFuncMaxSendRecvCount(ncclFuncAllReduce, nRanks, count);
EXPECT_EQ(result, count);
}
RUN_ISOLATED_TESTS_WITH_OPTIONS(
options,
ProcessIsolatedTestRunner::TestConfig(
"ncclFuncMaxSendRecvCount_AllReduce",
[]()
{
size_t count = 1000;
int nRanks = 4;
size_t result = ncclFuncMaxSendRecvCount(ncclFuncAllReduce, nRanks, count);
EXPECT_EQ(result, count);
}
),
TEST_F(EnqueueTests, ncclFuncMaxSendRecvCount_AllGather) {
size_t count = 1000;
int nRanks = 4;
ProcessIsolatedTestRunner::TestConfig(
"ncclFuncMaxSendRecvCount_AllGather",
[]()
{
size_t count = 1000;
int nRanks = 4;
size_t result = ncclFuncMaxSendRecvCount(ncclFuncAllGather, nRanks, count);
// For AllGather, receive count (count * nRanks) is larger than send count (count)
EXPECT_EQ(result, count * nRanks);
}
),
size_t result = ncclFuncMaxSendRecvCount(ncclFuncAllGather, nRanks, count);
// For AllGather, receive count (count * nRanks) is larger than send count (count)
EXPECT_EQ(result, count * nRanks);
}
ProcessIsolatedTestRunner::TestConfig(
"ncclFuncMaxSendRecvCount_ReduceScatter",
[]()
{
size_t count = 1000;
int nRanks = 4;
size_t result = ncclFuncMaxSendRecvCount(ncclFuncReduceScatter, nRanks, count);
// For ReduceScatter, send count (count) is larger than receive count (count/nRanks)
EXPECT_EQ(result, count * nRanks);
}
),
TEST_F(EnqueueTests, ncclFuncMaxSendRecvCount_ReduceScatter) {
size_t count = 1000;
int nRanks = 4;
size_t result = ncclFuncMaxSendRecvCount(ncclFuncReduceScatter, nRanks, count);
// For ReduceScatter, send count (count) is larger than receive count (count/nRanks)
EXPECT_EQ(result, count * nRanks);
}
TEST_F(EnqueueTests, ncclFuncMaxSendRecvCount_ZeroCount) {
size_t result = ncclFuncMaxSendRecvCount(ncclFuncAllReduce, 4, 0);
EXPECT_EQ(result, 0);
ProcessIsolatedTestRunner::TestConfig(
"ncclFuncMaxSendRecvCount_ZeroCount",
[]()
{
size_t result = ncclFuncMaxSendRecvCount(ncclFuncAllReduce, 4, 0);
EXPECT_EQ(result, 0);
}
)
);
}
// Edge case tests
TEST_F(EnqueueTests, ncclFuncCounts_SingleRank) {
size_t count = 1000;
int nRanks = 1;
TEST_F(EnqueueTests, ncclFuncCounts_EdgeCases)
{
ProcessIsolatedTestRunner::ExecutionOptions options;
options.stopOnFirstFailure = false;
options.verboseLogging = true;
// Test with single rank
EXPECT_EQ(ncclFuncSendCount(ncclFuncAllReduce, nRanks, count), count);
EXPECT_EQ(ncclFuncRecvCount(ncclFuncAllReduce, nRanks, count), count);
EXPECT_EQ(ncclFuncMaxSendRecvCount(ncclFuncAllReduce, nRanks, count), count);
RUN_ISOLATED_TESTS_WITH_OPTIONS(
options,
ProcessIsolatedTestRunner::TestConfig(
"ncclFuncCounts_SingleRank",
[]()
{
size_t count = 1000;
int nRanks = 1;
// Test with single rank
EXPECT_EQ(ncclFuncSendCount(ncclFuncAllReduce, nRanks, count), count);
EXPECT_EQ(ncclFuncRecvCount(ncclFuncAllReduce, nRanks, count), count);
EXPECT_EQ(ncclFuncMaxSendRecvCount(ncclFuncAllReduce, nRanks, count), count);
}
),
ProcessIsolatedTestRunner::TestConfig(
"ncclFuncCounts_LargeRankCount",
[]()
{
size_t count = 1000;
int nRanks = 1024;
// Test with large number of ranks
EXPECT_EQ(ncclFuncSendCount(ncclFuncAllGather, nRanks, count), count);
EXPECT_EQ(ncclFuncRecvCount(ncclFuncAllGather, nRanks, count), count * nRanks);
EXPECT_EQ(
ncclFuncMaxSendRecvCount(ncclFuncAllGather, nRanks, count),
count * nRanks
);
}
)
);
}
TEST_F(EnqueueTests, ncclFuncCounts_LargeRankCount) {
size_t count = 1000;
int nRanks = 1024;
// Test with large number of ranks
EXPECT_EQ(ncclFuncSendCount(ncclFuncAllGather, nRanks, count), count);
EXPECT_EQ(ncclFuncRecvCount(ncclFuncAllGather, nRanks, count), count * nRanks);
EXPECT_EQ(ncclFuncMaxSendRecvCount(ncclFuncAllGather, nRanks, count), count * nRanks);
}
} // namespace RcclUnitTesting
+379 -282
Просмотреть файл
@@ -3,8 +3,8 @@
*
* See LICENSE.txt for license information
************************************************************************/
#include "net.h"
#include "common/ProcessIsolatedTestRunner.hpp"
#include "gtest/gtest.h"
#include <atomic>
#include <cstring>
@@ -612,6 +612,169 @@ protected:
return static_cast<int>(result);
}
void RunConcurrentOperationsTaskCreationWithEnvVars() {
INFO(NCCL_LOG_INFO, "Checking socket configuration environment variables");
// Check if the required environment variables are set
const char *nThreadsEnv = getenv("NCCL_SOCKET_NTHREADS");
const char *nSocksPerThreadEnv = getenv("NCCL_NSOCKS_PERTHREAD");
if (!nThreadsEnv || !nSocksPerThreadEnv) {
GTEST_SKIP() << "SKIPPING TEST: Required environment variables not set. "
<< "Please set the following environment variables to run this test: "
<< "export NCCL_SOCKET_NTHREADS=1 and export NCCL_NSOCKS_PERTHREAD=2. "
<< "This ensures nSocks > 0 so that ncclNetSocketGetTask gets called. "
<< "Environment variables NCCL_SOCKET_NTHREADS and NCCL_NSOCKS_PERTHREAD must be set";
return;
}
int nThreads = ParseEnvVar(nThreadsEnv, "NCCL_SOCKET_NTHREADS", 0, 1);
int nSocksPerThread = ParseEnvVar(nSocksPerThreadEnv, "NCCL_NSOCKS_PERTHREAD", 0, 1);
// Additional validation for reasonable upper bounds
const int MAX_THREADS = 16;
const int MAX_SOCKS_PER_THREAD = 64;
const int MAX_TOTAL_SOCKETS = 64;
if (nThreads > MAX_THREADS) {
GTEST_SKIP() << "SKIPPING TEST: NCCL_SOCKET_NTHREADS=" << nThreads << " exceeds maximum " << MAX_THREADS << ". "
<< "Please provide a reasonable value (e.g., NCCL_SOCKET_NTHREADS=8). "
<< "Values too large may cause resource exhaustion.";
return;
}
if (nSocksPerThread > MAX_SOCKS_PER_THREAD) {
GTEST_SKIP() << "SKIPPING TEST: NCCL_NSOCKS_PERTHREAD=" << nSocksPerThread << " exceeds maximum " << MAX_SOCKS_PER_THREAD << ". "
<< "Please provide a reasonable value (e.g., NCCL_NSOCKS_PERTHREAD=4). "
<< "Values too large may cause resource exhaustion.";
return;
}
// Check for potential overflow before multiplication
if (nThreads > 0 && nSocksPerThread > INT_MAX / nThreads) {
GTEST_SKIP() << "SKIPPING TEST: Configuration would cause integer overflow. "
<< "NCCL_SOCKET_NTHREADS=" << nThreads << " * NCCL_NSOCKS_PERTHREAD=" << nSocksPerThread
<< " exceeds maximum integer value. Please use smaller values.";
return;
}
int totalSockets = nThreads * nSocksPerThread;
INFO(NCCL_LOG_INFO, "Environment configuration found:");
INFO(NCCL_LOG_INFO, " NCCL_SOCKET_NTHREADS=%d", nThreads);
INFO(NCCL_LOG_INFO, " NCCL_NSOCKS_PERTHREAD=%d", nSocksPerThread);
INFO(NCCL_LOG_INFO, " Total sockets=%d", totalSockets);
// Validate total sockets count
if (totalSockets <= 0) {
GTEST_SKIP() << "SKIPPING TEST: Invalid configuration - total sockets must be > 0. "
<< "Current configuration: nThreads=" << nThreads << " * nSocksPerThread=" << nSocksPerThread
<< " = " << totalSockets << ". "
<< "Both NCCL_SOCKET_NTHREADS and NCCL_NSOCKS_PERTHREAD must be positive integers. "
<< "Example: export NCCL_SOCKET_NTHREADS=2 && export NCCL_NSOCKS_PERTHREAD=2";
return;
}
if (totalSockets > MAX_TOTAL_SOCKETS) {
GTEST_SKIP() << "SKIPPING TEST: Total sockets " << totalSockets << " exceeds maximum " << MAX_TOTAL_SOCKETS << ". "
<< "Current configuration: nThreads=" << nThreads << " * nSocksPerThread=" << nSocksPerThread
<< " = " << totalSockets << ". "
<< "Please reduce either NCCL_SOCKET_NTHREADS or NCCL_NSOCKS_PERTHREAD. "
<< "Example: export NCCL_SOCKET_NTHREADS=8 && export NCCL_NSOCKS_PERTHREAD=4";
return;
}
if (totalSockets > NCCL_NET_MAX_REQUESTS) {
GTEST_SKIP() << "SKIPPING TEST: Total sockets " << totalSockets << " exceeds NCCL_NET_MAX_REQUESTS=" << NCCL_NET_MAX_REQUESTS << ". "
<< "Current configuration: nThreads=" << nThreads << " * nSocksPerThread=" << nSocksPerThread
<< " = " << totalSockets << ". "
<< "NCCL network layer can handle at most " << NCCL_NET_MAX_REQUESTS << " concurrent requests. "
<< "Please reduce configuration to stay within NCCL limits.";
return;
}
INFO(NCCL_LOG_INFO, "Configuration valid - proceeding with test to exercise "
"ncclNetSocketGetTask");
// Test socket properties
TestSocketProperties();
char handle[NCCL_NET_HANDLE_MAXSIZE];
void *listenComm = nullptr;
ncclResult_t result = ncclNetSocket.listen(0, handle, &listenComm);
ASSERT_EQ(result, ncclSuccess) << "Failed to establish listening socket for test execution. "
<< "ncclNetSocket.listen() returned error code: " << result
<< ". Verify network device availability and port accessibility.";
INFO(NCCL_LOG_INFO, "Testing task creation functionality - ensuring "
"ncclNetSocketGetTask is called");
std::vector<void *> sendComms;
std::vector<void *> recvComms;
// Establish connection
void *sendComm = nullptr;
void *recvComm = nullptr;
bool connectionSuccess =
EstablishConnectionPair(handle, listenComm, sendComm, recvComm);
if (connectionSuccess) {
sendComms.push_back(sendComm);
recvComms.push_back(recvComm);
// Test with buffer sizes that will trigger task subdivision
std::vector<size_t> testSizes = GetTestSizes();
for (size_t testSize : testSizes) {
INFO(NCCL_LOG_INFO,
"\n=== Testing with buffer size: %zu bytes ===", testSize);
INFO(NCCL_LOG_INFO, "This should trigger ncclNetSocketGetTask to create "
"task subdivision");
std::vector<void *> sendMhandles;
std::vector<void *> recvMhandles;
std::vector<void *> sendRequests;
std::vector<void *> recvRequests;
std::vector<std::vector<char>> sendBuffers;
std::vector<std::vector<char>> recvBuffers;
// Setup operations for this test size
bool setupSuccess = SetupOperationsForSize(
sendComm, recvComm, testSize, sendBuffers, recvBuffers, sendMhandles,
recvMhandles, sendRequests, recvRequests, 0xAB);
if (setupSuccess) {
// Progress operations with context about environment variables
ProgressOperations(sendRequests[0], recvRequests[0], testSize,
" (with nSocks > 0 from environment variables)");
} else {
INFO(NCCL_LOG_INFO,
"No operations started - skipping progress testing for size %zu",
testSize);
}
// Deregister memory
DeregisterMemory(sendComm, recvComm, sendMhandles, recvMhandles,
testSize);
INFO(NCCL_LOG_INFO,
"=== Completed testing for buffer size: %zu bytes ===", testSize);
}
INFO(NCCL_LOG_INFO, "\n*** TEST SUCCESS: ncclNetSocketGetTask was "
"successfully exercised! ***");
} else {
INFO(NCCL_LOG_INFO, "No connections established - test passed (network may "
"not be available)");
}
// Cleanup
CleanupCommunicators(sendComms, recvComms, listenComm);
INFO(NCCL_LOG_INFO,
"TestConcurrentOperationsTaskCreation completed successfully");
}
};
// Test concurrent operations task creation in default configuration (without
@@ -709,166 +872,19 @@ TEST_F(NetSocketTests, TestConcurrentOperationsTaskCreationDefault) {
// Test multiple concurrent operations to stress test task creation
TEST_F(NetSocketTests, TestConcurrentOperationsTaskCreation) {
INFO(NCCL_LOG_INFO, "Checking socket configuration environment variables");
ProcessIsolatedTestRunner::ExecutionOptions options;
options.stopOnFirstFailure = false; // Continue running all tests
options.verboseLogging = true;
// Check if the required environment variables are set
const char *nThreadsEnv = getenv("NCCL_SOCKET_NTHREADS");
const char *nSocksPerThreadEnv = getenv("NCCL_NSOCKS_PERTHREAD");
if (!nThreadsEnv || !nSocksPerThreadEnv) {
GTEST_SKIP() << "SKIPPING TEST: Required environment variables not set. "
<< "Please set the following environment variables to run this test: "
<< "export NCCL_SOCKET_NTHREADS=1 and export NCCL_NSOCKS_PERTHREAD=2. "
<< "This ensures nSocks > 0 so that ncclNetSocketGetTask gets called. "
<< "Environment variables NCCL_SOCKET_NTHREADS and NCCL_NSOCKS_PERTHREAD must be set";
return;
}
int nThreads = ParseEnvVar(nThreadsEnv, "NCCL_SOCKET_NTHREADS", 0, 1);
int nSocksPerThread = ParseEnvVar(nSocksPerThreadEnv, "NCCL_NSOCKS_PERTHREAD", 0, 1);
// Additional validation for reasonable upper bounds
const int MAX_THREADS = 16;
const int MAX_SOCKS_PER_THREAD = 64;
const int MAX_TOTAL_SOCKETS = 64;
if (nThreads > MAX_THREADS) {
GTEST_SKIP() << "SKIPPING TEST: NCCL_SOCKET_NTHREADS=" << nThreads << " exceeds maximum " << MAX_THREADS << ". "
<< "Please provide a reasonable value (e.g., NCCL_SOCKET_NTHREADS=8). "
<< "Values too large may cause resource exhaustion.";
return;
}
if (nSocksPerThread > MAX_SOCKS_PER_THREAD) {
GTEST_SKIP() << "SKIPPING TEST: NCCL_NSOCKS_PERTHREAD=" << nSocksPerThread << " exceeds maximum " << MAX_SOCKS_PER_THREAD << ". "
<< "Please provide a reasonable value (e.g., NCCL_NSOCKS_PERTHREAD=4). "
<< "Values too large may cause resource exhaustion.";
return;
}
// Check for potential overflow before multiplication
if (nThreads > 0 && nSocksPerThread > INT_MAX / nThreads) {
GTEST_SKIP() << "SKIPPING TEST: Configuration would cause integer overflow. "
<< "NCCL_SOCKET_NTHREADS=" << nThreads << " * NCCL_NSOCKS_PERTHREAD=" << nSocksPerThread
<< " exceeds maximum integer value. Please use smaller values.";
return;
}
int totalSockets = nThreads * nSocksPerThread;
INFO(NCCL_LOG_INFO, "Environment configuration found:");
INFO(NCCL_LOG_INFO, " NCCL_SOCKET_NTHREADS=%d", nThreads);
INFO(NCCL_LOG_INFO, " NCCL_NSOCKS_PERTHREAD=%d", nSocksPerThread);
INFO(NCCL_LOG_INFO, " Total sockets=%d", totalSockets);
// Validate total sockets count
if (totalSockets <= 0) {
GTEST_SKIP() << "SKIPPING TEST: Invalid configuration - total sockets must be > 0. "
<< "Current configuration: nThreads=" << nThreads << " * nSocksPerThread=" << nSocksPerThread
<< " = " << totalSockets << ". "
<< "Both NCCL_SOCKET_NTHREADS and NCCL_NSOCKS_PERTHREAD must be positive integers. "
<< "Example: export NCCL_SOCKET_NTHREADS=2 && export NCCL_NSOCKS_PERTHREAD=2";
return;
}
if (totalSockets > MAX_TOTAL_SOCKETS) {
GTEST_SKIP() << "SKIPPING TEST: Total sockets " << totalSockets << " exceeds maximum " << MAX_TOTAL_SOCKETS << ". "
<< "Current configuration: nThreads=" << nThreads << " * nSocksPerThread=" << nSocksPerThread
<< " = " << totalSockets << ". "
<< "Please reduce either NCCL_SOCKET_NTHREADS or NCCL_NSOCKS_PERTHREAD. "
<< "Example: export NCCL_SOCKET_NTHREADS=8 && export NCCL_NSOCKS_PERTHREAD=4";
return;
}
if (totalSockets > NCCL_NET_MAX_REQUESTS) {
GTEST_SKIP() << "SKIPPING TEST: Total sockets " << totalSockets << " exceeds NCCL_NET_MAX_REQUESTS=" << NCCL_NET_MAX_REQUESTS << ". "
<< "Current configuration: nThreads=" << nThreads << " * nSocksPerThread=" << nSocksPerThread
<< " = " << totalSockets << ". "
<< "NCCL network layer can handle at most " << NCCL_NET_MAX_REQUESTS << " concurrent requests. "
<< "Please reduce configuration to stay within NCCL limits.";
return;
}
INFO(NCCL_LOG_INFO, "Configuration valid - proceeding with test to exercise "
"ncclNetSocketGetTask");
// Test socket properties
TestSocketProperties();
char handle[NCCL_NET_HANDLE_MAXSIZE];
void *listenComm = nullptr;
ncclResult_t result = ncclNetSocket.listen(0, handle, &listenComm);
ASSERT_EQ(result, ncclSuccess) << "Failed to establish listening socket for test execution. "
<< "ncclNetSocket.listen() returned error code: " << result
<< ". Verify network device availability and port accessibility.";
INFO(NCCL_LOG_INFO, "Testing task creation functionality - ensuring "
"ncclNetSocketGetTask is called");
std::vector<void *> sendComms;
std::vector<void *> recvComms;
// Establish connection
void *sendComm = nullptr;
void *recvComm = nullptr;
bool connectionSuccess =
EstablishConnectionPair(handle, listenComm, sendComm, recvComm);
if (connectionSuccess) {
sendComms.push_back(sendComm);
recvComms.push_back(recvComm);
// Test with buffer sizes that will trigger task subdivision
std::vector<size_t> testSizes = GetTestSizes();
for (size_t testSize : testSizes) {
INFO(NCCL_LOG_INFO,
"\n=== Testing with buffer size: %zu bytes ===", testSize);
INFO(NCCL_LOG_INFO, "This should trigger ncclNetSocketGetTask to create "
"task subdivision");
std::vector<void *> sendMhandles;
std::vector<void *> recvMhandles;
std::vector<void *> sendRequests;
std::vector<void *> recvRequests;
std::vector<std::vector<char>> sendBuffers;
std::vector<std::vector<char>> recvBuffers;
// Setup operations for this test size
bool setupSuccess = SetupOperationsForSize(
sendComm, recvComm, testSize, sendBuffers, recvBuffers, sendMhandles,
recvMhandles, sendRequests, recvRequests, 0xAB);
if (setupSuccess) {
// Progress operations with context about environment variables
ProgressOperations(sendRequests[0], recvRequests[0], testSize,
" (with nSocks > 0 from environment variables)");
} else {
INFO(NCCL_LOG_INFO,
"No operations started - skipping progress testing for size %zu",
testSize);
}
// Deregister memory
DeregisterMemory(sendComm, recvComm, sendMhandles, recvMhandles,
testSize);
INFO(NCCL_LOG_INFO,
"=== Completed testing for buffer size: %zu bytes ===", testSize);
}
INFO(NCCL_LOG_INFO, "\n*** TEST SUCCESS: ncclNetSocketGetTask was "
"successfully exercised! ***");
} else {
INFO(NCCL_LOG_INFO, "No connections established - test passed (network may "
"not be available)");
}
// Cleanup
CleanupCommunicators(sendComms, recvComms, listenComm);
INFO(NCCL_LOG_INFO,
"TestConcurrentOperationsTaskCreation completed successfully");
RUN_ISOLATED_TESTS_WITH_OPTIONS(options,
ProcessIsolatedTestRunner::TestConfig(
"TestConcurrentOperationsTaskCreation",
[this]() { RunConcurrentOperationsTaskCreationWithEnvVars(); })
.withEnvironment({{"NCCL_SOCKET_NTHREADS", "1"},
{"NCCL_NSOCKS_PERTHREAD", "2"},
{"NCCL_DEBUG", "TRACE"},
{"NCCL_DEBUG_SUBSYS", "ALL"}})
);
}
// Test for invalid device index in listen function
@@ -1079,158 +1095,239 @@ TEST_F(NetSocketTests, TestNonHostMemoryRegMr) {
// Test for excessive thread configuration warning
TEST_F(NetSocketTests, TestExcessiveThreadConfig) {
INFO(NCCL_LOG_INFO, "Testing excessive thread configuration warning");
ProcessIsolatedTestRunner::ExecutionOptions options;
options.stopOnFirstFailure = false; // Continue running all tests
options.verboseLogging = true;
// Check if the required environment variables are set
const char *nThreadsEnv = getenv("NCCL_SOCKET_NTHREADS");
const char *nSocksPerThreadEnv = getenv("NCCL_NSOCKS_PERTHREAD");
RUN_ISOLATED_TESTS_WITH_OPTIONS(options,
ProcessIsolatedTestRunner::TestConfig(
"TestExcessiveThreadConfig",
[this]() {
INFO(NCCL_LOG_INFO,
"Testing excessive thread configuration warning");
if (!nThreadsEnv || !nSocksPerThreadEnv) {
GTEST_SKIP() << "SKIPPING TEST: Required environment variables not set. "
<< "This test requires NCCL_SOCKET_NTHREADS > NCCL_NET_MAX_REQUESTS (" << NCCL_NET_MAX_REQUESTS << ") and NCCL_NSOCKS_PERTHREAD = 1 to trigger warning. "
<< "Environment variables NCCL_SOCKET_NTHREADS and NCCL_NSOCKS_PERTHREAD must be set";
return;
}
// Check if the required environment variables are set
const char *nThreadsEnv = getenv("NCCL_SOCKET_NTHREADS");
const char *nSocksPerThreadEnv = getenv("NCCL_NSOCKS_PERTHREAD");
// Parse with validation - both must be positive
int nThreads = ParseEnvVar(nThreadsEnv, "NCCL_SOCKET_NTHREADS", 0, 1);
int nSocksPerThread = ParseEnvVar(nSocksPerThreadEnv, "NCCL_NSOCKS_PERTHREAD", 0, 1);
if (!nThreadsEnv || !nSocksPerThreadEnv) {
GTEST_SKIP()
<< "SKIPPING TEST: Required environment variables not set. "
<< "This test requires NCCL_SOCKET_NTHREADS > "
"NCCL_NET_MAX_REQUESTS ("
<< NCCL_NET_MAX_REQUESTS
<< ") and NCCL_NSOCKS_PERTHREAD = 1 to trigger warning. "
<< "Environment variables NCCL_SOCKET_NTHREADS and "
"NCCL_NSOCKS_PERTHREAD must be set";
return;
}
// Check for potential overflow before multiplication
if (nThreads > 0 && nSocksPerThread > INT_MAX / nThreads) {
GTEST_SKIP() << "SKIPPING TEST: Configuration would cause integer overflow. "
<< "NCCL_SOCKET_NTHREADS=" << nThreads << " * NCCL_NSOCKS_PERTHREAD=" << nSocksPerThread
<< " exceeds maximum integer value. Please use smaller values.";
return;
}
// Parse with validation - both must be positive
int nThreads =
ParseEnvVar(nThreadsEnv, "NCCL_SOCKET_NTHREADS", 0, 1);
int nSocksPerThread =
ParseEnvVar(nSocksPerThreadEnv, "NCCL_NSOCKS_PERTHREAD", 0, 1);
int totalSockets = nThreads * nSocksPerThread;
// Check for potential overflow before multiplication
if (nThreads > 0 && nSocksPerThread > INT_MAX / nThreads) {
GTEST_SKIP() << "SKIPPING TEST: Configuration would cause "
"integer overflow. "
<< "NCCL_SOCKET_NTHREADS=" << nThreads
<< " * NCCL_NSOCKS_PERTHREAD=" << nSocksPerThread
<< " exceeds maximum integer value. Please use "
"smaller values.";
return;
}
INFO(NCCL_LOG_INFO, "Environment configuration found:");
INFO(NCCL_LOG_INFO, " NCCL_SOCKET_NTHREADS=%d", nThreads);
INFO(NCCL_LOG_INFO, " NCCL_NSOCKS_PERTHREAD=%d", nSocksPerThread);
INFO(NCCL_LOG_INFO, " Total sockets=%d", totalSockets);
int totalSockets = nThreads * nSocksPerThread;
// Check if configuration is set to trigger the excessive threads warning
// Use NCCL_NET_MAX_REQUESTS instead of arbitrary MAX_THREADS
if (nThreads <= NCCL_NET_MAX_REQUESTS) {
GTEST_SKIP() << "SKIPPING TEST: NCCL_SOCKET_NTHREADS must be > " << NCCL_NET_MAX_REQUESTS << " to test excessive thread warning. "
<< "Current NCCL_SOCKET_NTHREADS=" << nThreads << ". "
<< "Please set: export NCCL_SOCKET_NTHREADS=" << (NCCL_NET_MAX_REQUESTS + 1) << ". "
<< "NCCL_SOCKET_NTHREADS must be > NCCL_NET_MAX_REQUESTS (" << NCCL_NET_MAX_REQUESTS << ") to trigger warning";
return;
}
INFO(NCCL_LOG_INFO, "Environment configuration found:");
INFO(NCCL_LOG_INFO, " NCCL_SOCKET_NTHREADS=%d", nThreads);
INFO(NCCL_LOG_INFO, " NCCL_NSOCKS_PERTHREAD=%d", nSocksPerThread);
INFO(NCCL_LOG_INFO, " Total sockets=%d", totalSockets);
if (totalSockets > NCCL_NET_MAX_REQUESTS * 10) { // Allow 10x for testing excessive config
GTEST_SKIP() << "SKIPPING TEST: Total sockets=" << totalSockets << " is unreasonably large (> " << (NCCL_NET_MAX_REQUESTS * 10) << "). "
<< "Please use more reasonable values for testing. NCCL_NET_MAX_REQUESTS=" << NCCL_NET_MAX_REQUESTS << ". "
<< "Example: export NCCL_SOCKET_NTHREADS=" << (NCCL_NET_MAX_REQUESTS + 1) << " && export NCCL_NSOCKS_PERTHREAD=1";
return;
}
// Check if configuration is set to trigger the excessive threads
// warning Use NCCL_NET_MAX_REQUESTS instead of arbitrary
// MAX_THREADS
if (nThreads <= NCCL_NET_MAX_REQUESTS) {
GTEST_SKIP()
<< "SKIPPING TEST: NCCL_SOCKET_NTHREADS must be > "
<< NCCL_NET_MAX_REQUESTS
<< " to test excessive thread warning. "
<< "Current NCCL_SOCKET_NTHREADS=" << nThreads << ". "
<< "Please set: export NCCL_SOCKET_NTHREADS="
<< (NCCL_NET_MAX_REQUESTS + 1) << ". "
<< "NCCL_SOCKET_NTHREADS must be > NCCL_NET_MAX_REQUESTS ("
<< NCCL_NET_MAX_REQUESTS << ") to trigger warning";
return;
}
INFO(NCCL_LOG_INFO,
"Configuration valid for testing excessive threads warning");
INFO(NCCL_LOG_INFO, "NCCL_SOCKET_NTHREADS=%d > NCCL_NET_MAX_REQUESTS=%d", nThreads, NCCL_NET_MAX_REQUESTS);
if (totalSockets >
NCCL_NET_MAX_REQUESTS *
10) { // Allow 10x for testing excessive config
GTEST_SKIP() << "SKIPPING TEST: Total sockets=" << totalSockets
<< " is unreasonably large (> "
<< (NCCL_NET_MAX_REQUESTS * 10) << "). "
<< "Please use more reasonable values for testing. "
"NCCL_NET_MAX_REQUESTS="
<< NCCL_NET_MAX_REQUESTS << ". "
<< "Example: export NCCL_SOCKET_NTHREADS="
<< (NCCL_NET_MAX_REQUESTS + 1)
<< " && export NCCL_NSOCKS_PERTHREAD=1";
return;
}
// Test socket properties
TestSocketProperties();
INFO(NCCL_LOG_INFO,
"Configuration valid for testing excessive threads warning");
INFO(NCCL_LOG_INFO,
"NCCL_SOCKET_NTHREADS=%d > NCCL_NET_MAX_REQUESTS=%d", nThreads,
NCCL_NET_MAX_REQUESTS);
// Initialize to trigger the warning logic
char handle[NCCL_NET_HANDLE_MAXSIZE];
void *listenComm = nullptr;
ncclResult_t result = ncclNetSocket.listen(0, handle, &listenComm);
// Test socket properties
TestSocketProperties();
if (result == ncclSuccess && listenComm) {
// The implementation should have limited the threads to NCCL_NET_MAX_REQUESTS
// internally
INFO(NCCL_LOG_INFO,
"*** SUCCESS: Listen succeeded with excessive NCCL_SOCKET_NTHREADS - "
"limits enforced internally ***");
ncclNetSocket.closeListen(listenComm);
} else {
INFO(NCCL_LOG_INFO, "Listen failed with result: %d", result);
}
// Initialize to trigger the warning logic
char handle[NCCL_NET_HANDLE_MAXSIZE];
void *listenComm = nullptr;
ncclResult_t result = ncclNetSocket.listen(0, handle, &listenComm);
INFO(NCCL_LOG_INFO, "TestExcessiveThreadConfig completed");
if (result == ncclSuccess && listenComm) {
// The implementation should have limited the threads to
// NCCL_NET_MAX_REQUESTS internally
INFO(NCCL_LOG_INFO, "*** SUCCESS: Listen succeeded with "
"excessive NCCL_SOCKET_NTHREADS - "
"limits enforced internally ***");
ncclNetSocket.closeListen(listenComm);
} else {
INFO(NCCL_LOG_INFO, "Listen failed with result: %d", result);
}
INFO(NCCL_LOG_INFO, "TestExcessiveThreadConfig completed");
})
.withEnvironment({{"NCCL_SOCKET_NTHREADS", "33"},
{"NCCL_NSOCKS_PERTHREAD", "1"},
{"NCCL_DEBUG", "TRACE"},
{"NCCL_DEBUG_SUBSYS", "ALL"}})
);
}
// Test for excessive socket configuration warning
TEST_F(NetSocketTests, TestExcessiveSocketConfig) {
INFO(NCCL_LOG_INFO, "Testing excessive socket configuration warning");
ProcessIsolatedTestRunner::ExecutionOptions options;
options.stopOnFirstFailure = false; // Continue running all tests
options.verboseLogging = true;
// Check if the required environment variables are set
const char *nThreadsEnv = getenv("NCCL_SOCKET_NTHREADS");
const char *nSocksPerThreadEnv = getenv("NCCL_NSOCKS_PERTHREAD");
RUN_ISOLATED_TESTS_WITH_OPTIONS(options,
ProcessIsolatedTestRunner::TestConfig(
"TestExcessiveThreadConfig",
[this]() {
INFO(NCCL_LOG_INFO,
"Testing excessive socket configuration warning");
if (!nThreadsEnv || !nSocksPerThreadEnv) {
GTEST_SKIP() << "SKIPPING TEST: Required environment variables not set. "
<< "This test requires total sockets (nThreads * nSocksPerThread) > MAX_SOCKETS (64). "
<< "Environment variables NCCL_SOCKET_NTHREADS and NCCL_NSOCKS_PERTHREAD must be set";
return;
}
// Check if the required environment variables are set
const char *nThreadsEnv = getenv("NCCL_SOCKET_NTHREADS");
const char *nSocksPerThreadEnv = getenv("NCCL_NSOCKS_PERTHREAD");
// Parse with validation - both must be positive
int nThreads = ParseEnvVar(nThreadsEnv, "NCCL_SOCKET_NTHREADS", 0, 1);
int nSocksPerThread = ParseEnvVar(nSocksPerThreadEnv, "NCCL_NSOCKS_PERTHREAD", 0, 1);
if (!nThreadsEnv || !nSocksPerThreadEnv) {
GTEST_SKIP()
<< "SKIPPING TEST: Required environment variables not set. "
<< "This test requires total sockets (nThreads * "
"nSocksPerThread) > MAX_SOCKETS (64). "
<< "Environment variables NCCL_SOCKET_NTHREADS and "
"NCCL_NSOCKS_PERTHREAD must be set";
return;
}
// Check for potential overflow before multiplication
if (nThreads > 0 && nSocksPerThread > INT_MAX / nThreads) {
GTEST_SKIP() << "SKIPPING TEST: Configuration would cause integer overflow. "
<< "NCCL_SOCKET_NTHREADS=" << nThreads << " * NCCL_NSOCKS_PERTHREAD=" << nSocksPerThread
<< " exceeds maximum integer value. Please use smaller values.";
return;
}
// Parse with validation - both must be positive
int nThreads =
ParseEnvVar(nThreadsEnv, "NCCL_SOCKET_NTHREADS", 0, 1);
int nSocksPerThread =
ParseEnvVar(nSocksPerThreadEnv, "NCCL_NSOCKS_PERTHREAD", 0, 1);
int totalSockets = nThreads * nSocksPerThread;
// Check for potential overflow before multiplication
if (nThreads > 0 && nSocksPerThread > INT_MAX / nThreads) {
GTEST_SKIP() << "SKIPPING TEST: Configuration would cause "
"integer overflow. "
<< "NCCL_SOCKET_NTHREADS=" << nThreads
<< " * NCCL_NSOCKS_PERTHREAD=" << nSocksPerThread
<< " exceeds maximum integer value. Please use "
"smaller values.";
return;
}
INFO(NCCL_LOG_INFO, "Environment configuration found:");
INFO(NCCL_LOG_INFO, " NCCL_SOCKET_NTHREADS=%d", nThreads);
INFO(NCCL_LOG_INFO, " NCCL_NSOCKS_PERTHREAD=%d", nSocksPerThread);
INFO(NCCL_LOG_INFO, " Total sockets=%d", totalSockets);
int totalSockets = nThreads * nSocksPerThread;
// Check if configuration is set to trigger the excessive sockets warning
const int MAX_SOCKETS = 64;
if (totalSockets <= MAX_SOCKETS) {
GTEST_SKIP() << "SKIPPING TEST: Total sockets must be > " << MAX_SOCKETS << " to test excessive socket warning. "
<< "Current total sockets=" << totalSockets
<< " (nThreads=" << nThreads << " * nSocksPerThread=" << nSocksPerThread << "). "
<< "Please set environment variables such that total > " << MAX_SOCKETS << ", e.g.: "
<< "export NCCL_SOCKET_NTHREADS=9 && export NCCL_NSOCKS_PERTHREAD=8. "
<< "Total sockets must be > MAX_SOCKETS (" << MAX_SOCKETS << ") to trigger warning";
return;
}
INFO(NCCL_LOG_INFO, "Environment configuration found:");
INFO(NCCL_LOG_INFO, " NCCL_SOCKET_NTHREADS=%d", nThreads);
INFO(NCCL_LOG_INFO, " NCCL_NSOCKS_PERTHREAD=%d", nSocksPerThread);
INFO(NCCL_LOG_INFO, " Total sockets=%d", totalSockets);
// Additional validation against NCCL_NET_MAX_REQUESTS for reasonable upper bounds
if (totalSockets > NCCL_NET_MAX_REQUESTS * 10) { // Allow 10x for testing excessive config
GTEST_SKIP() << "SKIPPING TEST: Total sockets=" << totalSockets << " is unreasonably large (> " << (NCCL_NET_MAX_REQUESTS * 10) << "). "
<< "Please use more reasonable values for testing. NCCL_NET_MAX_REQUESTS=" << NCCL_NET_MAX_REQUESTS << ". "
<< "Example: export NCCL_SOCKET_NTHREADS=10 && export NCCL_NSOCKS_PERTHREAD=10";
return;
}
// Check if configuration is set to trigger the excessive sockets
// warning
const int MAX_SOCKETS = 64;
if (totalSockets <= MAX_SOCKETS) {
GTEST_SKIP()
<< "SKIPPING TEST: Total sockets must be > " << MAX_SOCKETS
<< " to test excessive socket warning. "
<< "Current total sockets=" << totalSockets
<< " (nThreads=" << nThreads
<< " * nSocksPerThread=" << nSocksPerThread << "). "
<< "Please set environment variables such that total > "
<< MAX_SOCKETS << ", e.g.: "
<< "export NCCL_SOCKET_NTHREADS=9 && export "
"NCCL_NSOCKS_PERTHREAD=8. "
<< "Total sockets must be > MAX_SOCKETS (" << MAX_SOCKETS
<< ") to trigger warning";
return;
}
INFO(NCCL_LOG_INFO,
"Configuration valid for testing excessive sockets warning");
INFO(NCCL_LOG_INFO, "Total sockets=%d > MAX_SOCKETS=64", totalSockets);
// Additional validation against NCCL_NET_MAX_REQUESTS for
// reasonable upper bounds
if (totalSockets >
NCCL_NET_MAX_REQUESTS *
10) { // Allow 10x for testing excessive config
GTEST_SKIP() << "SKIPPING TEST: Total sockets=" << totalSockets
<< " is unreasonably large (> "
<< (NCCL_NET_MAX_REQUESTS * 10) << "). "
<< "Please use more reasonable values for testing. "
"NCCL_NET_MAX_REQUESTS="
<< NCCL_NET_MAX_REQUESTS << ". "
<< "Example: export NCCL_SOCKET_NTHREADS=10 && "
"export NCCL_NSOCKS_PERTHREAD=10";
return;
}
// Test socket properties
TestSocketProperties();
INFO(NCCL_LOG_INFO,
"Configuration valid for testing excessive sockets warning");
INFO(NCCL_LOG_INFO, "Total sockets=%d > MAX_SOCKETS=64",
totalSockets);
// Initialize to trigger the warning logic
char handle[NCCL_NET_HANDLE_MAXSIZE];
void *listenComm = nullptr;
ncclResult_t result = ncclNetSocket.listen(0, handle, &listenComm);
// Test socket properties
TestSocketProperties();
if (result == ncclSuccess && listenComm) {
// The implementation should have limited the sockets to MAX_SOCKETS
// internally
INFO(NCCL_LOG_INFO, "*** SUCCESS: Listen succeeded with excessive total "
"sockets - limits enforced internally ***");
ncclNetSocket.closeListen(listenComm);
} else {
INFO(NCCL_LOG_INFO, "Listen failed with result: %d", result);
}
// Initialize to trigger the warning logic
char handle[NCCL_NET_HANDLE_MAXSIZE];
void *listenComm = nullptr;
ncclResult_t result = ncclNetSocket.listen(0, handle, &listenComm);
INFO(NCCL_LOG_INFO, "TestExcessiveSocketConfig completed");
if (result == ncclSuccess && listenComm) {
// The implementation should have limited the sockets to
// MAX_SOCKETS internally
INFO(NCCL_LOG_INFO,
"*** SUCCESS: Listen succeeded with excessive total "
"sockets - limits enforced internally ***");
ncclNetSocket.closeListen(listenComm);
} else {
INFO(NCCL_LOG_INFO, "Listen failed with result: %d", result);
}
INFO(NCCL_LOG_INFO, "TestExcessiveSocketConfig completed");
})
.withEnvironment({{"NCCL_SOCKET_NTHREADS", "10"},
{"NCCL_NSOCKS_PERTHREAD", "10"},
{"NCCL_DEBUG", "TRACE"},
{"NCCL_DEBUG_SUBSYS", "ALL"}})
);
}
// Test to trigger request allocation failure scenario
+351 -299
Просмотреть файл
@@ -3,20 +3,14 @@
*
* See LICENSE.txt for license information
************************************************************************/
#include "gtest/gtest.h"
#include "collectives.h"
#include "comm.h"
#include "gtest/gtest.h"
#include "info.h"
#include "profiler.h"
#include "shmutils.h"
#include "socket.h"
#define ENABLE_TIMER 0
#include "profiler.h"
#include "proxy.h"
#include "timer.h"
#include "transport.h"
#include <assert.h>
#include <poll.h>
#include <sched.h>
@@ -25,409 +19,467 @@
#include <sys/types.h>
#include <unistd.h>
#include "common/ErrCode.hpp"
#include "common/ProcessIsolatedTestRunner.hpp"
#include "profiler.h"
#include "proxy.h"
#include "timer.h"
#include "transport.h"
#define NCCL_MAX_OPS (2048)
#define OP_INDEX(op) ((op) ? (op) - state->pools->elems : -1)
#define OP_SEEN 0x100000
ncclResult_t getOpIndex(struct ncclProxyArgs *op,
struct ncclProxyProgressState *state, int *poolIndex,
int *opIndex);
ncclResult_t dumpProxyState(struct ncclProxyProgressState *state);
ncclResult_t printProxyOp(struct ncclProxyArgs *op, int poolIndex, int opIndex);
ncclResult_t dumpProxyState(struct ncclProxyProgressState *state);
ncclResult_t ncclProxyCallBlockingUDS(struct ncclComm *comm,
struct ncclProxyConnector *proxyConn,
int type, void *reqBuff, int reqSize,
void *respBuff, int respSize, int *reqFd,
int *respFd);
ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm *comm, int proxyRank,
void *handle, int *convertedFd);
ncclResult_t
ncclProxyClientQueryFdBlocking(struct ncclComm *comm,
struct ncclProxyConnector *proxyConn,
int localFd, int *rmtFd);
ncclResult_t getOpIndex(
struct ncclProxyArgs* op, struct ncclProxyProgressState* state, int* poolIndex, int* opIndex
);
ncclResult_t dumpProxyState(struct ncclProxyProgressState* state);
ncclResult_t printProxyOp(struct ncclProxyArgs* op, int poolIndex, int opIndex);
ncclResult_t dumpProxyState(struct ncclProxyProgressState* state);
ncclResult_t ncclProxyCallBlockingUDS(
struct ncclComm* comm,
struct ncclProxyConnector* proxyConn,
int type,
void* reqBuff,
int reqSize,
void* respBuff,
int respSize,
int* reqFd,
int* respFd
);
ncclResult_t ncclProxyClientGetFdBlocking(
struct ncclComm* comm, int proxyRank, void* handle, int* convertedFd
);
ncclResult_t ncclProxyClientQueryFdBlocking(
struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int localFd, int* rmtFd
);
void ncclDumpProxyState(int signal);
#define PROXYARGS_ALLOCATE_SIZE NCCL_MAX_OPS
struct ncclProxyPool {
struct ncclProxyPool *next;
struct ncclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE];
struct ncclProxyPool
{
struct ncclProxyPool* next;
struct ncclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE];
};
void init_ncclProxyArgs_struct(ncclProxyArgs *pool_ptr) {
// init pool_ptr
pool_ptr->send = 2;
pool_ptr->nextRank = 4;
pool_ptr->prevRank = 5;
pool_ptr->pattern = ncclPatternRing;
pool_ptr->nsubs = 1;
pool_ptr->state = ncclProxyOpNone;
pool_ptr->retry_total = 2;
void init_ncclProxyArgs_struct(ncclProxyArgs* pool_ptr)
{
// init pool_ptr
pool_ptr->send = 2;
pool_ptr->nextRank = 4;
pool_ptr->prevRank = 5;
pool_ptr->pattern = ncclPatternRing;
pool_ptr->nsubs = 1;
pool_ptr->state = ncclProxyOpNone;
pool_ptr->retry_total = 2;
}
namespace RcclUnitTesting {
TEST(ProxyTests,
getOpIndex) { // Tests what is the index of the pool being passed within
// the known valid pools in state ptr
INFO(NCCL_LOG_INFO, "[ProxyTests] Test Start \n");
// Init Dummy structs
struct ncclProxyArgs *pool_ptr = new ncclProxyArgs;
struct ncclProxyPool *pools_ptr = new ncclProxyPool;
struct ncclProxyPool *pools2_ptr = new ncclProxyPool;
struct ncclProxyProgressState *state_ptr = new ncclProxyProgressState;
namespace RcclUnitTesting
{
TEST(ProxyTests, getOpIndex)
{ // Tests what is the index of the pool being passed within
// the known valid pools in state ptr
INFO("[ProxyTests] Test Start \n");
// state_ptr = &state;
state_ptr->active = &pools_ptr->elems[1]; // chk
state_ptr->pool = pool_ptr;
state_ptr->pools = pools_ptr;
// Init Dummy structs
struct ncclProxyArgs* pool_ptr = new ncclProxyArgs;
struct ncclProxyPool* pools_ptr = new ncclProxyPool;
struct ncclProxyPool* pools2_ptr = new ncclProxyPool;
struct ncclProxyProgressState* state_ptr = new ncclProxyProgressState;
pools_ptr->next = pools2_ptr;
// state_ptr = &state;
state_ptr->active = &pools_ptr->elems[1]; // chk
state_ptr->pool = pool_ptr;
state_ptr->pools = pools_ptr;
struct ncclProxyArgs *x =
&pools_ptr->elems[5]; // Passing the 5th element of the pool
struct ncclProxyProgressState *y = state_ptr;
y->pools->next = y->pools; // next points to self
pools_ptr->next = pools2_ptr;
INFO(NCCL_LOG_INFO, "[ProxyTests] x=%u y->pools=%u x-y=%u \n", x,
y->pools->elems, x - y->pools->elems);
struct ncclProxyArgs* x = &pools_ptr->elems[5]; // Passing the 5th element of the pool
struct ncclProxyProgressState* y = state_ptr;
y->pools->next = y->pools; // next points to self
int pool_idx, opIndex;
ncclResult_t res = getOpIndex(x, y, &pool_idx, &opIndex);
INFO(
"[ProxyTests] x=%p y->pools=%p x-y=%ld \n",
(void*)x,
(void*)y->pools->elems,
x - y->pools->elems
);
ASSERT_EQ(pool_idx, 0);
ASSERT_EQ(opIndex, 5);
int pool_idx, opIndex;
ncclResult_t res = getOpIndex(x, y, &pool_idx, &opIndex);
INFO(NCCL_LOG_INFO, "[ProxyTests] pool_idx %d opIndex %d \n", pool_idx,
opIndex);
INFO(NCCL_LOG_INFO, "[ProxyTests] res %u \n", res);
assert(res == ncclSuccess);
ASSERT_EQ(pool_idx, 0);
ASSERT_EQ(opIndex, 5);
INFO("[ProxyTests] pool_idx %d opIndex %d \n", pool_idx, opIndex);
INFO("[ProxyTests] res %u \n", res);
assert(res == ncclSuccess);
delete pool_ptr;
delete pools_ptr;
delete pools2_ptr;
delete state_ptr;
INFO(NCCL_LOG_INFO, "[ProxyTests] Test Complete \n");
INFO("[ProxyTests] Test Complete \n");
}
TEST(ProxyTests, printProxyOp) {
INFO(NCCL_LOG_INFO, "[ProxyTests] Test Start \n");
// Init Dummy structs
TEST(ProxyTests, printProxyOp)
{
INFO("[ProxyTests] Test Start \n");
// Init Dummy structs
struct ncclProxyArgs *pool_ptr = new ncclProxyArgs;
struct ncclProxyArgs* pool_ptr = new ncclProxyArgs;
struct ncclProxyPool *pools_ptr = new ncclProxyPool;
struct ncclProxyPool *pools2_ptr = new ncclProxyPool;
struct ncclProxyPool* pools_ptr = new ncclProxyPool;
struct ncclProxyPool* pools2_ptr = new ncclProxyPool;
struct ncclProxyProgressState *state_ptr = new ncclProxyProgressState;
struct ncclProxyProgressState* state_ptr = new ncclProxyProgressState;
// state_ptr = &state;
state_ptr->active = &pools_ptr->elems[1]; // chk
state_ptr->pool = pool_ptr;
state_ptr->pools = pools_ptr;
// state_ptr = &state;
state_ptr->active = &pools_ptr->elems[1]; // chk
state_ptr->pool = pool_ptr;
state_ptr->pools = pools_ptr;
pools_ptr->next = pools2_ptr;
pools_ptr->next = pools2_ptr;
struct ncclProxyArgs *x = &pools_ptr->elems[5];
struct ncclProxyProgressState *y = state_ptr;
y->pools->next = y->pools; // next points to self
struct ncclProxyArgs* x = &pools_ptr->elems[5];
struct ncclProxyProgressState* y = state_ptr;
y->pools->next = y->pools; // next points to self
INFO(NCCL_LOG_INFO, "[ProxyTests] x=%u y->pools=%u x-y=%u \n", x,
y->pools->elems, x - y->pools->elems);
INFO(
"[ProxyTests] x=%p y->pools=%p x-y=%ld \n",
(void*)x,
(void*)y->pools->elems,
x - y->pools->elems
);
init_ncclProxyArgs_struct(pool_ptr);
init_ncclProxyArgs_struct(pool_ptr);
int pool_idx = 2, opIndex = 3; // random vals
ncclResult_t res = printProxyOp(pool_ptr, pool_idx, opIndex);
int pool_idx = 2, opIndex = 3; // random vals
ncclResult_t res = printProxyOp(pool_ptr, pool_idx, opIndex);
INFO(NCCL_LOG_INFO, "[ProxyTests] res %u \n", res);
assert(res == ncclSuccess);
INFO("[ProxyTests] res %u \n", res);
assert(res == ncclSuccess);
delete pools_ptr;
delete pools2_ptr;
delete pool_ptr;
delete state_ptr;
INFO(NCCL_LOG_INFO, "[ProxyTests] Test Complete \n");
INFO("[ProxyTests] Test Complete \n");
}
TEST(ProxyTests, dumpProxyState) {
INFO(NCCL_LOG_INFO, "[ProxyTests] Test Start \n");
TEST(ProxyTests, dumpProxyState)
{
INFO("[ProxyTests] Test Start \n");
// Init Dummy structs
struct ncclProxyArgs *pool_ptr;
struct ncclProxyPool *pools_ptr = new ncclProxyPool;
struct ncclProxyPool *pools2_ptr = new ncclProxyPool;
// Init Dummy structs
struct ncclProxyArgs* pool_ptr;
struct ncclProxyPool* pools_ptr = new ncclProxyPool;
struct ncclProxyPool* pools2_ptr = new ncclProxyPool;
struct ncclProxyProgressState *state_ptr = new ncclProxyProgressState;
struct ncclProxyProgressState* state_ptr = new ncclProxyProgressState;
state_ptr->active = &pools_ptr->elems[1];
pool_ptr = &pools_ptr->elems[4];
pool_ptr->next = NULL;
pool_ptr->nextPeer = NULL;
state_ptr->active = &pools_ptr->elems[1];
pool_ptr = &pools_ptr->elems[4];
pool_ptr->next = NULL;
pool_ptr->nextPeer = NULL;
state_ptr->pool = pool_ptr;
state_ptr->pool->next = NULL;
state_ptr->pool->nextPeer = NULL;
state_ptr->pool->state = OP_SEEN;
state_ptr->pools = pools_ptr;
state_ptr->pools->next = NULL;
state_ptr->pool = pool_ptr;
state_ptr->pool->next = NULL;
state_ptr->pool->nextPeer = NULL;
state_ptr->pool->state = OP_SEEN;
state_ptr->pools = pools_ptr;
state_ptr->pools->next = NULL;
struct ncclProxyArgs *op = state_ptr->active;
op->state = OP_SEEN;
op->nextPeer = NULL;
op->next = NULL;
struct ncclProxyArgs* op = state_ptr->active;
op->state = OP_SEEN;
op->nextPeer = NULL;
op->next = NULL;
pools_ptr->next = NULL;
pools_ptr->next = NULL;
init_ncclProxyArgs_struct(pool_ptr);
init_ncclProxyArgs_struct(pool_ptr);
int pool_idx = 2, opIndex = 3; // random vals
ncclResult_t res = dumpProxyState(state_ptr);
int pool_idx = 2, opIndex = 3; // random vals
ncclResult_t res = dumpProxyState(state_ptr);
INFO(NCCL_LOG_INFO, "[ProxyTests] res %u \n", res);
ASSERT_EQ(res, ncclSuccess);
INFO("[ProxyTests] res %u \n", res);
ASSERT_EQ(res, ncclSuccess);
delete pools_ptr;
delete pools2_ptr;
delete state_ptr;
INFO(NCCL_LOG_INFO, "[ProxyTests] Test Complete \n");
INFO("[ProxyTests] Test Complete \n");
}
TEST(ProxyTests, ncclProxyCallBlockingUDS) {
INFO(NCCL_LOG_INFO, "[ProxyTests] Test Start \n");
TEST(ProxyTests, ncclProxyCallBlockingUDS)
{
INFO("[ProxyTests] Test Start \n");
// Init Dummy structs
struct ncclComm *comm = new ncclComm;
int *arr = new int[100];
for (int i = 0; i < 100; i++) {
arr[i] = i;
}
// Init Dummy structs
struct ncclComm* comm = new ncclComm;
int* arr = new int[100];
for(int i = 0; i < 100; i++)
{
arr[i] = i;
}
comm->topParentLocalRanks = arr;
comm->localRank = 10;
comm->topParentLocalRanks = arr;
comm->localRank = 10;
int *arr_x = new int[20];
for (int i = 0; i < 20; i++) {
arr_x[i] = i;
}
comm->topParentRanks = arr_x;
int* arr_x = new int[20];
for(int i = 0; i < 20; i++)
{
arr_x[i] = i;
}
comm->topParentRanks = arr_x;
struct ncclProxyState *sharedProxyState = new ncclProxyState;
uint64_t *arr2 = new uint64_t[10];
for (int i = 0; i < 10; i++) {
arr2[i] = 122567 + i; // random
}
struct ncclProxyState* sharedProxyState = new ncclProxyState;
uint64_t* arr2 = new uint64_t[10];
for(int i = 0; i < 10; i++)
{
arr2[i] = 122567 + i; // random
}
INFO(NCCL_LOG_INFO, "[ProxyTests] sizeof(ncclProxyConnector) = %u\n",
sizeof(ncclProxyConnector));
struct ncclProxyConnector *proxyConn =
new (std::nothrow) ncclProxyConnector[20];
if (proxyConn == nullptr) {
// Handle allocation failure
INFO(NCCL_LOG_INFO, "[ProxyTests] Allocation failed\n");
ASSERT_NE(proxyConn, nullptr);
}
INFO("[ProxyTests] sizeof(ncclProxyConnector) = %zu\n", sizeof(ncclProxyConnector));
struct ncclProxyConnector* proxyConn = new(std::nothrow) ncclProxyConnector[20];
if(proxyConn == nullptr)
{
// Handle allocation failure
INFO("[ProxyTests] Allocation failed\n");
ASSERT_NE(proxyConn, nullptr);
}
proxyConn->tpRank = 2;
proxyConn->tpRank = 2;
comm->proxyState = sharedProxyState;
comm->proxyState = sharedProxyState;
comm->proxyState->peerAddressesUDS = arr2;
comm->proxyState->peerAddressesUDS = arr2;
comm->abortFlag = NULL;
comm->abortFlag = NULL;
int rank = comm->topParentLocalRanks[comm->localRank];
INFO(NCCL_LOG_INFO, "[ProxyTests] rank %d\n", rank);
uint64_t pidHash = sharedProxyState->peerAddressesUDS[proxyConn->tpRank];
INFO(NCCL_LOG_INFO, "[ProxyTests] pidHash %u \n", pidHash);
int rank = comm->topParentLocalRanks[comm->localRank];
INFO("[ProxyTests] rank %d\n", rank);
uint64_t pidHash = sharedProxyState->peerAddressesUDS[proxyConn->tpRank];
INFO("[ProxyTests] pidHash %lu \n", pidHash);
int type = ncclProxyMsgGetFd;
// some memory on stack for storing request and response buffers
uint64_t *x_mem = new uint64_t[10];
uint64_t *x_mem2 = new uint64_t[10];
void *reqBuff = (void *)x_mem;
int reqSize = sizeof(uint64_t) * 5;
void *respBuff = NULL;
int respSize = 0;
int *reqFd = NULL;
int *respFd = (int *)x_mem2;
int type = ncclProxyMsgGetFd;
// some memory on stack for storing request and response buffers
uint64_t* x_mem = new uint64_t[10];
uint64_t* x_mem2 = new uint64_t[10];
void* reqBuff = (void*)x_mem;
int reqSize = sizeof(uint64_t) * 5;
void* respBuff = NULL;
int respSize = 0;
int* reqFd = NULL;
int* respFd = (int*)x_mem2;
ncclResult_t res =
ncclProxyCallBlockingUDS(comm, proxyConn, type, reqBuff, reqSize,
respBuff, respSize, reqFd, respFd);
ncclResult_t res = ncclProxyCallBlockingUDS(
comm,
proxyConn,
type,
reqBuff,
reqSize,
respBuff,
respSize,
reqFd,
respFd
);
bool bool_res = (res >= ncclSuccess && res <= ncclRemoteError);
INFO(NCCL_LOG_INFO, "[ProxyTests] res %u \n", bool_res);
ASSERT_EQ(bool_res, true);
bool bool_res = (res >= ncclSuccess && res <= ncclRemoteError);
INFO("[ProxyTests] res %u \n", bool_res);
ASSERT_EQ(bool_res, true);
delete comm;
delete sharedProxyState;
delete proxyConn;
delete[] proxyConn;
delete[] arr_x;
delete[] arr;
delete[] arr2;
delete[] x_mem;
delete[] x_mem2;
INFO(NCCL_LOG_INFO, "[ProxyTests] Test Complete \n");
INFO("[ProxyTests] Test Complete \n");
}
TEST(ProxyTests, ncclProxyClientGetFdBlocking) {
INFO(NCCL_LOG_INFO, "[ProxyTests] Test Start \n");
TEST(ProxyTests, ncclProxyClientGetFdBlocking)
{
RUN_ISOLATED_TEST(
"ncclProxyClientGetFdBlocking",
[]()
{
INFO("[ProxyTests] Test Start \n");
// Init Dummy structs
struct ncclComm *comm = new ncclComm;
int *arr = new int[100];
for (int i = 0; i < 100; i++) {
arr[i] = i;
}
// Init Dummy structs
struct ncclComm* comm = new ncclComm;
int* arr = new int[100];
for(int i = 0; i < 100; i++)
{
arr[i] = i;
}
comm->topParentLocalRanks = arr;
comm->localRank = 10;
struct ncclProxyState *sharedProxyState = new ncclProxyState;
comm->topParentLocalRanks = arr;
comm->localRank = 10;
struct ncclProxyState* sharedProxyState = new ncclProxyState;
int *arr_x = new int[20];
for (int i = 0; i < 20; i++) {
arr_x[i] = i;
}
comm->topParentRanks = arr_x;
int* arr_x = new int[20];
for(int i = 0; i < 20; i++)
{
arr_x[i] = i;
}
comm->topParentRanks = arr_x;
uint64_t *arr2 = new uint64_t[10];
for (int i = 0; i < 10; i++) {
arr2[i] = 122567 + i; // random
}
uint64_t* arr2 = new uint64_t[10];
for(int i = 0; i < 10; i++)
{
arr2[i] = 122567 + i; // random
}
struct ncclProxyConnector *proxyConn =
new (std::nothrow) ncclProxyConnector[20];
if (proxyConn == nullptr) {
// Handle allocation failure
INFO(NCCL_LOG_INFO, "[ProxyTests] Allocation failed\n");
ASSERT_NE(proxyConn, nullptr);
}
struct ncclProxyConnector* proxyConn = new(std::nothrow) ncclProxyConnector[20];
if(proxyConn == nullptr)
{
// Handle allocation failure
INFO("[ProxyTests] Allocation failed\n");
ASSERT_NE(proxyConn, nullptr);
}
proxyConn->tpRank = 2;
comm->proxyState = sharedProxyState;
comm->proxyState->peerAddressesUDS = arr2;
comm->abortFlag = NULL;
proxyConn->tpRank = 2;
comm->proxyState = sharedProxyState;
comm->proxyState->peerAddressesUDS = arr2;
comm->abortFlag = NULL;
int rank = comm->topParentLocalRanks[comm->localRank];
INFO(NCCL_LOG_INFO, "[ProxyTests] rank %d\n", rank);
uint64_t pidHash = sharedProxyState->peerAddressesUDS[proxyConn->tpRank];
INFO(NCCL_LOG_INFO, "[ProxyTests] pidHash %u \n", pidHash);
int rank = comm->topParentLocalRanks[comm->localRank];
INFO("[ProxyTests] rank %d\n", rank);
uint64_t pidHash = sharedProxyState->peerAddressesUDS[proxyConn->tpRank];
INFO("[ProxyTests] pidHash %lu \n", pidHash);
int type = ncclProxyMsgGetFd;
// some memory on stack for storing request and response buffers
uint64_t *x_mem = new uint64_t[10];
uint64_t *x_mem2 = new uint64_t[10];
void *reqBuff = (void *)x_mem;
int reqSize = sizeof(uint64_t) * 5;
void *respBuff = NULL;
int respSize = 0;
int *reqFd = NULL;
int *respFd = (int *)x_mem2;
int type = ncclProxyMsgGetFd;
// some memory on stack for storing request and response buffers
uint64_t* x_mem = new uint64_t[10];
uint64_t* x_mem2 = new uint64_t[10];
void* reqBuff = (void*)x_mem;
int reqSize = sizeof(uint64_t) * 5;
void* respBuff = NULL;
int respSize = 0;
int* reqFd = NULL;
int* respFd = (int*)x_mem2;
comm->gproxyConn = proxyConn;
comm->gproxyConn[rank].initialized = true;
comm->gproxyConn = proxyConn;
comm->gproxyConn[rank].initialized = true;
ncclResult_t res = ncclProxyClientGetFdBlocking(comm, rank, reqBuff, respFd);
ncclResult_t res = ncclProxyClientGetFdBlocking(comm, rank, reqBuff, respFd);
bool bool_res = (res >= ncclSuccess && res <= ncclRemoteError);
INFO(NCCL_LOG_INFO, "[ProxyTests] res %u \n", bool_res);
ASSERT_EQ(bool_res, true);
bool bool_res = (res >= ncclSuccess && res <= ncclRemoteError);
INFO("[ProxyTests] res %u \n", bool_res);
ASSERT_EQ(bool_res, true);
delete comm;
delete sharedProxyState;
delete proxyConn;
delete[] arr_x;
delete[] arr;
delete[] arr2;
delete[] x_mem;
delete[] x_mem2;
INFO(NCCL_LOG_INFO, "[ProxyTests] Test Complete \n");
delete comm;
delete sharedProxyState;
delete[] proxyConn;
delete[] arr_x;
delete[] arr;
delete[] arr2;
delete[] x_mem;
delete[] x_mem2;
INFO("[ProxyTests] Test Complete \n");
INFO("Test 'ncclProxyClientGetFdBlocking' PASSED\n");
}
);
}
TEST(ProxyTests, ncclProxyClientQueryFdBlocking) {
INFO(NCCL_LOG_INFO, "[ProxyTests] Test Start \n");
TEST(ProxyTests, ncclProxyClientQueryFdBlocking)
{
RUN_ISOLATED_TEST(
"ncclProxyClientQueryFdBlocking",
[]()
{
INFO("[ProxyTests] Test Start \n");
// Init Dummy structs
struct ncclComm *comm = new ncclComm;
int *arr = new int[100];
for (int i = 0; i < 5; i++) {
arr[i] = i;
}
// Init Dummy structs
struct ncclComm* comm = new ncclComm;
int* arr = new int[100];
for(int i = 0; i < 5; i++)
{
arr[i] = i;
}
comm->topParentLocalRanks = arr;
comm->localRank = 0;
comm->topParentLocalRanks = arr;
comm->localRank = 0;
int *arr_x = new int[20];
for (int i = 0; i < 20; i++) {
arr_x[i] = i;
}
comm->topParentRanks = arr_x;
int* arr_x = new int[20];
for(int i = 0; i < 20; i++)
{
arr_x[i] = i;
}
comm->topParentRanks = arr_x;
struct ncclProxyState *sharedProxyState = new ncclProxyState;
struct ncclProxyState* sharedProxyState = new ncclProxyState;
uint64_t *arr2 = new uint64_t[10];
for (int i = 0; i < 10; i++) {
arr2[i] = 122567 + i; // random
}
uint64_t* arr2 = new uint64_t[10];
for(int i = 0; i < 10; i++)
{
arr2[i] = 122567 + i; // random
}
struct ncclProxyConnector *proxyConn =
new (std::nothrow) ncclProxyConnector[20];
if (proxyConn == nullptr) {
// Handle allocation failure
INFO(NCCL_LOG_INFO, "[ProxyTests] Allocation failed\n");
ASSERT_NE(proxyConn, nullptr);
}
struct ncclProxyConnector* proxyConn = new(std::nothrow) ncclProxyConnector[20];
if(proxyConn == nullptr)
{
// Handle allocation failure
INFO("[ProxyTests] Allocation failed\n");
ASSERT_NE(proxyConn, nullptr);
}
proxyConn->tpRank = 2;
proxyConn->tpRank = 2;
comm->proxyState = sharedProxyState;
comm->proxyState = sharedProxyState;
comm->proxyState->peerAddressesUDS = arr2;
comm->proxyState->peerAddressesUDS = arr2;
comm->abortFlag = NULL;
comm->abortFlag = NULL;
int rank = comm->topParentLocalRanks[comm->localRank];
INFO(NCCL_LOG_INFO, "[ProxyTests] rank %d\n", rank);
uint64_t pidHash = sharedProxyState->peerAddressesUDS[proxyConn->tpRank];
INFO(NCCL_LOG_INFO, "[ProxyTests] pidHash %u \n", pidHash);
int rank = comm->topParentLocalRanks[comm->localRank];
INFO("[ProxyTests] rank %d\n", rank);
uint64_t pidHash = sharedProxyState->peerAddressesUDS[proxyConn->tpRank];
INFO("[ProxyTests] pidHash %lu \n", pidHash);
int type = ncclProxyMsgGetFd;
// some memory on stack for storing request and response buffers
uint64_t *x_mem = new uint64_t[10];
uint64_t *x_mem2 = new uint64_t[10];
void *reqBuff = (void *)x_mem;
int reqSize = sizeof(uint64_t) * 5;
void *respBuff = NULL;
int respSize = 0;
int *reqFd = NULL;
int *respFd = (int *)x_mem2;
int type = ncclProxyMsgGetFd;
// some memory on stack for storing request and response buffers
uint64_t* x_mem = new uint64_t[10];
uint64_t* x_mem2 = new uint64_t[10];
void* reqBuff = (void*)x_mem;
int reqSize = sizeof(uint64_t) * 5;
void* respBuff = NULL;
int respSize = 0;
int* reqFd = NULL;
int* respFd = (int*)x_mem2;
comm->gproxyConn = proxyConn;
comm->gproxyConn[rank].initialized = true;
comm->gproxyConn = proxyConn;
comm->gproxyConn[rank].initialized = true;
int localFd = 0;
int dummy_int = 20;
respBuff = &dummy_int;
ncclResult_t res =
ncclProxyClientQueryFdBlocking(comm, proxyConn, localFd, (int *)respBuff);
int localFd = 0;
int dummy_int = 20;
respBuff = &dummy_int;
ncclResult_t res
= ncclProxyClientQueryFdBlocking(comm, proxyConn, localFd, (int*)respBuff);
bool bool_res = (res >= ncclSuccess && res <= ncclRemoteError);
INFO(NCCL_LOG_INFO, "[ProxyTests] res %u \n", bool_res);
ASSERT_EQ(bool_res, true);
bool bool_res = (res >= ncclSuccess && res <= ncclRemoteError);
INFO("[ProxyTests] res %u \n", bool_res);
ASSERT_EQ(bool_res, true);
delete comm;
delete sharedProxyState;
delete proxyConn;
delete[] arr_x;
delete[] arr;
delete[] arr2;
delete[] x_mem;
delete[] x_mem2;
INFO(NCCL_LOG_INFO, "[ProxyTests] Test Complete \n");
delete comm;
delete sharedProxyState;
delete[] proxyConn;
delete[] arr_x;
delete[] arr;
delete[] arr2;
delete[] x_mem;
delete[] x_mem2;
INFO("[ProxyTests] Test Complete \n");
INFO("Test 'ncclProxyClientQueryFdBlocking' PASSED\n");
}
);
}
} // namespace RcclUnitTesting
+1 -1
Просмотреть файл
@@ -14,7 +14,7 @@ The RCCL test suite provides following frameworks along with the existing rccl-U
## Testing Frameworks
Following are two new complementary testing frameworks for different testing needs:
Following is a new testing framework for running single node & single process test in isolation:
### 1. Process Isolated Test Runner
Run tests in isolated processes with clean environment settings.
Разница между файлами не показана из-за своего большого размера Загрузить разницу
+696
Просмотреть файл
@@ -0,0 +1,696 @@
/*************************************************************************
* Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "ProcessIsolatedTestRunner.hpp"
#include <errno.h>
#include <fcntl.h>
#include <gtest/gtest.h>
#include <unistd.h>
#include <atomic>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <thread>
#include "ErrCode.hpp"
namespace RcclUnitTesting
{
// Exit codes for test process results
enum RcclTestCode
{
RCCL_TEST_INVALID = -1,
RCCL_TEST_SUCCESS = 0,
RCCL_TEST_FAILURE = 1,
RCCL_TEST_UNKNOWN_EXCEPTION = 2,
RCCL_TEST_TIMEOUT = 3,
RCCL_TEST_SKIPPED = 4
};
// Define static members
std::mutex ProcessIsolatedTestRunner::testConfigsMutex_;
std::vector<ProcessIsolatedTestRunner::TestConfig> ProcessIsolatedTestRunner::testConfigs_;
std::mutex ProcessIsolatedTestRunner::resultsMutex_;
std::vector<ProcessIsolatedTestRunner::TestResult> ProcessIsolatedTestRunner::testResults_;
// TestResult implementation
ProcessIsolatedTestRunner::TestResult::TestResult()
: passed(false), skipped(false), exitCode(-1), processId(-1), duration(0)
{}
// TestConfig implementation
ProcessIsolatedTestRunner::TestConfig::TestConfig(
const std::string& testName, std::function<void()> logic
)
: name(testName), testLogic(logic), timeout(30), inheritParentEnv(true)
{}
ProcessIsolatedTestRunner::TestConfig& ProcessIsolatedTestRunner::TestConfig::withEnvironment(
const std::unordered_map<std::string, std::string>& env
)
{
environmentVariables = env;
return *this;
}
ProcessIsolatedTestRunner::TestConfig&
ProcessIsolatedTestRunner::TestConfig::withTimeout(std::chrono::seconds timeoutSeconds)
{
timeout = timeoutSeconds;
return *this;
}
ProcessIsolatedTestRunner::TestConfig&
ProcessIsolatedTestRunner::TestConfig::withCleanEnvironment(bool inherit)
{
inheritParentEnv = inherit;
return *this;
}
ProcessIsolatedTestRunner::TestConfig&
ProcessIsolatedTestRunner::TestConfig::clearVariable(const std::string& varName)
{
clearEnvVars.push_back(varName);
return *this;
}
ProcessIsolatedTestRunner::TestConfig& ProcessIsolatedTestRunner::TestConfig::setVariable(
const std::string& name, const std::string& value
)
{
environmentVariables[name] = value;
return *this;
}
// ExecutionOptions implementation
ProcessIsolatedTestRunner::ExecutionOptions::ExecutionOptions()
: stopOnFirstFailure(false), verboseLogging(true)
{}
// Apply environment variables to current process
void ProcessIsolatedTestRunner::applyEnvironmentVariables(const TestConfig& config)
{
// Clear specified environment variables first
for(const auto& varName : config.clearEnvVars)
{
unsetenv(varName.c_str());
}
// If not inheriting parent environment, clear all environment variables
if(!config.inheritParentEnv)
{
// Clear all existing environment variables
if(clearenv() != 0)
{
std::cerr << "Warning: Failed to clear environment variables" << std::endl;
}
// Set only the specified variables
for(const auto& [name, value] : config.environmentVariables)
{
setenv(name.c_str(), value.c_str(), 1);
}
}
else
{
// Just set/override the specified variables
for(const auto& [name, value] : config.environmentVariables)
{
setenv(name.c_str(), value.c_str(), 1);
}
}
}
// Execute a single test in a separate process
int ProcessIsolatedTestRunner::runTestInProcess(const TestConfig& config)
{
pid_t processId = getpid();
if(config.name.empty())
{
std::cerr << "Error: Test name is empty for process " << processId << std::endl;
return RCCL_TEST_FAILURE;
}
try
{
// Apply environment variables
applyEnvironmentVariables(config);
// Thread-safe test execution with timeout protection
std::atomic<bool> testCompleted{false};
std::exception_ptr testException = nullptr;
bool testPassed = true;
bool testSkipped = false;
// Run test in a separate thread to allow timeout handling
std::thread testThread(
[&]()
{
try
{
// Get initial test state
const ::testing::UnitTest* unitTest = ::testing::UnitTest::GetInstance();
size_t initialFailureCount = unitTest->failed_test_count();
size_t initialSkippedCount = unitTest->skipped_test_count();
// Execute the test logic
config.testLogic();
// Check if any new test failures occurred
size_t finalFailureCount = unitTest->failed_test_count();
size_t finalSkippedCount = unitTest->skipped_test_count();
testPassed = (finalFailureCount == initialFailureCount);
testSkipped = (finalSkippedCount > initialSkippedCount);
testCompleted = true;
}
catch(...)
{
testException = std::current_exception();
testPassed = false;
testCompleted = true;
}
}
);
// Wait for test completion with timeout
auto start = std::chrono::steady_clock::now();
const auto timeout = config.timeout;
while(!testCompleted.load())
{
std::this_thread::sleep_for(std::chrono::milliseconds(100));
if(std::chrono::steady_clock::now() - start > timeout)
{
// Test timed out
INFO(
"Test '%s' TIMED OUT after %ld seconds\n",
config.name.c_str(),
timeout.count()
);
fflush(NULL);
testThread.detach();
return RCCL_TEST_TIMEOUT;
}
}
// Wait for thread completion
if(testThread.joinable())
{
testThread.join();
}
// Check if test threw an exception
if(testException)
{
std::rethrow_exception(testException);
}
// Flush output before returning (needed before _exit())
fflush(NULL);
// Return appropriate exit code based on test result
if(testSkipped)
{
return RCCL_TEST_SKIPPED;
}
else if(testPassed)
{
return RCCL_TEST_SUCCESS;
}
else
{
return RCCL_TEST_FAILURE;
}
}
catch(const std::exception& e)
{
INFO("Test '%s' FAILED with exception: %s\n", config.name.c_str(), e.what());
std::cerr << "Exception in test '" << config.name << "': " << e.what() << std::endl;
fflush(NULL);
return RCCL_TEST_FAILURE;
}
catch(...)
{
INFO("Test '%s' FAILED with unknown exception\n", config.name.c_str());
std::cerr << "Unknown exception in test '" << config.name << "'" << std::endl;
fflush(NULL);
return RCCL_TEST_UNKNOWN_EXCEPTION;
}
}
// Register a test configuration
void ProcessIsolatedTestRunner::registerTest(const TestConfig& config)
{
std::lock_guard<std::mutex> lock(testConfigsMutex_);
testConfigs_.push_back(config);
}
// Register a simple test with just name and logic
void ProcessIsolatedTestRunner::registerTest(
const std::string& name, std::function<void()> testLogic
)
{
registerTest(TestConfig(name, testLogic));
}
// Register a test with environment variables
void ProcessIsolatedTestRunner::registerTest(
const std::string& name,
std::function<void()> testLogic,
const std::unordered_map<std::string, std::string>& env
)
{
registerTest(TestConfig(name, testLogic).withEnvironment(env));
}
// Record test result (thread-safe)
void ProcessIsolatedTestRunner::recordTestResult(const TestResult& result)
{
std::lock_guard<std::mutex> lock(resultsMutex_);
testResults_.push_back(result);
}
// Helper method: Create pipes for capturing process output
bool ProcessIsolatedTestRunner::createOutputPipes(int stdoutPipe[2], int stderrPipe[2])
{
// Create pipes for stdout and stderr
// stdoutPipe[0] = read end, stdoutPipe[1] = write end
if(pipe(stdoutPipe) == -1)
{
std::cerr << "Failed to create stdout pipe: " << strerror(errno) << std::endl;
return false;
}
if(pipe(stderrPipe) == -1)
{
std::cerr << "Failed to create stderr pipe: " << strerror(errno) << std::endl;
close(stdoutPipe[0]);
close(stdoutPipe[1]);
return false;
}
return true;
}
// Helper method: Redirect child process output to pipes
void ProcessIsolatedTestRunner::redirectOutputToPipes(int stdoutPipe[2], int stderrPipe[2])
{
// Close read ends of pipes in child process (not needed)
close(stdoutPipe[0]);
close(stderrPipe[0]);
// Redirect stdout and stderr to write ends of pipes
dup2(stdoutPipe[1], STDOUT_FILENO);
dup2(stderrPipe[1], STDERR_FILENO);
// Close the original write end file descriptors after duplication
// The duplicated descriptors (STDOUT_FILENO, STDERR_FILENO) will be closed by _exit()
close(stdoutPipe[1]);
close(stderrPipe[1]);
}
// Helper method: Capture output from child process pipes
ProcessIsolatedTestRunner::CapturedOutput ProcessIsolatedTestRunner::captureProcessOutput(
int stdoutPipe[2], int stderrPipe[2], pid_t pid, int* status
)
{
// Close write ends of pipes in parent process (not needed)
close(stdoutPipe[1]);
close(stderrPipe[1]);
CapturedOutput output;
char buffer[4096];
ssize_t count;
// Read from stdout pipe
while((count = read(stdoutPipe[0], buffer, sizeof(buffer) - 1)) > 0)
{
buffer[count] = '\0';
output.stdoutContent += buffer;
}
close(stdoutPipe[0]);
// Read from stderr pipe
while((count = read(stderrPipe[0], buffer, sizeof(buffer) - 1)) > 0)
{
buffer[count] = '\0';
output.stderrContent += buffer;
}
close(stderrPipe[0]);
// Wait for child to exit (blocking)
waitpid(pid, status, 0);
return output;
}
// Helper method: Display captured output
void ProcessIsolatedTestRunner::displayCapturedOutput(
const CapturedOutput& output, const std::string& testName
)
{
if(!output.stdoutContent.empty())
{
std::cout << output.stdoutContent;
if(output.stdoutContent.back() != '\n')
std::cout << '\n';
}
if(!output.stderrContent.empty())
{
std::cerr << output.stderrContent;
if(output.stderrContent.back() != '\n')
std::cerr << '\n';
}
}
// Execute all registered tests (simplified sequential execution only)
bool ProcessIsolatedTestRunner::executeAllTests(const ExecutionOptions& options)
{
// Get test configurations to run
std::vector<TestConfig> testsToRun;
{
std::lock_guard<std::mutex> lock(testConfigsMutex_);
testsToRun = testConfigs_;
}
// Clear previous results
{
std::lock_guard<std::mutex> lock(resultsMutex_);
testResults_.clear();
}
// Sequential execution
for(const auto& testConfig : testsToRun)
{
auto startTime = std::chrono::steady_clock::now();
int stdout_fd[2], stderr_fd[2];
if(!createOutputPipes(stdout_fd, stderr_fd))
{
std::cerr << "Failed to create output files for test '" << testConfig.name << "'"
<< std::endl;
continue;
}
pid_t pid = fork();
if(pid == 0)
{
redirectOutputToPipes(stdout_fd, stderr_fd);
int result = runTestInProcess(testConfig);
// Use _exit() instead of exit() to avoid atexit handlers
// This prevents GPU runtime cleanup issues after fork
_exit(result);
}
else if(pid > 0)
{
// Log test start with environment variables if any
if(!testConfig.environmentVariables.empty())
{
std::string envVars;
for(const auto& [name, value] : testConfig.environmentVariables)
{
if(!envVars.empty())
envVars += ", ";
envVars += name + "=" + value;
}
INFO(
"Running isolated test '%s' (PID: %d) with env: %s\n",
testConfig.name.c_str(),
pid,
envVars.c_str()
);
}
else
{
INFO("Running isolated test '%s' (PID: %d)\n", testConfig.name.c_str(), pid);
}
int status;
CapturedOutput output = captureProcessOutput(stdout_fd, stderr_fd, pid, &status);
auto endTime = std::chrono::steady_clock::now();
auto duration
= std::chrono::duration_cast<std::chrono::milliseconds>(endTime - startTime);
TestResult testResult;
testResult.testName = testConfig.name;
testResult.processId = pid;
testResult.duration = duration;
if(WIFEXITED(status))
{
int exitCode = WEXITSTATUS(status);
testResult.exitCode = exitCode;
testResult.passed = (exitCode == RCCL_TEST_SUCCESS);
testResult.skipped = (exitCode == RCCL_TEST_SKIPPED);
if(exitCode == RCCL_TEST_SUCCESS)
{
INFO("Test '%s' PASSED (%ld ms)\n", testConfig.name.c_str(), duration.count());
}
else if(exitCode == RCCL_TEST_TIMEOUT)
{
INFO(
"Test '%s' (PID: %d) TIMED OUT after %ld ms\n",
testConfig.name.c_str(),
pid,
duration.count()
);
testResult.errorMessage = "Test timed out";
}
else if(exitCode == RCCL_TEST_SKIPPED)
{
INFO(
"Test '%s' (PID: %d) SKIPPED in %ld ms\n",
testConfig.name.c_str(),
pid,
duration.count()
);
testResult.errorMessage = "Test skipped";
}
else
{
INFO(
"Test '%s' (PID: %d) FAILED with exit code %d after %ld ms\n",
testConfig.name.c_str(),
pid,
exitCode,
duration.count()
);
testResult.errorMessage
= "Test failed with exit code " + std::to_string(exitCode);
}
}
else if(WIFSIGNALED(status))
{
int signal = WTERMSIG(status);
// Check if test reported success before signal termination
bool testPassed = (output.stdoutContent.find("PASSED") != std::string::npos);
if(testPassed)
{
// Test completed successfully before signal (e.g., GPU runtime cleanup)
testResult.passed = true;
testResult.skipped = false;
testResult.exitCode = RCCL_TEST_SUCCESS;
INFO("Test '%s' PASSED (%ld ms)\n", testConfig.name.c_str(), duration.count());
}
else
{
// Test terminated by signal before completion (crash)
testResult.passed = false;
testResult.skipped = false;
testResult.exitCode = -signal;
testResult.errorMessage = "Terminated by signal " + std::to_string(signal);
INFO(
"Test '%s' (PID: %d) terminated by signal %d after %ld ms\n",
testConfig.name.c_str(),
pid,
signal,
duration.count()
);
}
}
else
{
testResult.passed = false;
testResult.skipped = false;
testResult.exitCode = RCCL_TEST_INVALID;
testResult.errorMessage = "Failed to wait for process";
}
displayCapturedOutput(output, testConfig.name);
recordTestResult(testResult);
// Stop on first failure if requested
if(options.stopOnFirstFailure && !testResult.passed && !testResult.skipped)
{
break;
}
}
else
{
// Fork failed
TestResult testResult;
testResult.testName = testConfig.name;
testResult.passed = false;
testResult.skipped = false;
testResult.exitCode = RCCL_TEST_INVALID;
testResult.processId = RCCL_TEST_INVALID;
testResult.duration = std::chrono::milliseconds(0);
testResult.errorMessage = "Failed to fork process";
recordTestResult(testResult);
INFO("Failed to fork process for test '%s'\n", testConfig.name.c_str());
if(options.stopOnFirstFailure)
{
break;
}
}
}
bool result = generateReport(options);
// Automatically clear test configurations and results after execution
// This ensures a clean state for the next test suite without requiring
// explicit clear() calls from test cases
{
std::lock_guard<std::mutex> lock(testConfigsMutex_);
testConfigs_.clear();
}
{
std::lock_guard<std::mutex> lock(resultsMutex_);
testResults_.clear();
}
return result;
}
// Generate and display test report
bool ProcessIsolatedTestRunner::generateReport(const ExecutionOptions& options)
{
int totalTests = 0;
int passedTests = 0;
int failedTests = 0;
int skippedTests = 0;
std::chrono::milliseconds totalDuration{0};
{
std::lock_guard<std::mutex> lock(resultsMutex_);
totalTests = testResults_.size();
for(const auto& result : testResults_)
{
if(result.skipped)
{
skippedTests++;
}
else if(result.passed)
{
passedTests++;
}
else
{
failedTests++;
}
totalDuration += result.duration;
}
}
// Report summary only if there are failures or multiple tests
if(failedTests > 0 || totalTests > 1)
{
INFO(
"Process-Isolated Tests: %d passed, %d failed, %d skipped (%ld ms total)\n",
passedTests,
failedTests,
skippedTests,
totalDuration.count()
);
if(failedTests > 0)
{
std::lock_guard<std::mutex> lock(resultsMutex_);
for(const auto& result : testResults_)
{
if(!result.passed && !result.skipped)
{
INFO(
" Failed: %s - %s\n",
result.testName.c_str(),
result.errorMessage.c_str()
);
}
}
}
}
return failedTests == 0;
}
// Get detailed test results (thread-safe)
std::vector<ProcessIsolatedTestRunner::TestResult> ProcessIsolatedTestRunner::getTestResults()
{
std::lock_guard<std::mutex> lock(resultsMutex_);
return testResults_;
}
// Clear test registry and results (thread-safe)
void ProcessIsolatedTestRunner::clear()
{
size_t registeredCount = 0;
size_t executedCount = 0;
// Check for unexecuted tests before clearing
{
std::lock_guard<std::mutex> lock(testConfigsMutex_);
registeredCount = testConfigs_.size();
}
{
std::lock_guard<std::mutex> lock(resultsMutex_);
executedCount = testResults_.size();
}
// Warn if tests were registered but not all executed
if(registeredCount > 0 && executedCount < registeredCount)
{
std::cerr << "\n⚠️ WARNING: ProcessIsolatedTestRunner::clear() called with "
<< (registeredCount - executedCount) << " unexecuted test(s)!\n"
<< " Registered: " << registeredCount << " test(s)\n"
<< " Executed: " << executedCount << " test(s)\n"
<< " Did you forget to call executeAllTests()?\n"
<< std::endl;
}
// Clear the registrations and results
{
std::lock_guard<std::mutex> lock(testConfigsMutex_);
testConfigs_.clear();
}
{
std::lock_guard<std::mutex> lock(resultsMutex_);
testResults_.clear();
}
}
// Get number of registered tests
size_t ProcessIsolatedTestRunner::getTestCount()
{
std::lock_guard<std::mutex> lock(testConfigsMutex_);
return testConfigs_.size();
}
} // namespace RcclUnitTesting
+365
Просмотреть файл
@@ -0,0 +1,365 @@
/*************************************************************************
* Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#pragma once
#include <sys/wait.h>
#include <unistd.h>
#include <chrono>
#include <cstdlib>
#include <functional>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>
namespace RcclUnitTesting
{
/**
* @brief Generic thread-safe process isolated test runner
*
* This class provides a framework for running tests in isolated processes
* with clean environment settings and sequential execution.
*
*/
class ProcessIsolatedTestRunner
{
public:
/**
* @brief Test execution result structure
*/
struct TestResult
{
std::string testName; ///< Name of the test
bool passed; ///< Whether the test passed
bool skipped; ///< Whether the test skipped
int exitCode; ///< Process exit code
pid_t processId; ///< Process ID that ran the test
std::chrono::milliseconds duration; ///< Test execution duration
std::string errorMessage; ///< Error message if test failed
std::unordered_map<std::string, std::string> environment; ///< Environment variables used
/**
* @brief Default constructor
*/
TestResult();
};
/**
* @brief Test configuration structure
*/
struct TestConfig
{
std::string name; ///< Test name
std::function<void()> testLogic; ///< Test function to execute
std::unordered_map<std::string, std::string>
environmentVariables; ///< Environment variables to set
std::chrono::seconds timeout; ///< Test timeout
bool inheritParentEnv; ///< Whether to inherit parent environment
std::vector<std::string> clearEnvVars; ///< Environment variables to explicitly clear
/**
* @brief Constructor
* @param testName Name of the test
* @param logic Test function to execute
*/
TestConfig(const std::string& testName, std::function<void()> logic);
/**
* @brief Set environment variables for this test
* @param env Map of environment variable name-value pairs
* @return Reference to this TestConfig for method chaining
*/
TestConfig& withEnvironment(const std::unordered_map<std::string, std::string>& env);
/**
* @brief Set timeout for this test
* @param timeoutSeconds Timeout in seconds
* @return Reference to this TestConfig for method chaining
*/
TestConfig& withTimeout(std::chrono::seconds timeoutSeconds);
/**
* @brief Configure environment inheritance
* @param inherit Whether to inherit parent environment variables
* @return Reference to this TestConfig for method chaining
*/
TestConfig& withCleanEnvironment(bool inherit = false);
/**
* @brief Clear a specific environment variable
* @param varName Name of the variable to clear
* @return Reference to this TestConfig for method chaining
*/
TestConfig& clearVariable(const std::string& varName);
/**
* @brief Set a specific environment variable
* @param name Variable name
* @param value Variable value
* @return Reference to this TestConfig for method chaining
*/
TestConfig& setVariable(const std::string& name, const std::string& value);
};
/**
* @brief Execution options for test runner
*/
struct ExecutionOptions
{
bool stopOnFirstFailure; ///< Stop execution on first test failure
bool verboseLogging; ///< Enable verbose logging
/**
* @brief Default constructor with sensible defaults
*/
ExecutionOptions();
};
private:
/**
* @brief Structure to hold captured process output
*/
struct CapturedOutput
{
std::string stdoutContent; ///< Captured stdout content
std::string stderrContent; ///< Captured stderr content
};
// Thread-safe static members for test management
static std::mutex testConfigsMutex_;
static std::vector<TestConfig> testConfigs_;
static std::mutex resultsMutex_;
static std::vector<TestResult> testResults_;
/**
* @brief Apply environment variables to current process
* @param config Test configuration containing environment settings
*/
static void applyEnvironmentVariables(const TestConfig& config);
/**
* @brief Execute a single test in the child process
* @param config Test configuration
* @return Exit code (0 for success, non-zero for failure)
*/
static int runTestInProcess(const TestConfig& config);
/**
* @brief Create pipes for capturing process output
* @param stdoutPipe Array to hold stdout pipe file descriptors [read, write]
* @param stderrPipe Array to hold stderr pipe file descriptors [read, write]
* @return True if pipes were created successfully, false otherwise
*/
static bool createOutputPipes(int stdoutPipe[2], int stderrPipe[2]);
/**
* @brief Redirect child process output to pipes
* @param stdoutPipe Stdout pipe file descriptors [read, write]
* @param stderrPipe Stderr pipe file descriptors [read, write]
*/
static void redirectOutputToPipes(int stdoutPipe[2], int stderrPipe[2]);
/**
* @brief Capture output from child process via pipes
* @param stdoutPipe Stdout pipe file descriptors [read, write]
* @param stderrPipe Stderr pipe file descriptors [read, write]
* @param pid Child process ID to monitor
* @param status Pointer to status variable for waitpid
* @return Captured output from stdout and stderr
*/
static CapturedOutput
captureProcessOutput(int stdoutPipe[2], int stderrPipe[2], pid_t pid, int* status);
/**
* @brief Display captured output with formatted delimiters
* @param output Captured output to display
* @param testName Name of the test for context
*/
static void displayCapturedOutput(const CapturedOutput& output, const std::string& testName);
public:
/**
* @brief Register a test configuration
* @param config Complete test configuration
*/
static void registerTest(const TestConfig& config);
/**
* @brief Register a simple test with just name and logic
* @param name Test name
* @param testLogic Test function to execute
*/
static void registerTest(const std::string& name, std::function<void()> testLogic);
/**
* @brief Register a test with environment variables
* @param name Test name
* @param testLogic Test function to execute
* @param env Environment variables to set for this test
*/
static void registerTest(
const std::string& name,
std::function<void()> testLogic,
const std::unordered_map<std::string, std::string>& env
);
/**
* @brief Record a test result (thread-safe)
* @param result Test result to record
*/
static void recordTestResult(const TestResult& result);
/**
* @brief Execute all registered tests sequentially
* @param options Execution options (defaults to continue on failure)
* @return True if all tests passed, false otherwise
* @note This method automatically clears all test registrations and results
* after execution, ensuring a clean state for the next test suite.
*/
static bool executeAllTests(const ExecutionOptions& options = ExecutionOptions());
/**
* @brief Generate and display test report
* @param options Execution options used for the test run
* @return True if all tests passed, false otherwise
*/
static bool generateReport(const ExecutionOptions& options);
/**
* @brief Get detailed test results (thread-safe)
* @return Vector of all test results
*/
static std::vector<TestResult> getTestResults();
/**
* @brief Clear test registry and results (thread-safe)
* @note Calling this method manually is typically not necessary, as
* executeAllTests() automatically clears registrations after execution.
* This method is primarily useful for advanced use cases or when tests
* are registered but not executed.
*/
static void clear();
/**
* @brief Get number of registered tests
* @return Number of registered tests
*/
static size_t getTestCount();
};
// Macros for Simplified Usage
/**
* @brief Register and execute a single isolated test with minimal boilerplate
*
* Uses variadic macros to automatically handle commas in lambda bodies
*
* @param test_name Name of the test (string)
* @param ... Lambda containing test logic (variadic to handle internal commas)
*
* Example:
* RUN_ISOLATED_TEST("MyTest", []() {
* EXPECT_TRUE(someFunction());
* });
*/
#define RUN_ISOLATED_TEST(test_name, ...) \
do \
{ \
::RcclUnitTesting::ProcessIsolatedTestRunner::registerTest(test_name, __VA_ARGS__); \
bool passed_ = ::RcclUnitTesting::ProcessIsolatedTestRunner::executeAllTests(); \
EXPECT_TRUE(passed_) << "Isolated test '" << test_name << "' failed"; \
} \
while(0)
/**
* @brief Register and execute a single isolated test with environment variables
*
* Uses variadic macros to automatically handle environment variable initializer lists
*
* @param test_name Name of the test (string)
* @param test_body Lambda containing test logic
* @param ... Environment variables as initializer list
*
* Example:
* RUN_ISOLATED_TEST_WITH_ENV("MyTest",
* []() { EXPECT_TRUE(someFunction()); },
* {{"VAR1", "value1"}, {"VAR2", "value2"}});
*
* Note: Uses __VA_ARGS__ to capture environment variables, which automatically
* handles commas in the initializer list without requiring extra parentheses.
*/
#define RUN_ISOLATED_TEST_WITH_ENV(test_name, test_body, ...) \
do \
{ \
::RcclUnitTesting::ProcessIsolatedTestRunner::registerTest( \
test_name, \
test_body, \
__VA_ARGS__ \
); \
bool passed_ = ::RcclUnitTesting::ProcessIsolatedTestRunner::executeAllTests(); \
EXPECT_TRUE(passed_) << "Isolated test '" << test_name << "' failed"; \
} \
while(0)
/**
* @brief Register and execute multiple isolated tests with default options
*
* This macro takes multiple TestConfig objects and executes them all.
* Tests are automatically cleaned up after execution.
*
* Example:
* RUN_ISOLATED_TESTS(
* ProcessIsolatedTestRunner::TestConfig("Test1", []() { ... }),
* ProcessIsolatedTestRunner::TestConfig("Test2", []() { ... })
* .withEnvironment({{"VAR", "value"}}),
* ProcessIsolatedTestRunner::TestConfig("Test3", []() { ... })
* .withTimeout(std::chrono::seconds(60))
* );
*/
#define RUN_ISOLATED_TESTS(...) \
do \
{ \
::RcclUnitTesting::ProcessIsolatedTestRunner::TestConfig configs_[] = {__VA_ARGS__}; \
for(const auto& config_ : configs_) \
{ \
::RcclUnitTesting::ProcessIsolatedTestRunner::registerTest(config_); \
} \
bool passed_ = ::RcclUnitTesting::ProcessIsolatedTestRunner::executeAllTests(); \
EXPECT_TRUE(passed_) << "One or more isolated tests failed"; \
} \
while(0)
/**
* @brief Register and execute multiple isolated tests with custom options
*
* This macro takes execution options and multiple TestConfig objects.
*
* Example:
* ProcessIsolatedTestRunner::ExecutionOptions opts;
* opts.stopOnFirstFailure = true;
* opts.verboseLogging = true;
*
* RUN_ISOLATED_TESTS_WITH_OPTIONS(opts,
* ProcessIsolatedTestRunner::TestConfig("Test1", []() { ... }),
* ProcessIsolatedTestRunner::TestConfig("Test2", []() { ... })
* );
*/
#define RUN_ISOLATED_TESTS_WITH_OPTIONS(options, ...) \
do \
{ \
::RcclUnitTesting::ProcessIsolatedTestRunner::TestConfig configs_[] = {__VA_ARGS__}; \
for(const auto& config_ : configs_) \
{ \
::RcclUnitTesting::ProcessIsolatedTestRunner::registerTest(config_); \
} \
bool passed_ = ::RcclUnitTesting::ProcessIsolatedTestRunner::executeAllTests(options); \
EXPECT_TRUE(passed_) << "One or more isolated tests failed"; \
} \
while(0)
} // namespace RcclUnitTesting
Разница между файлами не показана из-за своего большого размера Загрузить разницу