Added a Process Isolated Test Runner (#1993)
* Added single process isolation support to execute tests
* Address review comments
* Update README
* Removed requirement of explicit call to clear method
* Added macros for simplified usage
* Updated tests to use process isolation framework
* Adjust summary output format for isolated tests
* Updated rccl_wrap tests
* Used process isolation in AllocTests
* Used process isolation and fixed failing tests
* Modified test output, added signal handling
Updated macros to handle lambdas
* Convert argcheck tests to isolated tests
* Convert proxy tests to isolated tests
* Remove non-supported test
* Fixed file descriptor handling and clearing env vars for tests
[ROCm/rccl commit: 7e10267dfd]
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
142860442a
Коммит
11ffeda52f
@@ -4,150 +4,187 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include <alloc.h>
|
||||
#include <gtest/gtest.h>
|
||||
#include <rccl/rccl.h>
|
||||
#include <alloc.h>
|
||||
|
||||
#include "TestBed.hpp"
|
||||
#include "common/ErrCode.hpp"
|
||||
#include "common/ProcessIsolatedTestRunner.hpp"
|
||||
|
||||
template ncclResult_t ncclCudaMemcpy<float>(float*, float*, size_t);
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
TEST(Alloc, ncclIbMallocDebugNonZero) {
|
||||
void* ptr = nullptr;
|
||||
size_t size = 4096;
|
||||
TEST(Alloc, ncclIbMallocDebugNonZero)
|
||||
{
|
||||
void* ptr = nullptr;
|
||||
size_t size = 4096;
|
||||
|
||||
ncclResult_t result = ncclIbMalloc(&ptr, size);
|
||||
ncclResult_t result = ncclIbMalloc(&ptr, size);
|
||||
|
||||
EXPECT_EQ(result, ncclSuccess);
|
||||
ASSERT_NE(ptr, nullptr);
|
||||
EXPECT_EQ(result, ncclSuccess);
|
||||
ASSERT_NE(ptr, nullptr);
|
||||
|
||||
char* char_ptr = static_cast<char*>(ptr);
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
ASSERT_EQ(char_ptr[i], 0);
|
||||
}
|
||||
|
||||
free(ptr);
|
||||
}
|
||||
|
||||
TEST(Alloc, ncclIbMallocDebugZeroSize) {
|
||||
void* ptr = (void*)0xdeadbeef;
|
||||
ncclResult_t result = ncclIbMalloc(&ptr, 0);
|
||||
|
||||
EXPECT_EQ(result, ncclSuccess);
|
||||
EXPECT_EQ(ptr, nullptr);
|
||||
}
|
||||
|
||||
|
||||
TEST(Alloc, ncclCuMemHostAlloc) {
|
||||
void* ptr = NULL;
|
||||
void* handle = NULL;
|
||||
size_t size = 1024;
|
||||
ncclResult_t result = ncclCuMemHostAlloc(&ptr, handle, size);
|
||||
ASSERT_EQ(result, ncclInternalError);
|
||||
}
|
||||
|
||||
TEST(Alloc, ncclCuMemHostFree)
|
||||
char* char_ptr = static_cast<char*>(ptr);
|
||||
for(size_t i = 0; i < size; ++i)
|
||||
{
|
||||
void* dummyPtr = reinterpret_cast<void*>(0x1234); // any dummy address
|
||||
ncclResult_t result = ncclCuMemHostFree(dummyPtr);
|
||||
ASSERT_EQ(result, ncclInternalError);
|
||||
ASSERT_EQ(char_ptr[i], 0);
|
||||
}
|
||||
|
||||
free(ptr);
|
||||
}
|
||||
|
||||
TEST(Alloc, ncclIbMallocDebugZeroSize)
|
||||
{
|
||||
void* ptr = (void*)0xdeadbeef;
|
||||
ncclResult_t result = ncclIbMalloc(&ptr, 0);
|
||||
|
||||
EXPECT_EQ(result, ncclSuccess);
|
||||
EXPECT_EQ(ptr, nullptr);
|
||||
}
|
||||
|
||||
TEST(Alloc, ncclCuMemHostAlloc)
|
||||
{
|
||||
void* ptr = NULL;
|
||||
void* handle = NULL;
|
||||
size_t size = 1024;
|
||||
ncclResult_t result = ncclCuMemHostAlloc(&ptr, handle, size);
|
||||
ASSERT_EQ(result, ncclInternalError);
|
||||
}
|
||||
|
||||
TEST(Alloc, ncclCuMemHostFree)
|
||||
{
|
||||
void* dummyPtr = reinterpret_cast<void*>(0x1234); // any dummy address
|
||||
ncclResult_t result = ncclCuMemHostFree(dummyPtr);
|
||||
ASSERT_EQ(result, ncclInternalError);
|
||||
}
|
||||
|
||||
#if ROCM_VERSION < 70000
|
||||
// This test is only valid for ROCm versions < 7.0.0
|
||||
// In ROCm 7.0.0+, the ncclCuMemAlloc signature changed
|
||||
TEST(Alloc, ncclCuMemAlloc)
|
||||
{
|
||||
void* ptr = reinterpret_cast<void*>(0x1234); // dummy non-null input
|
||||
void* handle = reinterpret_cast<void*>(0x5678); // dummy non-null input
|
||||
size_t size = 1024;
|
||||
hipMemAllocationHandleType type = hipMemHandleTypeNone;
|
||||
ncclResult_t result = ncclCuMemAlloc(&ptr, &handle, type, size);
|
||||
EXPECT_EQ(result, ncclInternalError);
|
||||
}
|
||||
// This test is only valid for ROCm versions < 7.0.0
|
||||
// In ROCm 7.0.0+, the ncclCuMemAlloc signature changed
|
||||
TEST(Alloc, ncclCuMemAlloc)
|
||||
{
|
||||
void* ptr = reinterpret_cast<void*>(0x1234); // dummy non-null input
|
||||
void* handle = reinterpret_cast<void*>(0x5678); // dummy non-null input
|
||||
size_t size = 1024;
|
||||
hipMemAllocationHandleType type = hipMemHandleTypeNone;
|
||||
ncclResult_t result = ncclCuMemAlloc(&ptr, &handle, type, size);
|
||||
EXPECT_EQ(result, ncclInternalError);
|
||||
}
|
||||
|
||||
TEST(Alloc, ncclCuMemFree)
|
||||
{
|
||||
void* dummyPtr = reinterpret_cast<void*>(0xdeadbeef); // arbitrary non-null
|
||||
ncclResult_t result = ncclCuMemFree(dummyPtr);
|
||||
EXPECT_EQ(result, ncclInternalError);
|
||||
}
|
||||
TEST(Alloc, ncclCuMemFree)
|
||||
{
|
||||
void* dummyPtr = reinterpret_cast<void*>(0xdeadbeef); // arbitrary non-null
|
||||
ncclResult_t result = ncclCuMemFree(dummyPtr);
|
||||
EXPECT_EQ(result, ncclInternalError);
|
||||
}
|
||||
|
||||
TEST(Alloc, ncclCuMemAllocAddr)
|
||||
{
|
||||
void* ptr = reinterpret_cast<void*>(0x1111); // Dummy non-null input
|
||||
hipMemGenericAllocationHandle_t handle = reinterpret_cast<hipMemGenericAllocationHandle_t>(0x1234);
|
||||
size_t size = 4096;
|
||||
ncclResult_t result = ncclCuMemAllocAddr(&ptr, &handle, size);
|
||||
ASSERT_EQ(result, ncclInternalError);
|
||||
}
|
||||
TEST(Alloc, ncclCuMemAllocAddr)
|
||||
{
|
||||
void* ptr = reinterpret_cast<void*>(0x1111); // Dummy non-null input
|
||||
hipMemGenericAllocationHandle_t handle
|
||||
= reinterpret_cast<hipMemGenericAllocationHandle_t>(0x1234);
|
||||
size_t size = 4096;
|
||||
ncclResult_t result = ncclCuMemAllocAddr(&ptr, &handle, size);
|
||||
ASSERT_EQ(result, ncclInternalError);
|
||||
}
|
||||
|
||||
TEST(Alloc, ncclCuMemFreeAddr)
|
||||
{
|
||||
void* testPtr = reinterpret_cast<void*>(0xbeefcafe); // Arbitrary non-null pointer
|
||||
ncclResult_t result = ncclCuMemFreeAddr(testPtr);
|
||||
ASSERT_EQ(result, ncclInternalError);
|
||||
}
|
||||
TEST(Alloc, ncclCuMemFreeAddr)
|
||||
{
|
||||
void* testPtr = reinterpret_cast<void*>(0xbeefcafe); // Arbitrary non-null pointer
|
||||
ncclResult_t result = ncclCuMemFreeAddr(testPtr);
|
||||
ASSERT_EQ(result, ncclInternalError);
|
||||
}
|
||||
#endif // ROCM_VERSION < 70000
|
||||
|
||||
TEST(Alloc, NcclCudaMemcpy) {
|
||||
constexpr size_t N = 128;
|
||||
float *d_src = nullptr, *d_dst = nullptr;
|
||||
float h_src[N], h_dst[N];
|
||||
TEST(Alloc, NcclCudaMemcpy)
|
||||
{
|
||||
RUN_ISOLATED_TEST(
|
||||
"NcclCudaMemcpy",
|
||||
[]()
|
||||
{
|
||||
constexpr size_t N = 128;
|
||||
float * d_src = nullptr, *d_dst = nullptr;
|
||||
float h_src[N], h_dst[N];
|
||||
|
||||
for (size_t i = 0; i < N; ++i) h_src[i] = static_cast<float>(i + 1);
|
||||
// Allocate device memory
|
||||
for(size_t i = 0; i < N; ++i)
|
||||
h_src[i] = static_cast<float>(i + 1);
|
||||
// Allocate device memory
|
||||
|
||||
ASSERT_EQ(hipMalloc(&d_src, N * sizeof(float)), hipSuccess);
|
||||
ASSERT_EQ(hipMalloc(&d_dst, N * sizeof(float)), hipSuccess);
|
||||
ASSERT_EQ(hipMalloc(&d_src, N * sizeof(float)), hipSuccess);
|
||||
ASSERT_EQ(hipMalloc(&d_dst, N * sizeof(float)), hipSuccess);
|
||||
|
||||
// Copy from host to device (source buffer)
|
||||
ASSERT_EQ(hipMemcpy(d_src, h_src, N * sizeof(float), hipMemcpyHostToDevice), hipSuccess);
|
||||
// Copy from host to device (source buffer)
|
||||
ASSERT_EQ(
|
||||
hipMemcpy(d_src, h_src, N * sizeof(float), hipMemcpyHostToDevice),
|
||||
hipSuccess
|
||||
);
|
||||
|
||||
// Perform the tested function
|
||||
ncclResult_t result = ncclCudaMemcpy<float>(d_dst, d_src, N);
|
||||
// Perform the tested function
|
||||
ncclResult_t result = ncclCudaMemcpy<float>(d_dst, d_src, N);
|
||||
|
||||
ASSERT_EQ(result, ncclSuccess); // Fixed typo: was ncclSsuccess
|
||||
ASSERT_EQ(result, ncclSuccess);
|
||||
|
||||
// Copy result back to host
|
||||
ASSERT_EQ(hipMemcpy(h_dst, d_dst, N * sizeof(float), hipMemcpyDeviceToHost), hipSuccess);
|
||||
// Copy result back to host
|
||||
ASSERT_EQ(
|
||||
hipMemcpy(h_dst, d_dst, N * sizeof(float), hipMemcpyDeviceToHost),
|
||||
hipSuccess
|
||||
);
|
||||
|
||||
// Check correctness
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
EXPECT_EQ(h_src[i], h_dst[i]) << "Mismatch at index " << i;
|
||||
// Check correctness
|
||||
for(size_t i = 0; i < N; ++i)
|
||||
{
|
||||
EXPECT_EQ(h_src[i], h_dst[i]) << "Mismatch at index " << i;
|
||||
}
|
||||
// Free memory
|
||||
hipFree(d_src);
|
||||
hipFree(d_dst);
|
||||
}
|
||||
// Free memory
|
||||
hipFree(d_src);
|
||||
hipFree(d_dst);
|
||||
);
|
||||
}
|
||||
|
||||
}
|
||||
TEST(Alloc, ZeroElementMemcpy)
|
||||
{
|
||||
RUN_ISOLATED_TEST(
|
||||
"ZeroElementMemcpy",
|
||||
[]()
|
||||
{
|
||||
float *d_src = nullptr, *d_dst = nullptr;
|
||||
ASSERT_EQ(hipMalloc(&d_src, sizeof(float)), hipSuccess);
|
||||
ASSERT_EQ(hipMalloc(&d_dst, sizeof(float)), hipSuccess);
|
||||
|
||||
TEST(Alloc, ZeroElementMemcpy) {
|
||||
float *d_src = nullptr, *d_dst = nullptr;
|
||||
ASSERT_EQ(hipMalloc(&d_src, sizeof(float)), hipSuccess);
|
||||
ASSERT_EQ(hipMalloc(&d_dst, sizeof(float)), hipSuccess);
|
||||
ncclResult_t result = ncclCudaMemcpy<float>(d_dst, d_src, 0);
|
||||
EXPECT_EQ(result, ncclSuccess) << "Zero-element copy should succeed (no-op)";
|
||||
|
||||
ncclResult_t result = ncclCudaMemcpy<float>(d_dst, d_src, 0);
|
||||
EXPECT_EQ(result, ncclSuccess) << "Zero-element copy should succeed (no-op)";
|
||||
hipFree(d_src);
|
||||
hipFree(d_dst);
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
hipFree(d_src);
|
||||
hipFree(d_dst);
|
||||
}
|
||||
TEST(Alloc, MemcpyNullSrcOrDstPointer)
|
||||
{
|
||||
RUN_ISOLATED_TEST(
|
||||
"MemcpyNullSrcOrDstPointer",
|
||||
[]()
|
||||
{
|
||||
constexpr size_t N = 16;
|
||||
float* d_valid = nullptr;
|
||||
ASSERT_EQ(hipMalloc(&d_valid, N * sizeof(float)), hipSuccess);
|
||||
|
||||
TEST(Alloc, MemcpyNullSrcOrDstPointer) {
|
||||
constexpr size_t N = 16;
|
||||
float* d_valid = nullptr;
|
||||
ASSERT_EQ(hipMalloc(&d_valid, N * sizeof(float)), hipSuccess);
|
||||
// Case 1: src is nullptr
|
||||
ncclResult_t result = ncclCudaMemcpy<float>(d_valid, nullptr, N);
|
||||
EXPECT_EQ(result, ncclUnhandledCudaError)
|
||||
<< "Expected ncclUnhandledCudaError when src is nullptr";
|
||||
|
||||
// Case 1: src is nullptr
|
||||
ncclResult_t result = ncclCudaMemcpy<float>(d_valid, nullptr, N);
|
||||
EXPECT_EQ(result, ncclUnhandledCudaError) << "Expected ncclUnhandledCudaError when src is nullptr";
|
||||
// Case 2: dst is nullptr
|
||||
result = ncclCudaMemcpy<float>(nullptr, d_valid, N);
|
||||
EXPECT_EQ(result, ncclUnhandledCudaError)
|
||||
<< "Expected ncclUnhandledCudaError when dst is nullptr";
|
||||
|
||||
// Case 2: dst is nullptr
|
||||
result = ncclCudaMemcpy<float>(nullptr, d_valid, N);
|
||||
EXPECT_EQ(result, ncclUnhandledCudaError) << "Expected ncclUnhandledCudaError when dst is nullptr";
|
||||
|
||||
hipFree(d_valid);
|
||||
}
|
||||
} //namespace rccl
|
||||
hipFree(d_valid);
|
||||
}
|
||||
);
|
||||
}
|
||||
} // namespace RcclUnitTesting
|
||||
@@ -4,324 +4,626 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include <gtest/gtest.h>
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
#include "argcheck.h"
|
||||
#include "comm.h"
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "common/ErrCode.hpp"
|
||||
#include "common/ProcessIsolatedTestRunner.hpp"
|
||||
|
||||
class ArgCheckTest : public ::testing::Test {
|
||||
protected:
|
||||
ncclComm_t comm;
|
||||
struct ncclInfo *info;
|
||||
int *sendDevicePtr = nullptr;
|
||||
int *recvDevicePtr = nullptr;
|
||||
// Helper struct for ArgCheck tests (NOT a fixture - used inside isolated tests)
|
||||
struct ArgCheckTestEnvironment
|
||||
{
|
||||
ncclComm_t comm;
|
||||
struct ncclInfo* info;
|
||||
int* sendDevicePtr = nullptr;
|
||||
int* recvDevicePtr = nullptr;
|
||||
|
||||
// Helper function to set up valid ncclInfo for boundary testing
|
||||
void SetupValidInfo() {
|
||||
// Set up valid info structure
|
||||
info->comm = comm;
|
||||
info->root = 0; // Valid root
|
||||
info->datatype = (ncclDataType_t)0; // Valid datatype
|
||||
info->op = (ncclRedOp_t)0; // Valid reduction operation
|
||||
info->coll = ncclFuncBroadcast; // Valid collective operation
|
||||
info->sendbuff = nullptr; // Will be set per test if needed
|
||||
info->recvbuff = nullptr; // Will be set per test if needed
|
||||
info->count = 10; // Valid count
|
||||
info->opName = "TestOp"; // Valid operation name
|
||||
}
|
||||
|
||||
// Helper function for tests requiring device memory
|
||||
void SetupValidBufferWithDeviceMemory() {
|
||||
// Set the active device to match comm->cudaDev
|
||||
hipError_t errSetDevice = hipSetDevice(comm->cudaDev);
|
||||
ASSERT_EQ(errSetDevice, hipSuccess);
|
||||
|
||||
// Allocate device memory
|
||||
hipError_t errSend = hipMalloc(&sendDevicePtr, sizeof(int));
|
||||
ASSERT_EQ(errSend, hipSuccess);
|
||||
hipError_t errRecv = hipMalloc(&recvDevicePtr, sizeof(int));
|
||||
ASSERT_EQ(errRecv, hipSuccess);
|
||||
|
||||
// Set device pointers
|
||||
info->sendbuff = sendDevicePtr;
|
||||
info->recvbuff = recvDevicePtr;
|
||||
}
|
||||
|
||||
// Helper to clean up device memory
|
||||
void CleanupDeviceMemory() {
|
||||
if (sendDevicePtr) {
|
||||
hipFree(sendDevicePtr);
|
||||
sendDevicePtr = nullptr;
|
||||
// Helper function to set up valid ncclInfo for boundary testing
|
||||
void SetupValidInfo()
|
||||
{
|
||||
// Set up valid info structure
|
||||
info->comm = comm;
|
||||
info->root = 0; // Valid root
|
||||
info->datatype = (ncclDataType_t)0; // Valid datatype
|
||||
info->op = (ncclRedOp_t)0; // Valid reduction operation
|
||||
info->coll = ncclFuncBroadcast; // Valid collective operation
|
||||
info->sendbuff = nullptr; // Will be set per test if needed
|
||||
info->recvbuff = nullptr; // Will be set per test if needed
|
||||
info->count = 10; // Valid count
|
||||
info->opName = "TestOp"; // Valid operation name
|
||||
}
|
||||
if (recvDevicePtr) {
|
||||
hipFree(recvDevicePtr);
|
||||
recvDevicePtr = nullptr;
|
||||
|
||||
// Helper function for tests requiring device memory
|
||||
void SetupValidBufferWithDeviceMemory()
|
||||
{
|
||||
// Set the active device to match comm->cudaDev
|
||||
hipError_t errSetDevice = hipSetDevice(comm->cudaDev);
|
||||
ASSERT_EQ(errSetDevice, hipSuccess);
|
||||
|
||||
// Allocate device memory
|
||||
hipError_t errSend = hipMalloc(&sendDevicePtr, sizeof(int));
|
||||
ASSERT_EQ(errSend, hipSuccess);
|
||||
hipError_t errRecv = hipMalloc(&recvDevicePtr, sizeof(int));
|
||||
ASSERT_EQ(errRecv, hipSuccess);
|
||||
|
||||
// Set device pointers
|
||||
info->sendbuff = sendDevicePtr;
|
||||
info->recvbuff = recvDevicePtr;
|
||||
}
|
||||
}
|
||||
|
||||
void SetUp() override {
|
||||
// Allocate and zero-initialize ncclComm as a pointer
|
||||
comm = (struct ncclComm *)calloc(1, sizeof(struct ncclComm));
|
||||
ASSERT_NE(comm, nullptr) << "Failed to allocate ncclComm";
|
||||
|
||||
// Initialize the communicator with required fields
|
||||
comm->cudaDev = 0;
|
||||
comm->nRanks = 4;
|
||||
comm->checkPointers = true;
|
||||
comm->rank = 0;
|
||||
|
||||
comm->startMagic = NCCL_MAGIC;
|
||||
comm->endMagic = NCCL_MAGIC;
|
||||
|
||||
// Verify the magic values were set correctly
|
||||
ASSERT_EQ(comm->startMagic, NCCL_MAGIC) << "startMagic not set correctly";
|
||||
ASSERT_EQ(comm->endMagic, NCCL_MAGIC) << "endMagic not set correctly";
|
||||
|
||||
// Allocate and zero-initialize ncclInfo as a pointer
|
||||
info = (ncclInfo *)calloc(1, sizeof(ncclInfo));
|
||||
ASSERT_NE(info, nullptr) << "Failed to allocate ncclInfo";
|
||||
|
||||
SetupValidInfo();
|
||||
|
||||
SetupValidBufferWithDeviceMemory();
|
||||
}
|
||||
|
||||
void TearDown() override {
|
||||
// Free the allocated memory
|
||||
CleanupDeviceMemory();
|
||||
if (info) {
|
||||
free(info);
|
||||
info = nullptr;
|
||||
// Helper to clean up device memory
|
||||
void CleanupDeviceMemory()
|
||||
{
|
||||
if(sendDevicePtr)
|
||||
{
|
||||
hipFree(sendDevicePtr);
|
||||
sendDevicePtr = nullptr;
|
||||
}
|
||||
if(recvDevicePtr)
|
||||
{
|
||||
hipFree(recvDevicePtr);
|
||||
recvDevicePtr = nullptr;
|
||||
}
|
||||
}
|
||||
if (comm) {
|
||||
free(comm);
|
||||
comm = nullptr;
|
||||
|
||||
void setup()
|
||||
{
|
||||
// Allocate and zero-initialize ncclComm as a pointer
|
||||
comm = (struct ncclComm*)calloc(1, sizeof(struct ncclComm));
|
||||
ASSERT_NE(comm, nullptr) << "Failed to allocate ncclComm";
|
||||
|
||||
// Initialize the communicator with required fields
|
||||
comm->cudaDev = 0;
|
||||
comm->nRanks = 4;
|
||||
comm->checkPointers = true;
|
||||
comm->rank = 0;
|
||||
|
||||
comm->startMagic = NCCL_MAGIC;
|
||||
comm->endMagic = NCCL_MAGIC;
|
||||
|
||||
// Verify the magic values were set correctly
|
||||
ASSERT_EQ(comm->startMagic, NCCL_MAGIC) << "startMagic not set correctly";
|
||||
ASSERT_EQ(comm->endMagic, NCCL_MAGIC) << "endMagic not set correctly";
|
||||
|
||||
// Allocate and zero-initialize ncclInfo as a pointer
|
||||
info = (ncclInfo*)calloc(1, sizeof(ncclInfo));
|
||||
ASSERT_NE(info, nullptr) << "Failed to allocate ncclInfo";
|
||||
|
||||
SetupValidInfo();
|
||||
|
||||
SetupValidBufferWithDeviceMemory();
|
||||
}
|
||||
|
||||
void cleanup()
|
||||
{
|
||||
// Free the allocated memory
|
||||
CleanupDeviceMemory();
|
||||
if(info)
|
||||
{
|
||||
free(info);
|
||||
info = nullptr;
|
||||
}
|
||||
if(comm)
|
||||
{
|
||||
free(comm);
|
||||
comm = nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
TEST_F(ArgCheckTest, CudaPtrCheck_ValidPointer) {
|
||||
int *devicePtr = nullptr;
|
||||
hipError_t err = hipMalloc(&devicePtr, sizeof(int));
|
||||
ASSERT_EQ(err, hipSuccess);
|
||||
TEST(ArgCheckTest, CudaPtrCheck_ValidPointer)
|
||||
{
|
||||
RUN_ISOLATED_TEST(
|
||||
"CudaPtrCheck_ValidPointer",
|
||||
[]()
|
||||
{
|
||||
ArgCheckTestEnvironment env;
|
||||
env.setup();
|
||||
|
||||
ncclResult_t result = CudaPtrCheck(devicePtr, comm, "devicePtr", "TestOp");
|
||||
EXPECT_EQ(result, ncclSuccess);
|
||||
int* devicePtr = nullptr;
|
||||
hipError_t err = hipMalloc(&devicePtr, sizeof(int));
|
||||
ASSERT_EQ(err, hipSuccess);
|
||||
|
||||
hipFree(devicePtr);
|
||||
ncclResult_t result = CudaPtrCheck(devicePtr, env.comm, "devicePtr", "TestOp");
|
||||
EXPECT_EQ(result, ncclSuccess);
|
||||
|
||||
hipFree(devicePtr);
|
||||
env.cleanup();
|
||||
INFO("Test 'CudaPtrCheck_ValidPointer' PASSED\n");
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
TEST_F(ArgCheckTest, CudaPtrCheck_NullPointer) {
|
||||
ncclResult_t result = CudaPtrCheck(nullptr, comm, "invalidPtr", "TestOp");
|
||||
EXPECT_EQ(result, ncclInvalidArgument);
|
||||
TEST(ArgCheckTest, CudaPtrCheck_NullPointer)
|
||||
{
|
||||
RUN_ISOLATED_TEST(
|
||||
"CudaPtrCheck_NullPointer",
|
||||
[]()
|
||||
{
|
||||
ArgCheckTestEnvironment env;
|
||||
env.setup();
|
||||
|
||||
ncclResult_t result = CudaPtrCheck(nullptr, env.comm, "invalidPtr", "TestOp");
|
||||
EXPECT_EQ(result, ncclInvalidArgument);
|
||||
|
||||
env.cleanup();
|
||||
INFO("Test 'CudaPtrCheck_NullPointer' PASSED\n");
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
TEST_F(ArgCheckTest, CudaPtrCheck_DifferentDevicePointer) {
|
||||
int *devicePtr = nullptr;
|
||||
hipSetDevice(1);
|
||||
hipError_t err = hipMalloc(&devicePtr, sizeof(int));
|
||||
ASSERT_EQ(err, hipSuccess);
|
||||
TEST(ArgCheckTest, CudaPtrCheck_DifferentDevicePointer)
|
||||
{
|
||||
RUN_ISOLATED_TEST(
|
||||
"CudaPtrCheck_DifferentDevicePointer",
|
||||
[]()
|
||||
{
|
||||
ArgCheckTestEnvironment env;
|
||||
env.setup();
|
||||
|
||||
ncclResult_t result = CudaPtrCheck(devicePtr, comm, "devicePtr", "TestOp");
|
||||
EXPECT_EQ(result, ncclInvalidArgument);
|
||||
int* devicePtr = nullptr;
|
||||
hipSetDevice(1);
|
||||
hipError_t err = hipMalloc(&devicePtr, sizeof(int));
|
||||
ASSERT_EQ(err, hipSuccess);
|
||||
|
||||
hipFree(devicePtr);
|
||||
hipSetDevice(comm->cudaDev);
|
||||
ncclResult_t result = CudaPtrCheck(devicePtr, env.comm, "devicePtr", "TestOp");
|
||||
EXPECT_EQ(result, ncclInvalidArgument);
|
||||
|
||||
hipFree(devicePtr);
|
||||
hipSetDevice(env.comm->cudaDev);
|
||||
|
||||
env.cleanup();
|
||||
INFO("Test 'CudaPtrCheck_DifferentDevicePointer' PASSED\n");
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
TEST_F(ArgCheckTest, CudaPtrCheck_HostMemoryPointer) {
|
||||
// Test with host memory instead of device memory
|
||||
int *hostPtr = (int *)malloc(sizeof(int));
|
||||
ASSERT_NE(hostPtr, nullptr) << "Failed to allocate host memory";
|
||||
TEST(ArgCheckTest, CudaPtrCheck_HostMemoryPointer)
|
||||
{
|
||||
RUN_ISOLATED_TEST(
|
||||
"CudaPtrCheck_HostMemoryPointer",
|
||||
[]()
|
||||
{
|
||||
ArgCheckTestEnvironment env;
|
||||
env.setup();
|
||||
|
||||
*hostPtr = 42; // Initialize the memory
|
||||
// Test with host memory instead of device memory
|
||||
int* hostPtr = (int*)malloc(sizeof(int));
|
||||
ASSERT_NE(hostPtr, nullptr) << "Failed to allocate host memory";
|
||||
|
||||
// This should fail because host memory is not device memory
|
||||
ncclResult_t result = CudaPtrCheck(hostPtr, comm, "hostPtr", "TestOp");
|
||||
*hostPtr = 42; // Initialize the memory
|
||||
|
||||
// Host memory should be rejected by CudaPtrCheck
|
||||
EXPECT_EQ(result, ncclInvalidArgument)
|
||||
<< "Host memory should be rejected by CudaPtrCheck";
|
||||
// This should fail because host memory is not device memory
|
||||
ncclResult_t result = CudaPtrCheck(hostPtr, env.comm, "hostPtr", "TestOp");
|
||||
|
||||
free(hostPtr);
|
||||
// Host memory should be rejected by CudaPtrCheck
|
||||
EXPECT_EQ(result, ncclInvalidArgument)
|
||||
<< "Host memory should be rejected by CudaPtrCheck";
|
||||
|
||||
free(hostPtr);
|
||||
|
||||
env.cleanup();
|
||||
INFO("Test 'CudaPtrCheck_HostMemoryPointer' PASSED\n");
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
TEST_F(ArgCheckTest, PtrCheck_ValidPointer) {
|
||||
int value = 42;
|
||||
ncclResult_t result = PtrCheck(&value, "TestOp", "value");
|
||||
ASSERT_EQ(result, ncclSuccess);
|
||||
TEST(ArgCheckTest, PtrCheck_ValidPointer)
|
||||
{
|
||||
RUN_ISOLATED_TEST(
|
||||
"PtrCheck_ValidPointer",
|
||||
[]()
|
||||
{
|
||||
int value = 42;
|
||||
ncclResult_t result = PtrCheck(&value, "TestOp", "value");
|
||||
ASSERT_EQ(result, ncclSuccess);
|
||||
INFO("Test 'PtrCheck_ValidPointer' PASSED\n");
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
TEST_F(ArgCheckTest, PtrCheck_NullPointer) {
|
||||
ncclResult_t result = PtrCheck(nullptr, "TestOp", "value");
|
||||
ASSERT_EQ(result, ncclInvalidArgument);
|
||||
TEST(ArgCheckTest, PtrCheck_NullPointer)
|
||||
{
|
||||
RUN_ISOLATED_TEST(
|
||||
"PtrCheck_NullPointer",
|
||||
[]()
|
||||
{
|
||||
ncclResult_t result = PtrCheck(nullptr, "TestOp", "value");
|
||||
ASSERT_EQ(result, ncclInvalidArgument);
|
||||
INFO("Test 'PtrCheck_NullPointer' PASSED\n");
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
TEST_F(ArgCheckTest, CommCheck_ValidComm) {
|
||||
comm->startMagic = NCCL_MAGIC;
|
||||
comm->endMagic = NCCL_MAGIC;
|
||||
TEST(ArgCheckTest, CommCheck_ValidComm)
|
||||
{
|
||||
RUN_ISOLATED_TEST(
|
||||
"CommCheck_ValidComm",
|
||||
[]()
|
||||
{
|
||||
ArgCheckTestEnvironment env;
|
||||
env.setup();
|
||||
|
||||
// Verify magic values are still correct (should be set in SetUp())
|
||||
ASSERT_EQ(comm->startMagic, NCCL_MAGIC) << "startMagic was corrupted";
|
||||
ASSERT_EQ(comm->endMagic, NCCL_MAGIC) << "endMagic was corrupted";
|
||||
env.comm->startMagic = NCCL_MAGIC;
|
||||
env.comm->endMagic = NCCL_MAGIC;
|
||||
|
||||
// Call CommCheck and verify the result
|
||||
ncclResult_t result = CommCheck(comm, "TestOp", "testComm");
|
||||
EXPECT_EQ(result, ncclSuccess) << "Failed for valid communicator";
|
||||
// Verify magic values are still correct (should be set in setup())
|
||||
ASSERT_EQ(env.comm->startMagic, NCCL_MAGIC) << "startMagic was corrupted";
|
||||
ASSERT_EQ(env.comm->endMagic, NCCL_MAGIC) << "endMagic was corrupted";
|
||||
|
||||
// Call CommCheck and verify the result
|
||||
ncclResult_t result = CommCheck(env.comm, "TestOp", "testComm");
|
||||
EXPECT_EQ(result, ncclSuccess) << "Failed for valid communicator";
|
||||
|
||||
env.cleanup();
|
||||
INFO("Test 'CommCheck_ValidComm' PASSED\n");
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
TEST_F(ArgCheckTest, CommCheck_NullComm) {
|
||||
ncclResult_t result = CommCheck(nullptr, "TestOp", "comm");
|
||||
ASSERT_EQ(result, ncclInvalidArgument);
|
||||
TEST(ArgCheckTest, CommCheck_NullComm)
|
||||
{
|
||||
RUN_ISOLATED_TEST(
|
||||
"CommCheck_NullComm",
|
||||
[]()
|
||||
{
|
||||
ncclResult_t result = CommCheck(nullptr, "TestOp", "comm");
|
||||
ASSERT_EQ(result, ncclInvalidArgument);
|
||||
INFO("Test 'CommCheck_NullComm' PASSED\n");
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
TEST_F(ArgCheckTest, CommCheck_CorruptedStartMagic) {
|
||||
// Corrupt only startMagic, keep endMagic valid
|
||||
comm->startMagic = 1; // Corrupt startMagic
|
||||
comm->endMagic = NCCL_MAGIC; // Keep endMagic valid
|
||||
TEST(ArgCheckTest, CommCheck_CorruptedStartMagic)
|
||||
{
|
||||
RUN_ISOLATED_TEST(
|
||||
"CommCheck_CorruptedStartMagic",
|
||||
[]()
|
||||
{
|
||||
ArgCheckTestEnvironment env;
|
||||
env.setup();
|
||||
|
||||
// Call CommCheck and verify the result
|
||||
ncclResult_t result = CommCheck(comm, "TestOp", "comm");
|
||||
EXPECT_EQ(result, ncclInvalidArgument) << "Failed for corrupted startMagic";
|
||||
// Corrupt only startMagic, keep endMagic valid
|
||||
env.comm->startMagic = 1; // Corrupt startMagic
|
||||
env.comm->endMagic = NCCL_MAGIC; // Keep endMagic valid
|
||||
|
||||
// Call CommCheck and verify the result
|
||||
ncclResult_t result = CommCheck(env.comm, "TestOp", "comm");
|
||||
EXPECT_EQ(result, ncclInvalidArgument) << "Failed for corrupted startMagic";
|
||||
|
||||
env.cleanup();
|
||||
INFO("Test 'CommCheck_CorruptedStartMagic' PASSED\n");
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
TEST_F(ArgCheckTest, CommCheck_CorruptedEndMagic) {
|
||||
// Keep startMagic valid, corrupt only endMagic
|
||||
comm->startMagic = NCCL_MAGIC; // Keep startMagic valid
|
||||
comm->endMagic = 1; // Corrupt endMagic
|
||||
TEST(ArgCheckTest, CommCheck_CorruptedEndMagic)
|
||||
{
|
||||
RUN_ISOLATED_TEST(
|
||||
"CommCheck_CorruptedEndMagic",
|
||||
[]()
|
||||
{
|
||||
ArgCheckTestEnvironment env;
|
||||
env.setup();
|
||||
|
||||
// Call CommCheck and verify the result
|
||||
ncclResult_t result = CommCheck(comm, "TestOp", "comm");
|
||||
EXPECT_EQ(result, ncclInvalidArgument) << "Failed for corrupted endMagic";
|
||||
// Keep startMagic valid, corrupt only endMagic
|
||||
env.comm->startMagic = NCCL_MAGIC; // Keep startMagic valid
|
||||
env.comm->endMagic = 1; // Corrupt endMagic
|
||||
|
||||
// Call CommCheck and verify the result
|
||||
ncclResult_t result = CommCheck(env.comm, "TestOp", "comm");
|
||||
EXPECT_EQ(result, ncclInvalidArgument) << "Failed for corrupted endMagic";
|
||||
|
||||
env.cleanup();
|
||||
INFO("Test 'CommCheck_CorruptedEndMagic' PASSED\n");
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
TEST_F(ArgCheckTest, CommCheck_CorruptedBothMagics) {
|
||||
// Corrupt both startMagic and endMagic
|
||||
comm->startMagic = 1; // Corrupt startMagic
|
||||
comm->endMagic = 1; // Corrupt endMagic
|
||||
TEST(ArgCheckTest, CommCheck_CorruptedBothMagics)
|
||||
{
|
||||
RUN_ISOLATED_TEST(
|
||||
"CommCheck_CorruptedBothMagics",
|
||||
[]()
|
||||
{
|
||||
ArgCheckTestEnvironment env;
|
||||
env.setup();
|
||||
|
||||
// Call CommCheck and verify the result
|
||||
ncclResult_t result = CommCheck(comm, "TestOp", "comm");
|
||||
EXPECT_EQ(result, ncclInvalidArgument)
|
||||
<< "Failed for corrupted both magic values";
|
||||
// Corrupt both startMagic and endMagic
|
||||
env.comm->startMagic = 1; // Corrupt startMagic
|
||||
env.comm->endMagic = 1; // Corrupt endMagic
|
||||
|
||||
// Call CommCheck and verify the result
|
||||
ncclResult_t result = CommCheck(env.comm, "TestOp", "comm");
|
||||
EXPECT_EQ(result, ncclInvalidArgument) << "Failed for corrupted both magic values";
|
||||
|
||||
env.cleanup();
|
||||
INFO("Test 'CommCheck_CorruptedBothMagics' PASSED\n");
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
TEST_F(ArgCheckTest, ArgsCheck_InvalidRoot_NegativeValue) {
|
||||
info->root = -1; // Invalid root (< 0)
|
||||
TEST(ArgCheckTest, ArgsCheck_InvalidRoot_NegativeValue)
|
||||
{
|
||||
RUN_ISOLATED_TEST(
|
||||
"ArgsCheck_InvalidRoot_NegativeValue",
|
||||
[]()
|
||||
{
|
||||
ArgCheckTestEnvironment env;
|
||||
env.setup();
|
||||
|
||||
ncclResult_t result = ArgsCheck(info);
|
||||
EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid root < 0";
|
||||
env.info->root = -1; // Invalid root (< 0)
|
||||
|
||||
ncclResult_t result = ArgsCheck(env.info);
|
||||
EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid root < 0";
|
||||
|
||||
env.cleanup();
|
||||
INFO("Test 'ArgsCheck_InvalidRoot_NegativeValue' PASSED\n");
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
TEST_F(ArgCheckTest, ArgsCheck_InvalidRoot_ExceedsNRanks) {
|
||||
info->root = comm->nRanks; // Invalid root (>= nRanks)
|
||||
TEST(ArgCheckTest, ArgsCheck_InvalidRoot_ExceedsNRanks)
|
||||
{
|
||||
RUN_ISOLATED_TEST(
|
||||
"ArgsCheck_InvalidRoot_ExceedsNRanks",
|
||||
[]()
|
||||
{
|
||||
ArgCheckTestEnvironment env;
|
||||
env.setup();
|
||||
|
||||
ncclResult_t result = ArgsCheck(info);
|
||||
EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid root >= nRanks";
|
||||
env.info->root = env.comm->nRanks; // Invalid root (>= nRanks)
|
||||
|
||||
ncclResult_t result = ArgsCheck(env.info);
|
||||
EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid root >= nRanks";
|
||||
|
||||
env.cleanup();
|
||||
INFO("Test 'ArgsCheck_InvalidRoot_ExceedsNRanks' PASSED\n");
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
TEST_F(ArgCheckTest, ArgsCheck_InvalidDatatype_NegativeValue) {
|
||||
info->datatype = (ncclDataType_t)-1; // Invalid datatype (< 0)
|
||||
TEST(ArgCheckTest, ArgsCheck_InvalidDatatype_NegativeValue)
|
||||
{
|
||||
RUN_ISOLATED_TEST(
|
||||
"ArgsCheck_InvalidDatatype_NegativeValue",
|
||||
[]()
|
||||
{
|
||||
ArgCheckTestEnvironment env;
|
||||
env.setup();
|
||||
|
||||
ncclResult_t result = ArgsCheck(info);
|
||||
EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid datatype < 0";
|
||||
env.info->datatype = (ncclDataType_t)-1; // Invalid datatype (< 0)
|
||||
|
||||
ncclResult_t result = ArgsCheck(env.info);
|
||||
EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid datatype < 0";
|
||||
|
||||
env.cleanup();
|
||||
INFO("Test 'ArgsCheck_InvalidDatatype_NegativeValue' PASSED\n");
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
TEST_F(ArgCheckTest, ArgsCheck_InvalidDatatype_ExceedsMaxValue) {
|
||||
info->datatype =
|
||||
(ncclDataType_t)ncclNumTypes; // Invalid datatype (>= ncclNumTypes)
|
||||
TEST(ArgCheckTest, ArgsCheck_InvalidDatatype_ExceedsMaxValue)
|
||||
{
|
||||
RUN_ISOLATED_TEST(
|
||||
"ArgsCheck_InvalidDatatype_ExceedsMaxValue",
|
||||
[]()
|
||||
{
|
||||
ArgCheckTestEnvironment env;
|
||||
env.setup();
|
||||
|
||||
ncclResult_t result = ArgsCheck(info);
|
||||
EXPECT_EQ(result, ncclInvalidArgument)
|
||||
<< "Failed for invalid datatype >= ncclNumTypes";
|
||||
env.info->datatype = (ncclDataType_t)ncclNumTypes; // Invalid datatype (>= ncclNumTypes)
|
||||
|
||||
ncclResult_t result = ArgsCheck(env.info);
|
||||
EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid datatype >= ncclNumTypes";
|
||||
|
||||
env.cleanup();
|
||||
INFO("Test 'ArgsCheck_InvalidDatatype_ExceedsMaxValue' PASSED\n");
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
TEST_F(ArgCheckTest, ArgsCheck_InvalidReductionOperation_NegativeValue) {
|
||||
info->op = (ncclRedOp_t)-1; // Invalid reduction operation (< 0)
|
||||
TEST(ArgCheckTest, ArgsCheck_InvalidReductionOperation_NegativeValue)
|
||||
{
|
||||
RUN_ISOLATED_TEST(
|
||||
"ArgsCheck_InvalidReductionOperation_NegativeValue",
|
||||
[]()
|
||||
{
|
||||
ArgCheckTestEnvironment env;
|
||||
env.setup();
|
||||
|
||||
ncclResult_t result = ArgsCheck(info);
|
||||
EXPECT_EQ(result, ncclInvalidArgument)
|
||||
<< "Failed for invalid reduction operation < 0";
|
||||
env.info->op = (ncclRedOp_t)-1; // Invalid reduction operation (< 0)
|
||||
|
||||
ncclResult_t result = ArgsCheck(env.info);
|
||||
EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid reduction operation < 0";
|
||||
|
||||
env.cleanup();
|
||||
INFO("Test 'ArgsCheck_InvalidReductionOperation_NegativeValue' PASSED\n");
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
TEST_F(ArgCheckTest, ArgsCheck_InvalidReductionOperation_ExceedsMaxValue) {
|
||||
info->op =
|
||||
(ncclRedOp_t)ncclNumOps; // Invalid reduction operation (>= ncclNumOps)
|
||||
TEST(ArgCheckTest, ArgsCheck_InvalidReductionOperation_ExceedsMaxValue)
|
||||
{
|
||||
RUN_ISOLATED_TEST(
|
||||
"ArgsCheck_InvalidReductionOperation_ExceedsMaxValue",
|
||||
[]()
|
||||
{
|
||||
ArgCheckTestEnvironment env;
|
||||
env.setup();
|
||||
|
||||
ncclResult_t result = ArgsCheck(info);
|
||||
EXPECT_EQ(result, ncclInvalidArgument)
|
||||
<< "Failed for invalid reduction operation >= ncclNumOps";
|
||||
env.info->op = (ncclRedOp_t)ncclNumOps; // Invalid reduction operation (>= ncclNumOps)
|
||||
|
||||
ncclResult_t result = ArgsCheck(env.info);
|
||||
EXPECT_EQ(result, ncclInvalidArgument)
|
||||
<< "Failed for invalid reduction operation >= ncclNumOps";
|
||||
|
||||
env.cleanup();
|
||||
INFO("Test 'ArgsCheck_InvalidReductionOperation_ExceedsMaxValue' PASSED\n");
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
TEST_F(ArgCheckTest, ArgsCheck_InvalidCommunicatorPointers) {
|
||||
info->op = (ncclRedOp_t)0; // Valid reduction operation
|
||||
if (info->sendbuff) {
|
||||
hipFree((void *)info->sendbuff);
|
||||
info->sendbuff = nullptr; // Invalid send buffer
|
||||
}
|
||||
if (info->recvbuff) {
|
||||
hipFree((void *)info->recvbuff);
|
||||
info->recvbuff = nullptr; // Invalid receive buffer
|
||||
}
|
||||
TEST(ArgCheckTest, ArgsCheck_InvalidCommunicatorPointers)
|
||||
{
|
||||
RUN_ISOLATED_TEST(
|
||||
"ArgsCheck_InvalidCommunicatorPointers",
|
||||
[]()
|
||||
{
|
||||
ArgCheckTestEnvironment env;
|
||||
env.setup();
|
||||
|
||||
ncclResult_t result = ArgsCheck(info);
|
||||
EXPECT_EQ(result, ncclInvalidArgument)
|
||||
<< "Failed for invalid communicator pointers";
|
||||
env.info->op = (ncclRedOp_t)0; // Valid reduction operation
|
||||
if(env.info->sendbuff)
|
||||
{
|
||||
hipFree((void*)env.info->sendbuff);
|
||||
env.info->sendbuff = nullptr; // Invalid send buffer
|
||||
}
|
||||
if(env.info->recvbuff)
|
||||
{
|
||||
hipFree((void*)env.info->recvbuff);
|
||||
env.info->recvbuff = nullptr; // Invalid receive buffer
|
||||
}
|
||||
|
||||
ncclResult_t result = ArgsCheck(env.info);
|
||||
EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid communicator pointers";
|
||||
|
||||
env.cleanup();
|
||||
INFO("Test 'ArgsCheck_InvalidCommunicatorPointers' PASSED\n");
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
TEST_F(ArgCheckTest, ArgsCheck_InvalidReductionOperationOutOfRange) {
|
||||
info->op = (ncclRedOp_t)5; // Invalid reduction operation (out of range)
|
||||
TEST(ArgCheckTest, ArgsCheck_InvalidReductionOperationOutOfRange)
|
||||
{
|
||||
RUN_ISOLATED_TEST(
|
||||
"ArgsCheck_InvalidReductionOperationOutOfRange",
|
||||
[]()
|
||||
{
|
||||
ArgCheckTestEnvironment env;
|
||||
env.setup();
|
||||
|
||||
ncclResult_t result = ArgsCheck(info);
|
||||
EXPECT_EQ(result, ncclInvalidArgument)
|
||||
<< "Failed for invalid reduction operation";
|
||||
env.info->op = (ncclRedOp_t)5; // Invalid reduction operation (out of range)
|
||||
|
||||
ncclResult_t result = ArgsCheck(env.info);
|
||||
EXPECT_EQ(result, ncclInvalidArgument) << "Failed for invalid reduction operation";
|
||||
|
||||
env.cleanup();
|
||||
INFO("Test 'ArgsCheck_InvalidReductionOperationOutOfRange' PASSED\n");
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
TEST_F(ArgCheckTest, ArgsCheck_UserDefinedReductionOperationInvalid) {
|
||||
// Test case: User-defined reduction operation with freeNext != -1
|
||||
info->op = (ncclRedOp_t)(ncclNumOps +
|
||||
1); // Set op to a user-defined reduction operation
|
||||
TEST(ArgCheckTest, ArgsCheck_UserDefinedReductionOperationInvalid)
|
||||
{
|
||||
RUN_ISOLATED_TEST(
|
||||
"ArgsCheck_UserDefinedReductionOperationInvalid",
|
||||
[]()
|
||||
{
|
||||
ArgCheckTestEnvironment env;
|
||||
env.setup();
|
||||
|
||||
ncclResult_t result = ArgsCheck(info);
|
||||
EXPECT_EQ(result, ncclInvalidArgument)
|
||||
<< "Failed for user-defined reduction operation with freeNext != -1";
|
||||
// Test case: User-defined reduction operation with freeNext != -1
|
||||
env.info->op
|
||||
= (ncclRedOp_t)(ncclNumOps + 1); // Set op to a user-defined reduction operation
|
||||
|
||||
ncclResult_t result = ArgsCheck(env.info);
|
||||
EXPECT_EQ(result, ncclInvalidArgument)
|
||||
<< "Failed for user-defined reduction operation with freeNext != -1";
|
||||
|
||||
env.cleanup();
|
||||
INFO("Test 'ArgsCheck_UserDefinedReductionOperationInvalid' PASSED\n");
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
TEST_F(ArgCheckTest, ArgsCheck_SendAndRecvFunction) {
|
||||
info->recvbuff =
|
||||
recvDevicePtr; // Use allocated device pointer for receive buffer
|
||||
TEST(ArgCheckTest, ArgsCheck_SendAndRecvFunction)
|
||||
{
|
||||
RUN_ISOLATED_TEST(
|
||||
"ArgsCheck_SendAndRecvFunction",
|
||||
[]()
|
||||
{
|
||||
ArgCheckTestEnvironment env;
|
||||
env.setup();
|
||||
|
||||
// Test both ncclFuncSend and ncclFuncRecv
|
||||
for (auto coll : {ncclFuncSend, ncclFuncRecv}) {
|
||||
info->coll = coll; // Set the collective operation
|
||||
env.info->recvbuff
|
||||
= env.recvDevicePtr; // Use allocated device pointer for receive buffer
|
||||
|
||||
// Call ArgsCheck and verify the result
|
||||
ncclResult_t result = ArgsCheck(info);
|
||||
ASSERT_EQ(result, ncclSuccess) << "Failed for coll = " << coll;
|
||||
}
|
||||
// Test both ncclFuncSend and ncclFuncRecv
|
||||
for(auto coll : {ncclFuncSend, ncclFuncRecv})
|
||||
{
|
||||
env.info->coll = coll; // Set the collective operation
|
||||
|
||||
// Call ArgsCheck and verify the result
|
||||
ncclResult_t result = ArgsCheck(env.info);
|
||||
ASSERT_EQ(result, ncclSuccess) << "Failed for coll = " << coll;
|
||||
}
|
||||
|
||||
env.cleanup();
|
||||
INFO("Test 'ArgsCheck_SendAndRecvFunction' PASSED\n");
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
TEST_F(ArgCheckTest, ArgsCheck_CollNotReduce) {
|
||||
// Case: info->coll != ncclFuncReduce
|
||||
info->coll = ncclFuncBroadcast; // Set coll to ncclFuncBroadcast
|
||||
TEST(ArgCheckTest, ArgsCheck_CollNotReduce)
|
||||
{
|
||||
RUN_ISOLATED_TEST(
|
||||
"ArgsCheck_CollNotReduce",
|
||||
[]()
|
||||
{
|
||||
ArgCheckTestEnvironment env;
|
||||
env.setup();
|
||||
|
||||
ncclResult_t result = ArgsCheck(info);
|
||||
EXPECT_EQ(result, ncclSuccess) << "Failed for coll != ncclFuncReduce";
|
||||
// Case: env.info->coll != ncclFuncReduce
|
||||
env.info->coll = ncclFuncBroadcast; // Set coll to ncclFuncBroadcast
|
||||
|
||||
ncclResult_t result = ArgsCheck(env.info);
|
||||
EXPECT_EQ(result, ncclSuccess) << "Failed for coll != ncclFuncReduce";
|
||||
|
||||
env.cleanup();
|
||||
INFO("Test 'ArgsCheck_CollNotReduce' PASSED\n");
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
TEST_F(ArgCheckTest, ArgsCheck_ReduceCollWithRootRank) {
|
||||
// Case: info->coll == ncclFuncReduce and info->comm->rank == info->root
|
||||
info->coll = ncclFuncReduce; // Set coll to ncclFuncReduce
|
||||
TEST(ArgCheckTest, ArgsCheck_ReduceCollWithRootRank)
|
||||
{
|
||||
RUN_ISOLATED_TEST(
|
||||
"ArgsCheck_ReduceCollWithRootRank",
|
||||
[]()
|
||||
{
|
||||
ArgCheckTestEnvironment env;
|
||||
env.setup();
|
||||
|
||||
ncclResult_t result = ArgsCheck(info);
|
||||
EXPECT_EQ(result, ncclSuccess)
|
||||
<< "Failed for coll == ncclFuncReduce and rank == root";
|
||||
// Case: env.info->coll == ncclFuncReduce and env.info->env.comm->rank == env.info->root
|
||||
env.info->coll = ncclFuncReduce; // Set coll to ncclFuncReduce
|
||||
|
||||
ncclResult_t result = ArgsCheck(env.info);
|
||||
EXPECT_EQ(result, ncclSuccess) << "Failed for coll == ncclFuncReduce and rank == root";
|
||||
|
||||
env.cleanup();
|
||||
INFO("Test 'ArgsCheck_ReduceCollWithRootRank' PASSED\n");
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
TEST_F(ArgCheckTest, ArgsCheck_ReduceCollWithNonRootRank) {
|
||||
comm->rank = 1; // Set rank to 1 (non-root)
|
||||
TEST(ArgCheckTest, ArgsCheck_ReduceCollWithNonRootRank)
|
||||
{
|
||||
RUN_ISOLATED_TEST(
|
||||
"ArgsCheck_ReduceCollWithNonRootRank",
|
||||
[]()
|
||||
{
|
||||
ArgCheckTestEnvironment env;
|
||||
env.setup();
|
||||
|
||||
ncclResult_t result = ArgsCheck(info);
|
||||
EXPECT_EQ(result, ncclSuccess)
|
||||
<< "Failed for coll == ncclFuncReduce and rank != root";
|
||||
env.comm->rank = 1; // Set rank to 1 (non-root)
|
||||
|
||||
ncclResult_t result = ArgsCheck(env.info);
|
||||
EXPECT_EQ(result, ncclSuccess) << "Failed for coll == ncclFuncReduce and rank != root";
|
||||
|
||||
env.cleanup();
|
||||
INFO("Test 'ArgsCheck_ReduceCollWithNonRootRank' PASSED\n");
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
@@ -207,6 +207,7 @@ if(BUILD_TESTS)
|
||||
TransportTests.cpp
|
||||
common/main_fixtures.cpp
|
||||
common/EnvVars.cpp
|
||||
common/ProcessIsolatedTestRunner.cpp
|
||||
graph/XmlTests.cpp
|
||||
)
|
||||
|
||||
|
||||
@@ -4,27 +4,120 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include <gtest/gtest.h>
|
||||
#include <cstring>
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
#include <cstring>
|
||||
|
||||
#include "comm.h"
|
||||
#include "info.h"
|
||||
#include "common/ProcessIsolatedTestRunner.hpp"
|
||||
#include "enqueue.h"
|
||||
#include "info.h"
|
||||
#include "utils.h"
|
||||
|
||||
class EnqueueTests : public ::testing::Test {
|
||||
protected:
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
|
||||
// Simple test kernel for validating ncclInitKernelsForDevice
|
||||
__global__ void simpleTestKernel(int* data)
|
||||
{
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if(data)
|
||||
data[tid] = tid;
|
||||
}
|
||||
|
||||
// Helper function to test ncclInitKernelsForDevice with a real kernel
|
||||
ncclResult_t testKernelAttributes(void* kernelFn, size_t* maxStackSize)
|
||||
{
|
||||
if(!kernelFn || !maxStackSize)
|
||||
return ncclInvalidArgument;
|
||||
|
||||
*maxStackSize = 0;
|
||||
hipFuncAttributes attr = {0};
|
||||
|
||||
hipError_t errcode = hipFuncGetAttributes(&attr, kernelFn);
|
||||
if(errcode != hipSuccess)
|
||||
return ncclSystemError;
|
||||
|
||||
*maxStackSize = attr.localSizeBytes;
|
||||
return ncclSuccess; // ncclSuccess
|
||||
}
|
||||
|
||||
// Helper function to test shared memory limit checking with a real kernel
|
||||
// ncclMaxSharedMem: For gfx906 (cudaArch 906) with WarpSize 64, this is typically 32832 bytes
|
||||
ncclResult_t testKernelSharedMemoryLimit(
|
||||
void* kernelFn, int cudaArch, int maxSharedMem, size_t* maxStackSize, int ncclMaxSharedMem
|
||||
)
|
||||
{
|
||||
if(!kernelFn)
|
||||
return ncclInvalidArgument;
|
||||
|
||||
ncclResult_t result = ncclSuccess;
|
||||
if(maxStackSize)
|
||||
*maxStackSize = 0;
|
||||
|
||||
hipFuncAttributes attr = {0};
|
||||
hipError_t errcode = hipFuncGetAttributes(&attr, kernelFn);
|
||||
if(errcode != hipSuccess)
|
||||
{
|
||||
return ncclSystemError;
|
||||
}
|
||||
|
||||
if(maxStackSize)
|
||||
{
|
||||
*maxStackSize = attr.localSizeBytes;
|
||||
}
|
||||
|
||||
// Test the shared memory limit check (mimics enqueue.cc lines 135-146)
|
||||
if(ncclMaxSharedMem != 0)
|
||||
{
|
||||
int sharedMemSize = ncclMaxSharedMem;
|
||||
|
||||
if(sharedMemSize > (maxSharedMem - attr.sharedSizeBytes))
|
||||
{
|
||||
WARN(
|
||||
"cudaArch %d ncclMaxSharedMem %d exceeds device/fn maxSharedMem %zu",
|
||||
cudaArch,
|
||||
sharedMemSize,
|
||||
maxSharedMem - attr.sharedSizeBytes
|
||||
);
|
||||
return ncclSystemError;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// Helper structure to hold test environment
|
||||
struct EnqueueTestEnvironment
|
||||
{
|
||||
ncclComm* comm;
|
||||
ncclInfo* info;
|
||||
void* sendbuff;
|
||||
void* recvbuff;
|
||||
static uint32_t abortFlag0, abortFlag1;
|
||||
static int abortFlagRefCount;
|
||||
void* sendbuff;
|
||||
void* recvbuff;
|
||||
uint32_t abortFlag0;
|
||||
uint32_t abortFlag1;
|
||||
int abortFlagRefCount;
|
||||
|
||||
void SetUp() override {
|
||||
EnqueueTestEnvironment()
|
||||
: comm(nullptr)
|
||||
, info(nullptr)
|
||||
, sendbuff(nullptr)
|
||||
, recvbuff(nullptr)
|
||||
, abortFlag0(0)
|
||||
, abortFlag1(0)
|
||||
, abortFlagRefCount(0)
|
||||
{}
|
||||
|
||||
~EnqueueTestEnvironment()
|
||||
{
|
||||
cleanup();
|
||||
}
|
||||
|
||||
void setup()
|
||||
{
|
||||
// Allocate GPU memory for buffers
|
||||
size_t bufferSize = 1024 * sizeof(float);
|
||||
hipError_t hipErr = hipMalloc(&sendbuff, bufferSize);
|
||||
size_t bufferSize = 1024 * sizeof(float);
|
||||
hipError_t hipErr = hipMalloc(&sendbuff, bufferSize);
|
||||
ASSERT_EQ(hipErr, hipSuccess) << "Failed to allocate sendbuff";
|
||||
|
||||
hipErr = hipMalloc(&recvbuff, bufferSize);
|
||||
@@ -34,17 +127,17 @@ protected:
|
||||
comm = new ncclComm();
|
||||
memset(comm, 0, sizeof(ncclComm));
|
||||
|
||||
comm->startMagic = NCCL_MAGIC; // 0x0280028002800280
|
||||
comm->startMagic = NCCL_MAGIC; // 0x0280028002800280
|
||||
|
||||
// Initialize critical fields
|
||||
comm->rank = 0;
|
||||
comm->nRanks = 2;
|
||||
comm->cudaDev = 0;
|
||||
comm->rank = 0;
|
||||
comm->nRanks = 2;
|
||||
comm->cudaDev = 0;
|
||||
comm->localRank = 0;
|
||||
|
||||
// Initialize abort flags
|
||||
comm->abortFlag = &abortFlag0;
|
||||
comm->childAbortFlag = &abortFlag1;
|
||||
comm->abortFlag = &abortFlag0;
|
||||
comm->childAbortFlag = &abortFlag1;
|
||||
comm->abortFlagRefCount = &abortFlagRefCount;
|
||||
|
||||
// Initialize memory stack
|
||||
@@ -53,15 +146,15 @@ protected:
|
||||
|
||||
// Initialize intra-communication pointers
|
||||
comm->intraComm0 = nullptr;
|
||||
comm->intraNext = nullptr;
|
||||
comm->intraNext = nullptr;
|
||||
|
||||
// Initialize work FIFO structures
|
||||
comm->workFifoBytes = 1024; // Power of 2
|
||||
comm->workFifoBuf = nullptr;
|
||||
comm->workFifoBufDev = nullptr;
|
||||
comm->workFifoConsumed = 0;
|
||||
comm->workFifoBytes = 1024; // Power of 2
|
||||
comm->workFifoBuf = nullptr;
|
||||
comm->workFifoBufDev = nullptr;
|
||||
comm->workFifoConsumed = 0;
|
||||
comm->workFifoProducedLastRecorded = 0;
|
||||
comm->workFifoProduced = 0;
|
||||
comm->workFifoProduced = 0;
|
||||
|
||||
// Initialize planner
|
||||
memset(&comm->planner, 0, sizeof(comm->planner));
|
||||
@@ -69,254 +162,556 @@ protected:
|
||||
// Initialize config
|
||||
memset(&comm->config, 0, sizeof(comm->config));
|
||||
comm->config.blocking = 1;
|
||||
comm->checkPointers = 0; // Disable pointer validation for easier testing
|
||||
comm->checkPointers = 0; // Disable pointer validation for easier testing
|
||||
|
||||
// Initialize peer info arrays
|
||||
comm->peerInfo = new ncclPeerInfo[comm->nRanks];
|
||||
memset(comm->peerInfo, 0, comm->nRanks * sizeof(ncclPeerInfo));
|
||||
|
||||
comm->localRankToRank = new int[comm->nRanks];
|
||||
for (int i = 0; i < comm->nRanks; i++) {
|
||||
for(int i = 0; i < comm->nRanks; i++)
|
||||
{
|
||||
comm->localRankToRank[i] = i;
|
||||
}
|
||||
|
||||
comm->endMagic = NCCL_MAGIC; // 0x0280028002800280
|
||||
comm->endMagic = NCCL_MAGIC; // 0x0280028002800280
|
||||
|
||||
// Initialize operation info with valid GPU buffers
|
||||
info = new ncclInfo();
|
||||
memset(info, 0, sizeof(ncclInfo));
|
||||
info->comm = comm;
|
||||
info->opName = "AllReduce";
|
||||
info->count = 1024;
|
||||
info->comm = comm;
|
||||
info->opName = "AllReduce";
|
||||
info->count = 1024;
|
||||
info->datatype = ncclFloat;
|
||||
info->op = ncclSum;
|
||||
info->root = 0;
|
||||
info->sendbuff = sendbuff; // Use allocated GPU memory
|
||||
info->recvbuff = recvbuff; // Use allocated GPU memory
|
||||
info->stream = nullptr;
|
||||
info->op = ncclSum;
|
||||
info->root = 0;
|
||||
info->sendbuff = sendbuff; // Use allocated GPU memory
|
||||
info->recvbuff = recvbuff; // Use allocated GPU memory
|
||||
info->stream = nullptr;
|
||||
}
|
||||
|
||||
void TearDown() override {
|
||||
if (sendbuff) {
|
||||
hipFree(sendbuff);
|
||||
void cleanup()
|
||||
{
|
||||
// Clean up info first (it references comm)
|
||||
if(info)
|
||||
{
|
||||
delete info;
|
||||
info = nullptr;
|
||||
}
|
||||
if (recvbuff) {
|
||||
hipFree(recvbuff);
|
||||
}
|
||||
if (comm) {
|
||||
|
||||
// Clean up comm and its allocated resources
|
||||
if(comm)
|
||||
{
|
||||
// Clean up memory stacks
|
||||
ncclMemoryStackDestruct(&comm->memScoped);
|
||||
ncclMemoryStackDestruct(&comm->memPermanent);
|
||||
delete[] comm->peerInfo;
|
||||
delete[] comm->localRankToRank;
|
||||
|
||||
// Clean up peer info arrays
|
||||
if(comm->peerInfo)
|
||||
{
|
||||
delete[] comm->peerInfo;
|
||||
comm->peerInfo = nullptr;
|
||||
}
|
||||
|
||||
if(comm->localRankToRank)
|
||||
{
|
||||
delete[] comm->localRankToRank;
|
||||
comm->localRankToRank = nullptr;
|
||||
}
|
||||
|
||||
delete comm;
|
||||
comm = nullptr;
|
||||
}
|
||||
if (info) {
|
||||
delete info;
|
||||
|
||||
// Clean up GPU buffers last
|
||||
if(sendbuff)
|
||||
{
|
||||
hipError_t err = hipFree(sendbuff);
|
||||
if(err != hipSuccess)
|
||||
{
|
||||
// Log error but don't throw in cleanup
|
||||
fprintf(stderr, "Warning: hipFree(sendbuff) failed with error %d\n", err);
|
||||
}
|
||||
sendbuff = nullptr;
|
||||
}
|
||||
|
||||
if(recvbuff)
|
||||
{
|
||||
hipError_t err = hipFree(recvbuff);
|
||||
if(err != hipSuccess)
|
||||
{
|
||||
// Log error but don't throw in cleanup
|
||||
fprintf(stderr, "Warning: hipFree(recvbuff) failed with error %d\n", err);
|
||||
}
|
||||
recvbuff = nullptr;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Static member definitions
|
||||
uint32_t EnqueueTests::abortFlag0 = 0;
|
||||
uint32_t EnqueueTests::abortFlag1 = 0;
|
||||
int EnqueueTests::abortFlagRefCount = 0;
|
||||
// Empty test fixture for test organization
|
||||
class EnqueueTests : public ::testing::Test
|
||||
{
|
||||
// No setup/teardown - all tests use process isolation
|
||||
};
|
||||
|
||||
// Test ncclInitKernelsForDevice function
|
||||
TEST_F(EnqueueTests, ncclInitKernelsForDevice_ValidInput) {
|
||||
size_t maxStackSize = 0;
|
||||
ncclResult_t result = ncclInitKernelsForDevice(906, 65536, &maxStackSize);
|
||||
TEST_F(EnqueueTests, ncclInitKernelsForDevice_ValidInput)
|
||||
{
|
||||
ProcessIsolatedTestRunner::ExecutionOptions options;
|
||||
options.stopOnFirstFailure = false; // Continue running all tests
|
||||
options.verboseLogging = true;
|
||||
|
||||
EXPECT_TRUE(result == ncclSuccess);
|
||||
EXPECT_GT(maxStackSize, 0);
|
||||
RUN_ISOLATED_TESTS_WITH_OPTIONS(
|
||||
options,
|
||||
ProcessIsolatedTestRunner::TestConfig(
|
||||
"ncclInitKernelsForDevice_ValidInput",
|
||||
[this]()
|
||||
{
|
||||
size_t maxStackSize = 0;
|
||||
ncclResult_t result = ncclInitKernelsForDevice(906, 65536, &maxStackSize);
|
||||
|
||||
EXPECT_TRUE(result == ncclSuccess);
|
||||
// maxStackSize should be set to a reasonable value (> 0)
|
||||
EXPECT_GT(maxStackSize, 0)
|
||||
<< "Expected maxStackSize to be computed and set to a positive value";
|
||||
}
|
||||
).withEnvironment({{"NCCL_DEBUG", "INFO"}, {"NCCL_DEBUG_SUBSYS", "ALL"}}),
|
||||
|
||||
ProcessIsolatedTestRunner::TestConfig(
|
||||
"ncclInitKernelsForDevice_ValidInputCarveout",
|
||||
[this]()
|
||||
{
|
||||
size_t maxStackSize = 0;
|
||||
ncclResult_t result = ncclInitKernelsForDevice(906, 65536, &maxStackSize);
|
||||
|
||||
EXPECT_TRUE(result == ncclSuccess);
|
||||
// maxStackSize should be set to a reasonable value (> 0)
|
||||
EXPECT_GT(maxStackSize, 0)
|
||||
<< "Expected maxStackSize to be computed and set to a positive value";
|
||||
}
|
||||
)
|
||||
.withEnvironment(
|
||||
{{"NCCL_L1_SHARED_MEMORY_CARVEOUT", "1"},
|
||||
{"NCCL_DEBUG", "INFO"},
|
||||
{"NCCL_DEBUG_SUBSYS", "ALL"}}
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
TEST_F(EnqueueTests, ncclInitKernelsForDevice_NullStackSize) {
|
||||
ncclResult_t result = ncclInitKernelsForDevice(906, 65536, nullptr);
|
||||
TEST_F(EnqueueTests, ncclInitKernelsForDevice_NullStackSize)
|
||||
{
|
||||
ProcessIsolatedTestRunner::ExecutionOptions options;
|
||||
options.stopOnFirstFailure = false;
|
||||
options.verboseLogging = true;
|
||||
|
||||
EXPECT_EQ(result, ncclSuccess);
|
||||
RUN_ISOLATED_TESTS_WITH_OPTIONS(
|
||||
options,
|
||||
ProcessIsolatedTestRunner::TestConfig(
|
||||
"ncclInitKernelsForDevice_NullStackSize",
|
||||
[]()
|
||||
{
|
||||
ncclResult_t result = ncclInitKernelsForDevice(906, 65536, nullptr);
|
||||
EXPECT_EQ(result, ncclSuccess);
|
||||
}
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
TEST_F(EnqueueTests, ncclInitKernelsForDevice_InvalidArch) {
|
||||
size_t maxStackSize = 0;
|
||||
ncclResult_t result = ncclInitKernelsForDevice(-1, 65536, &maxStackSize);
|
||||
EXPECT_EQ(result, ncclSuccess);
|
||||
// Test with a real compiled kernel to verify attribute retrieval works correctly
|
||||
TEST_F(EnqueueTests, KernelAttributes_WithRealKernel)
|
||||
{
|
||||
ProcessIsolatedTestRunner::ExecutionOptions options;
|
||||
options.stopOnFirstFailure = false;
|
||||
options.verboseLogging = true;
|
||||
|
||||
RUN_ISOLATED_TESTS_WITH_OPTIONS(
|
||||
options,
|
||||
ProcessIsolatedTestRunner::TestConfig(
|
||||
"KernelAttributes_WithRealKernel",
|
||||
[]()
|
||||
{
|
||||
size_t maxStackSize = 0;
|
||||
ncclResult_t result = testKernelAttributes((void*)simpleTestKernel, &maxStackSize);
|
||||
|
||||
EXPECT_EQ(result, ncclSuccess)
|
||||
<< "Expected successful kernel attribute retrieval with a real compiled kernel";
|
||||
}
|
||||
).withEnvironment({{"NCCL_DEBUG", "INFO"}})
|
||||
);
|
||||
}
|
||||
|
||||
TEST_F(EnqueueTests, ncclInitKernelsForDevice_ExceedsSharedMemory) {
|
||||
size_t maxStackSize = 0;
|
||||
TEST_F(EnqueueTests, ncclInitKernelsForDevice_InvalidArch)
|
||||
{
|
||||
ProcessIsolatedTestRunner::ExecutionOptions options;
|
||||
options.stopOnFirstFailure = false;
|
||||
options.verboseLogging = true;
|
||||
|
||||
ncclResult_t result = ncclInitKernelsForDevice(906, 32832, &maxStackSize);
|
||||
EXPECT_TRUE(result == ncclSystemError);
|
||||
RUN_ISOLATED_TESTS_WITH_OPTIONS(
|
||||
options,
|
||||
ProcessIsolatedTestRunner::TestConfig(
|
||||
"ncclInitKernelsForDevice_InvalidArch",
|
||||
[]()
|
||||
{
|
||||
size_t maxStackSize = 0;
|
||||
ncclResult_t result = ncclInitKernelsForDevice(-1, 65536, &maxStackSize);
|
||||
EXPECT_EQ(result, ncclSuccess);
|
||||
}
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
TEST_F(EnqueueTests, ncclInitKernelsForDevice_ExceedsSharedMemory)
|
||||
{
|
||||
ProcessIsolatedTestRunner::ExecutionOptions options;
|
||||
options.stopOnFirstFailure = false;
|
||||
options.verboseLogging = true;
|
||||
|
||||
RUN_ISOLATED_TESTS_WITH_OPTIONS(
|
||||
options,
|
||||
ProcessIsolatedTestRunner::TestConfig(
|
||||
"ncclInitKernelsForDevice_ExceedsSharedMemory",
|
||||
[]()
|
||||
{
|
||||
size_t maxStackSize = 0;
|
||||
// For gfx906, ncclMaxSharedMem is 32832 (as shown in test output)
|
||||
// Use a very small maxSharedMem (16000 bytes) to trigger the exceeds check
|
||||
ncclResult_t result = testKernelSharedMemoryLimit(
|
||||
(void*)simpleTestKernel, // Use our real compiled kernel
|
||||
906, // cudaArch
|
||||
16000, // maxSharedMem (intentionally too small)
|
||||
&maxStackSize,
|
||||
32832 // ncclMaxSharedMem for gfx906
|
||||
);
|
||||
|
||||
EXPECT_EQ(result, ncclSystemError)
|
||||
<< "Expected ncclSystemError when ncclMaxSharedMem exceeds maxSharedMem";
|
||||
}
|
||||
).withEnvironment({{"NCCL_DEBUG", "WARN"}})
|
||||
);
|
||||
}
|
||||
|
||||
// Test ncclEnqueueCheck function
|
||||
TEST_F(EnqueueTests, ncclEnqueueCheck_ValidInput) {
|
||||
ncclResult_t result = ncclEnqueueCheck(info);
|
||||
EXPECT_TRUE(result == ncclSuccess);
|
||||
TEST_F(EnqueueTests, ncclEnqueueCheck_ValidInput)
|
||||
{
|
||||
ProcessIsolatedTestRunner::ExecutionOptions options;
|
||||
options.stopOnFirstFailure = false;
|
||||
options.verboseLogging = true;
|
||||
|
||||
RUN_ISOLATED_TESTS_WITH_OPTIONS(
|
||||
options,
|
||||
ProcessIsolatedTestRunner::TestConfig(
|
||||
"ncclEnqueueCheck_ValidInput",
|
||||
[]()
|
||||
{
|
||||
EnqueueTestEnvironment env;
|
||||
env.setup();
|
||||
ncclResult_t result = ncclEnqueueCheck(env.info);
|
||||
EXPECT_TRUE(result == ncclSuccess);
|
||||
env.cleanup();
|
||||
}
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
TEST_F(EnqueueTests, ncclEnqueueCheck_InvalidComm) {
|
||||
info->comm = nullptr;
|
||||
ncclResult_t result = ncclEnqueueCheck(info);
|
||||
EXPECT_EQ(result, ncclInvalidArgument);
|
||||
TEST_F(EnqueueTests, ncclEnqueueCheck_InvalidComm)
|
||||
{
|
||||
ProcessIsolatedTestRunner::ExecutionOptions options;
|
||||
options.stopOnFirstFailure = false;
|
||||
options.verboseLogging = true;
|
||||
|
||||
RUN_ISOLATED_TESTS_WITH_OPTIONS(
|
||||
options,
|
||||
ProcessIsolatedTestRunner::TestConfig(
|
||||
"ncclEnqueueCheck_InvalidComm",
|
||||
[]()
|
||||
{
|
||||
EnqueueTestEnvironment env;
|
||||
env.setup();
|
||||
env.info->comm = nullptr;
|
||||
ncclResult_t result = ncclEnqueueCheck(env.info);
|
||||
EXPECT_EQ(result, ncclInvalidArgument);
|
||||
env.cleanup();
|
||||
}
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
TEST_F(EnqueueTests, ncclEnqueueCheck_InvalidBuffers) {
|
||||
// Test with null sendbuff
|
||||
comm->checkPointers = 1;
|
||||
info->sendbuff = nullptr;
|
||||
ncclResult_t result = ncclEnqueueCheck(info);
|
||||
EXPECT_EQ(result, ncclInvalidArgument);
|
||||
TEST_F(EnqueueTests, ncclEnqueueCheck_InvalidBuffers)
|
||||
{
|
||||
ProcessIsolatedTestRunner::ExecutionOptions options;
|
||||
options.stopOnFirstFailure = false;
|
||||
options.verboseLogging = true;
|
||||
|
||||
// Reset sendbuff and test with null recvbuff
|
||||
info->sendbuff = sendbuff;
|
||||
info->recvbuff = nullptr;
|
||||
result = ncclEnqueueCheck(info);
|
||||
EXPECT_EQ(result, ncclInvalidArgument);
|
||||
RUN_ISOLATED_TESTS_WITH_OPTIONS(
|
||||
options,
|
||||
ProcessIsolatedTestRunner::TestConfig(
|
||||
"ncclEnqueueCheck_InvalidBuffers",
|
||||
[]()
|
||||
{
|
||||
EnqueueTestEnvironment env;
|
||||
env.setup();
|
||||
|
||||
// Test with null sendbuff
|
||||
env.comm->checkPointers = 1;
|
||||
env.info->sendbuff = nullptr;
|
||||
ncclResult_t result = ncclEnqueueCheck(env.info);
|
||||
EXPECT_EQ(result, ncclInvalidArgument);
|
||||
|
||||
// Reset sendbuff and test with null recvbuff
|
||||
env.info->sendbuff = env.sendbuff;
|
||||
env.info->recvbuff = nullptr;
|
||||
result = ncclEnqueueCheck(env.info);
|
||||
EXPECT_EQ(result, ncclInvalidArgument);
|
||||
|
||||
env.cleanup();
|
||||
}
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
// Test ncclFuncSendCount function
|
||||
TEST_F(EnqueueTests, ncclFuncSendCount_AllReduce) {
|
||||
size_t count = 1000;
|
||||
int nRanks = 4;
|
||||
TEST_F(EnqueueTests, ncclFuncSendCount_AllTests)
|
||||
{
|
||||
ProcessIsolatedTestRunner::ExecutionOptions options;
|
||||
options.stopOnFirstFailure = false;
|
||||
options.verboseLogging = true;
|
||||
|
||||
size_t result = ncclFuncSendCount(ncclFuncAllReduce, nRanks, count);
|
||||
EXPECT_EQ(result, count);
|
||||
}
|
||||
RUN_ISOLATED_TESTS_WITH_OPTIONS(
|
||||
options,
|
||||
ProcessIsolatedTestRunner::TestConfig(
|
||||
"ncclFuncSendCount_AllReduce",
|
||||
[]()
|
||||
{
|
||||
size_t count = 1000;
|
||||
int nRanks = 4;
|
||||
size_t result = ncclFuncSendCount(ncclFuncAllReduce, nRanks, count);
|
||||
EXPECT_EQ(result, count);
|
||||
}
|
||||
),
|
||||
|
||||
TEST_F(EnqueueTests, ncclFuncSendCount_Broadcast) {
|
||||
size_t count = 1000;
|
||||
int nRanks = 4;
|
||||
ProcessIsolatedTestRunner::TestConfig(
|
||||
"ncclFuncSendCount_Broadcast",
|
||||
[]()
|
||||
{
|
||||
size_t count = 1000;
|
||||
int nRanks = 4;
|
||||
size_t result = ncclFuncSendCount(ncclFuncBroadcast, nRanks, count);
|
||||
EXPECT_EQ(result, count);
|
||||
}
|
||||
),
|
||||
|
||||
size_t result = ncclFuncSendCount(ncclFuncBroadcast, nRanks, count);
|
||||
EXPECT_EQ(result, count);
|
||||
}
|
||||
ProcessIsolatedTestRunner::TestConfig(
|
||||
"ncclFuncSendCount_Reduce",
|
||||
[]()
|
||||
{
|
||||
size_t count = 1000;
|
||||
int nRanks = 4;
|
||||
size_t result = ncclFuncSendCount(ncclFuncReduce, nRanks, count);
|
||||
EXPECT_EQ(result, count);
|
||||
}
|
||||
),
|
||||
|
||||
TEST_F(EnqueueTests, ncclFuncSendCount_Reduce) {
|
||||
size_t count = 1000;
|
||||
int nRanks = 4;
|
||||
ProcessIsolatedTestRunner::TestConfig(
|
||||
"ncclFuncSendCount_AllGather",
|
||||
[]()
|
||||
{
|
||||
size_t count = 1000;
|
||||
int nRanks = 4;
|
||||
size_t result = ncclFuncSendCount(ncclFuncAllGather, nRanks, count);
|
||||
EXPECT_EQ(result, count);
|
||||
}
|
||||
),
|
||||
|
||||
size_t result = ncclFuncSendCount(ncclFuncReduce, nRanks, count);
|
||||
EXPECT_EQ(result, count);
|
||||
}
|
||||
ProcessIsolatedTestRunner::TestConfig(
|
||||
"ncclFuncSendCount_ReduceScatter",
|
||||
[]()
|
||||
{
|
||||
size_t count = 1000;
|
||||
int nRanks = 4;
|
||||
size_t result = ncclFuncSendCount(ncclFuncReduceScatter, nRanks, count);
|
||||
EXPECT_EQ(result, count * nRanks);
|
||||
}
|
||||
),
|
||||
|
||||
TEST_F(EnqueueTests, ncclFuncSendCount_AllGather) {
|
||||
size_t count = 1000;
|
||||
int nRanks = 4;
|
||||
|
||||
size_t result = ncclFuncSendCount(ncclFuncAllGather, nRanks, count);
|
||||
EXPECT_EQ(result, count);
|
||||
}
|
||||
|
||||
TEST_F(EnqueueTests, ncclFuncSendCount_ReduceScatter) {
|
||||
size_t count = 1000;
|
||||
int nRanks = 4;
|
||||
|
||||
size_t result = ncclFuncSendCount(ncclFuncReduceScatter, nRanks, count);
|
||||
EXPECT_EQ(result, count * nRanks);
|
||||
}
|
||||
|
||||
TEST_F(EnqueueTests, ncclFuncSendCount_ZeroCount) {
|
||||
size_t result = ncclFuncSendCount(ncclFuncAllReduce, 4, 0);
|
||||
EXPECT_EQ(result, 0);
|
||||
ProcessIsolatedTestRunner::TestConfig(
|
||||
"ncclFuncSendCount_ZeroCount",
|
||||
[]()
|
||||
{
|
||||
size_t result = ncclFuncSendCount(ncclFuncAllReduce, 4, 0);
|
||||
EXPECT_EQ(result, 0);
|
||||
}
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
// Test ncclFuncRecvCount function
|
||||
TEST_F(EnqueueTests, ncclFuncRecvCount_AllReduce) {
|
||||
size_t count = 1000;
|
||||
int nRanks = 4;
|
||||
TEST_F(EnqueueTests, ncclFuncRecvCount_AllTests)
|
||||
{
|
||||
ProcessIsolatedTestRunner::ExecutionOptions options;
|
||||
options.stopOnFirstFailure = false;
|
||||
options.verboseLogging = true;
|
||||
|
||||
size_t result = ncclFuncRecvCount(ncclFuncAllReduce, nRanks, count);
|
||||
EXPECT_EQ(result, count);
|
||||
}
|
||||
RUN_ISOLATED_TESTS_WITH_OPTIONS(
|
||||
options,
|
||||
ProcessIsolatedTestRunner::TestConfig(
|
||||
"ncclFuncRecvCount_AllReduce",
|
||||
[]()
|
||||
{
|
||||
size_t count = 1000;
|
||||
int nRanks = 4;
|
||||
size_t result = ncclFuncRecvCount(ncclFuncAllReduce, nRanks, count);
|
||||
EXPECT_EQ(result, count);
|
||||
}
|
||||
),
|
||||
|
||||
TEST_F(EnqueueTests, ncclFuncRecvCount_Broadcast) {
|
||||
size_t count = 1000;
|
||||
int nRanks = 4;
|
||||
ProcessIsolatedTestRunner::TestConfig(
|
||||
"ncclFuncRecvCount_Broadcast",
|
||||
[]()
|
||||
{
|
||||
size_t count = 1000;
|
||||
int nRanks = 4;
|
||||
size_t result = ncclFuncRecvCount(ncclFuncBroadcast, nRanks, count);
|
||||
EXPECT_EQ(result, count);
|
||||
}
|
||||
),
|
||||
|
||||
size_t result = ncclFuncRecvCount(ncclFuncBroadcast, nRanks, count);
|
||||
EXPECT_EQ(result, count);
|
||||
}
|
||||
ProcessIsolatedTestRunner::TestConfig(
|
||||
"ncclFuncRecvCount_Reduce",
|
||||
[]()
|
||||
{
|
||||
size_t count = 1000;
|
||||
int nRanks = 4;
|
||||
size_t result = ncclFuncRecvCount(ncclFuncReduce, nRanks, count);
|
||||
EXPECT_EQ(result, count);
|
||||
}
|
||||
),
|
||||
|
||||
TEST_F(EnqueueTests, ncclFuncRecvCount_Reduce) {
|
||||
size_t count = 1000;
|
||||
int nRanks = 4;
|
||||
ProcessIsolatedTestRunner::TestConfig(
|
||||
"ncclFuncRecvCount_AllGather",
|
||||
[]()
|
||||
{
|
||||
size_t count = 1000;
|
||||
int nRanks = 4;
|
||||
size_t result = ncclFuncRecvCount(ncclFuncAllGather, nRanks, count);
|
||||
EXPECT_EQ(result, count * nRanks);
|
||||
}
|
||||
),
|
||||
|
||||
size_t result = ncclFuncRecvCount(ncclFuncReduce, nRanks, count);
|
||||
EXPECT_EQ(result, count);
|
||||
}
|
||||
ProcessIsolatedTestRunner::TestConfig(
|
||||
"ncclFuncRecvCount_ReduceScatter",
|
||||
[]()
|
||||
{
|
||||
size_t count = 1000;
|
||||
int nRanks = 4;
|
||||
size_t result = ncclFuncRecvCount(ncclFuncReduceScatter, nRanks, count);
|
||||
EXPECT_EQ(result, count);
|
||||
}
|
||||
),
|
||||
|
||||
TEST_F(EnqueueTests, ncclFuncRecvCount_AllGather) {
|
||||
size_t count = 1000;
|
||||
int nRanks = 4;
|
||||
|
||||
size_t result = ncclFuncRecvCount(ncclFuncAllGather, nRanks, count);
|
||||
EXPECT_EQ(result, count * nRanks);
|
||||
}
|
||||
|
||||
TEST_F(EnqueueTests, ncclFuncRecvCount_ReduceScatter) {
|
||||
size_t count = 1000;
|
||||
int nRanks = 4;
|
||||
|
||||
size_t result = ncclFuncRecvCount(ncclFuncReduceScatter, nRanks, count);
|
||||
EXPECT_EQ(result, count);
|
||||
}
|
||||
|
||||
TEST_F(EnqueueTests, ncclFuncRecvCount_ZeroCount) {
|
||||
size_t result = ncclFuncRecvCount(ncclFuncAllReduce, 4, 0);
|
||||
EXPECT_EQ(result, 0);
|
||||
ProcessIsolatedTestRunner::TestConfig(
|
||||
"ncclFuncRecvCount_ZeroCount",
|
||||
[]()
|
||||
{
|
||||
size_t result = ncclFuncRecvCount(ncclFuncAllReduce, 4, 0);
|
||||
EXPECT_EQ(result, 0);
|
||||
}
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
// Test ncclFuncMaxSendRecvCount function
|
||||
TEST_F(EnqueueTests, ncclFuncMaxSendRecvCount_AllReduce) {
|
||||
size_t count = 1000;
|
||||
int nRanks = 4;
|
||||
TEST_F(EnqueueTests, ncclFuncMaxSendRecvCount_AllTests)
|
||||
{
|
||||
ProcessIsolatedTestRunner::ExecutionOptions options;
|
||||
options.stopOnFirstFailure = false;
|
||||
options.verboseLogging = true;
|
||||
|
||||
size_t result = ncclFuncMaxSendRecvCount(ncclFuncAllReduce, nRanks, count);
|
||||
EXPECT_EQ(result, count);
|
||||
}
|
||||
RUN_ISOLATED_TESTS_WITH_OPTIONS(
|
||||
options,
|
||||
ProcessIsolatedTestRunner::TestConfig(
|
||||
"ncclFuncMaxSendRecvCount_AllReduce",
|
||||
[]()
|
||||
{
|
||||
size_t count = 1000;
|
||||
int nRanks = 4;
|
||||
size_t result = ncclFuncMaxSendRecvCount(ncclFuncAllReduce, nRanks, count);
|
||||
EXPECT_EQ(result, count);
|
||||
}
|
||||
),
|
||||
|
||||
TEST_F(EnqueueTests, ncclFuncMaxSendRecvCount_AllGather) {
|
||||
size_t count = 1000;
|
||||
int nRanks = 4;
|
||||
ProcessIsolatedTestRunner::TestConfig(
|
||||
"ncclFuncMaxSendRecvCount_AllGather",
|
||||
[]()
|
||||
{
|
||||
size_t count = 1000;
|
||||
int nRanks = 4;
|
||||
size_t result = ncclFuncMaxSendRecvCount(ncclFuncAllGather, nRanks, count);
|
||||
// For AllGather, receive count (count * nRanks) is larger than send count (count)
|
||||
EXPECT_EQ(result, count * nRanks);
|
||||
}
|
||||
),
|
||||
|
||||
size_t result = ncclFuncMaxSendRecvCount(ncclFuncAllGather, nRanks, count);
|
||||
// For AllGather, receive count (count * nRanks) is larger than send count (count)
|
||||
EXPECT_EQ(result, count * nRanks);
|
||||
}
|
||||
ProcessIsolatedTestRunner::TestConfig(
|
||||
"ncclFuncMaxSendRecvCount_ReduceScatter",
|
||||
[]()
|
||||
{
|
||||
size_t count = 1000;
|
||||
int nRanks = 4;
|
||||
size_t result = ncclFuncMaxSendRecvCount(ncclFuncReduceScatter, nRanks, count);
|
||||
// For ReduceScatter, send count (count) is larger than receive count (count/nRanks)
|
||||
EXPECT_EQ(result, count * nRanks);
|
||||
}
|
||||
),
|
||||
|
||||
TEST_F(EnqueueTests, ncclFuncMaxSendRecvCount_ReduceScatter) {
|
||||
size_t count = 1000;
|
||||
int nRanks = 4;
|
||||
|
||||
size_t result = ncclFuncMaxSendRecvCount(ncclFuncReduceScatter, nRanks, count);
|
||||
// For ReduceScatter, send count (count) is larger than receive count (count/nRanks)
|
||||
EXPECT_EQ(result, count * nRanks);
|
||||
}
|
||||
|
||||
TEST_F(EnqueueTests, ncclFuncMaxSendRecvCount_ZeroCount) {
|
||||
size_t result = ncclFuncMaxSendRecvCount(ncclFuncAllReduce, 4, 0);
|
||||
EXPECT_EQ(result, 0);
|
||||
ProcessIsolatedTestRunner::TestConfig(
|
||||
"ncclFuncMaxSendRecvCount_ZeroCount",
|
||||
[]()
|
||||
{
|
||||
size_t result = ncclFuncMaxSendRecvCount(ncclFuncAllReduce, 4, 0);
|
||||
EXPECT_EQ(result, 0);
|
||||
}
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
// Edge case tests
|
||||
TEST_F(EnqueueTests, ncclFuncCounts_SingleRank) {
|
||||
size_t count = 1000;
|
||||
int nRanks = 1;
|
||||
TEST_F(EnqueueTests, ncclFuncCounts_EdgeCases)
|
||||
{
|
||||
ProcessIsolatedTestRunner::ExecutionOptions options;
|
||||
options.stopOnFirstFailure = false;
|
||||
options.verboseLogging = true;
|
||||
|
||||
// Test with single rank
|
||||
EXPECT_EQ(ncclFuncSendCount(ncclFuncAllReduce, nRanks, count), count);
|
||||
EXPECT_EQ(ncclFuncRecvCount(ncclFuncAllReduce, nRanks, count), count);
|
||||
EXPECT_EQ(ncclFuncMaxSendRecvCount(ncclFuncAllReduce, nRanks, count), count);
|
||||
RUN_ISOLATED_TESTS_WITH_OPTIONS(
|
||||
options,
|
||||
ProcessIsolatedTestRunner::TestConfig(
|
||||
"ncclFuncCounts_SingleRank",
|
||||
[]()
|
||||
{
|
||||
size_t count = 1000;
|
||||
int nRanks = 1;
|
||||
// Test with single rank
|
||||
EXPECT_EQ(ncclFuncSendCount(ncclFuncAllReduce, nRanks, count), count);
|
||||
EXPECT_EQ(ncclFuncRecvCount(ncclFuncAllReduce, nRanks, count), count);
|
||||
EXPECT_EQ(ncclFuncMaxSendRecvCount(ncclFuncAllReduce, nRanks, count), count);
|
||||
}
|
||||
),
|
||||
|
||||
ProcessIsolatedTestRunner::TestConfig(
|
||||
"ncclFuncCounts_LargeRankCount",
|
||||
[]()
|
||||
{
|
||||
size_t count = 1000;
|
||||
int nRanks = 1024;
|
||||
// Test with large number of ranks
|
||||
EXPECT_EQ(ncclFuncSendCount(ncclFuncAllGather, nRanks, count), count);
|
||||
EXPECT_EQ(ncclFuncRecvCount(ncclFuncAllGather, nRanks, count), count * nRanks);
|
||||
EXPECT_EQ(
|
||||
ncclFuncMaxSendRecvCount(ncclFuncAllGather, nRanks, count),
|
||||
count * nRanks
|
||||
);
|
||||
}
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
TEST_F(EnqueueTests, ncclFuncCounts_LargeRankCount) {
|
||||
size_t count = 1000;
|
||||
int nRanks = 1024;
|
||||
|
||||
// Test with large number of ranks
|
||||
EXPECT_EQ(ncclFuncSendCount(ncclFuncAllGather, nRanks, count), count);
|
||||
EXPECT_EQ(ncclFuncRecvCount(ncclFuncAllGather, nRanks, count), count * nRanks);
|
||||
EXPECT_EQ(ncclFuncMaxSendRecvCount(ncclFuncAllGather, nRanks, count), count * nRanks);
|
||||
}
|
||||
} // namespace RcclUnitTesting
|
||||
@@ -3,8 +3,8 @@
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "net.h"
|
||||
#include "common/ProcessIsolatedTestRunner.hpp"
|
||||
#include "gtest/gtest.h"
|
||||
#include <atomic>
|
||||
#include <cstring>
|
||||
@@ -612,6 +612,169 @@ protected:
|
||||
return static_cast<int>(result);
|
||||
}
|
||||
|
||||
void RunConcurrentOperationsTaskCreationWithEnvVars() {
|
||||
INFO(NCCL_LOG_INFO, "Checking socket configuration environment variables");
|
||||
|
||||
// Check if the required environment variables are set
|
||||
const char *nThreadsEnv = getenv("NCCL_SOCKET_NTHREADS");
|
||||
const char *nSocksPerThreadEnv = getenv("NCCL_NSOCKS_PERTHREAD");
|
||||
|
||||
if (!nThreadsEnv || !nSocksPerThreadEnv) {
|
||||
GTEST_SKIP() << "SKIPPING TEST: Required environment variables not set. "
|
||||
<< "Please set the following environment variables to run this test: "
|
||||
<< "export NCCL_SOCKET_NTHREADS=1 and export NCCL_NSOCKS_PERTHREAD=2. "
|
||||
<< "This ensures nSocks > 0 so that ncclNetSocketGetTask gets called. "
|
||||
<< "Environment variables NCCL_SOCKET_NTHREADS and NCCL_NSOCKS_PERTHREAD must be set";
|
||||
return;
|
||||
}
|
||||
|
||||
int nThreads = ParseEnvVar(nThreadsEnv, "NCCL_SOCKET_NTHREADS", 0, 1);
|
||||
int nSocksPerThread = ParseEnvVar(nSocksPerThreadEnv, "NCCL_NSOCKS_PERTHREAD", 0, 1);
|
||||
|
||||
// Additional validation for reasonable upper bounds
|
||||
const int MAX_THREADS = 16;
|
||||
const int MAX_SOCKS_PER_THREAD = 64;
|
||||
const int MAX_TOTAL_SOCKETS = 64;
|
||||
|
||||
if (nThreads > MAX_THREADS) {
|
||||
GTEST_SKIP() << "SKIPPING TEST: NCCL_SOCKET_NTHREADS=" << nThreads << " exceeds maximum " << MAX_THREADS << ". "
|
||||
<< "Please provide a reasonable value (e.g., NCCL_SOCKET_NTHREADS=8). "
|
||||
<< "Values too large may cause resource exhaustion.";
|
||||
return;
|
||||
}
|
||||
|
||||
if (nSocksPerThread > MAX_SOCKS_PER_THREAD) {
|
||||
GTEST_SKIP() << "SKIPPING TEST: NCCL_NSOCKS_PERTHREAD=" << nSocksPerThread << " exceeds maximum " << MAX_SOCKS_PER_THREAD << ". "
|
||||
<< "Please provide a reasonable value (e.g., NCCL_NSOCKS_PERTHREAD=4). "
|
||||
<< "Values too large may cause resource exhaustion.";
|
||||
return;
|
||||
}
|
||||
|
||||
// Check for potential overflow before multiplication
|
||||
if (nThreads > 0 && nSocksPerThread > INT_MAX / nThreads) {
|
||||
GTEST_SKIP() << "SKIPPING TEST: Configuration would cause integer overflow. "
|
||||
<< "NCCL_SOCKET_NTHREADS=" << nThreads << " * NCCL_NSOCKS_PERTHREAD=" << nSocksPerThread
|
||||
<< " exceeds maximum integer value. Please use smaller values.";
|
||||
return;
|
||||
}
|
||||
|
||||
int totalSockets = nThreads * nSocksPerThread;
|
||||
|
||||
INFO(NCCL_LOG_INFO, "Environment configuration found:");
|
||||
INFO(NCCL_LOG_INFO, " NCCL_SOCKET_NTHREADS=%d", nThreads);
|
||||
INFO(NCCL_LOG_INFO, " NCCL_NSOCKS_PERTHREAD=%d", nSocksPerThread);
|
||||
INFO(NCCL_LOG_INFO, " Total sockets=%d", totalSockets);
|
||||
|
||||
// Validate total sockets count
|
||||
if (totalSockets <= 0) {
|
||||
GTEST_SKIP() << "SKIPPING TEST: Invalid configuration - total sockets must be > 0. "
|
||||
<< "Current configuration: nThreads=" << nThreads << " * nSocksPerThread=" << nSocksPerThread
|
||||
<< " = " << totalSockets << ". "
|
||||
<< "Both NCCL_SOCKET_NTHREADS and NCCL_NSOCKS_PERTHREAD must be positive integers. "
|
||||
<< "Example: export NCCL_SOCKET_NTHREADS=2 && export NCCL_NSOCKS_PERTHREAD=2";
|
||||
return;
|
||||
}
|
||||
|
||||
if (totalSockets > MAX_TOTAL_SOCKETS) {
|
||||
GTEST_SKIP() << "SKIPPING TEST: Total sockets " << totalSockets << " exceeds maximum " << MAX_TOTAL_SOCKETS << ". "
|
||||
<< "Current configuration: nThreads=" << nThreads << " * nSocksPerThread=" << nSocksPerThread
|
||||
<< " = " << totalSockets << ". "
|
||||
<< "Please reduce either NCCL_SOCKET_NTHREADS or NCCL_NSOCKS_PERTHREAD. "
|
||||
<< "Example: export NCCL_SOCKET_NTHREADS=8 && export NCCL_NSOCKS_PERTHREAD=4";
|
||||
return;
|
||||
}
|
||||
|
||||
if (totalSockets > NCCL_NET_MAX_REQUESTS) {
|
||||
GTEST_SKIP() << "SKIPPING TEST: Total sockets " << totalSockets << " exceeds NCCL_NET_MAX_REQUESTS=" << NCCL_NET_MAX_REQUESTS << ". "
|
||||
<< "Current configuration: nThreads=" << nThreads << " * nSocksPerThread=" << nSocksPerThread
|
||||
<< " = " << totalSockets << ". "
|
||||
<< "NCCL network layer can handle at most " << NCCL_NET_MAX_REQUESTS << " concurrent requests. "
|
||||
<< "Please reduce configuration to stay within NCCL limits.";
|
||||
return;
|
||||
}
|
||||
|
||||
INFO(NCCL_LOG_INFO, "Configuration valid - proceeding with test to exercise "
|
||||
"ncclNetSocketGetTask");
|
||||
|
||||
// Test socket properties
|
||||
TestSocketProperties();
|
||||
|
||||
char handle[NCCL_NET_HANDLE_MAXSIZE];
|
||||
void *listenComm = nullptr;
|
||||
|
||||
ncclResult_t result = ncclNetSocket.listen(0, handle, &listenComm);
|
||||
ASSERT_EQ(result, ncclSuccess) << "Failed to establish listening socket for test execution. "
|
||||
<< "ncclNetSocket.listen() returned error code: " << result
|
||||
<< ". Verify network device availability and port accessibility.";
|
||||
|
||||
INFO(NCCL_LOG_INFO, "Testing task creation functionality - ensuring "
|
||||
"ncclNetSocketGetTask is called");
|
||||
|
||||
std::vector<void *> sendComms;
|
||||
std::vector<void *> recvComms;
|
||||
|
||||
// Establish connection
|
||||
void *sendComm = nullptr;
|
||||
void *recvComm = nullptr;
|
||||
bool connectionSuccess =
|
||||
EstablishConnectionPair(handle, listenComm, sendComm, recvComm);
|
||||
|
||||
if (connectionSuccess) {
|
||||
sendComms.push_back(sendComm);
|
||||
recvComms.push_back(recvComm);
|
||||
|
||||
// Test with buffer sizes that will trigger task subdivision
|
||||
std::vector<size_t> testSizes = GetTestSizes();
|
||||
|
||||
for (size_t testSize : testSizes) {
|
||||
INFO(NCCL_LOG_INFO,
|
||||
"\n=== Testing with buffer size: %zu bytes ===", testSize);
|
||||
INFO(NCCL_LOG_INFO, "This should trigger ncclNetSocketGetTask to create "
|
||||
"task subdivision");
|
||||
|
||||
std::vector<void *> sendMhandles;
|
||||
std::vector<void *> recvMhandles;
|
||||
std::vector<void *> sendRequests;
|
||||
std::vector<void *> recvRequests;
|
||||
std::vector<std::vector<char>> sendBuffers;
|
||||
std::vector<std::vector<char>> recvBuffers;
|
||||
|
||||
// Setup operations for this test size
|
||||
bool setupSuccess = SetupOperationsForSize(
|
||||
sendComm, recvComm, testSize, sendBuffers, recvBuffers, sendMhandles,
|
||||
recvMhandles, sendRequests, recvRequests, 0xAB);
|
||||
|
||||
if (setupSuccess) {
|
||||
// Progress operations with context about environment variables
|
||||
ProgressOperations(sendRequests[0], recvRequests[0], testSize,
|
||||
" (with nSocks > 0 from environment variables)");
|
||||
} else {
|
||||
INFO(NCCL_LOG_INFO,
|
||||
"No operations started - skipping progress testing for size %zu",
|
||||
testSize);
|
||||
}
|
||||
|
||||
// Deregister memory
|
||||
DeregisterMemory(sendComm, recvComm, sendMhandles, recvMhandles,
|
||||
testSize);
|
||||
|
||||
INFO(NCCL_LOG_INFO,
|
||||
"=== Completed testing for buffer size: %zu bytes ===", testSize);
|
||||
}
|
||||
|
||||
INFO(NCCL_LOG_INFO, "\n*** TEST SUCCESS: ncclNetSocketGetTask was "
|
||||
"successfully exercised! ***");
|
||||
} else {
|
||||
INFO(NCCL_LOG_INFO, "No connections established - test passed (network may "
|
||||
"not be available)");
|
||||
}
|
||||
|
||||
// Cleanup
|
||||
CleanupCommunicators(sendComms, recvComms, listenComm);
|
||||
INFO(NCCL_LOG_INFO,
|
||||
"TestConcurrentOperationsTaskCreation completed successfully");
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
// Test concurrent operations task creation in default configuration (without
|
||||
@@ -709,166 +872,19 @@ TEST_F(NetSocketTests, TestConcurrentOperationsTaskCreationDefault) {
|
||||
|
||||
// Test multiple concurrent operations to stress test task creation
|
||||
TEST_F(NetSocketTests, TestConcurrentOperationsTaskCreation) {
|
||||
INFO(NCCL_LOG_INFO, "Checking socket configuration environment variables");
|
||||
ProcessIsolatedTestRunner::ExecutionOptions options;
|
||||
options.stopOnFirstFailure = false; // Continue running all tests
|
||||
options.verboseLogging = true;
|
||||
|
||||
// Check if the required environment variables are set
|
||||
const char *nThreadsEnv = getenv("NCCL_SOCKET_NTHREADS");
|
||||
const char *nSocksPerThreadEnv = getenv("NCCL_NSOCKS_PERTHREAD");
|
||||
|
||||
if (!nThreadsEnv || !nSocksPerThreadEnv) {
|
||||
GTEST_SKIP() << "SKIPPING TEST: Required environment variables not set. "
|
||||
<< "Please set the following environment variables to run this test: "
|
||||
<< "export NCCL_SOCKET_NTHREADS=1 and export NCCL_NSOCKS_PERTHREAD=2. "
|
||||
<< "This ensures nSocks > 0 so that ncclNetSocketGetTask gets called. "
|
||||
<< "Environment variables NCCL_SOCKET_NTHREADS and NCCL_NSOCKS_PERTHREAD must be set";
|
||||
return;
|
||||
}
|
||||
|
||||
int nThreads = ParseEnvVar(nThreadsEnv, "NCCL_SOCKET_NTHREADS", 0, 1);
|
||||
int nSocksPerThread = ParseEnvVar(nSocksPerThreadEnv, "NCCL_NSOCKS_PERTHREAD", 0, 1);
|
||||
|
||||
// Additional validation for reasonable upper bounds
|
||||
const int MAX_THREADS = 16;
|
||||
const int MAX_SOCKS_PER_THREAD = 64;
|
||||
const int MAX_TOTAL_SOCKETS = 64;
|
||||
|
||||
if (nThreads > MAX_THREADS) {
|
||||
GTEST_SKIP() << "SKIPPING TEST: NCCL_SOCKET_NTHREADS=" << nThreads << " exceeds maximum " << MAX_THREADS << ". "
|
||||
<< "Please provide a reasonable value (e.g., NCCL_SOCKET_NTHREADS=8). "
|
||||
<< "Values too large may cause resource exhaustion.";
|
||||
return;
|
||||
}
|
||||
|
||||
if (nSocksPerThread > MAX_SOCKS_PER_THREAD) {
|
||||
GTEST_SKIP() << "SKIPPING TEST: NCCL_NSOCKS_PERTHREAD=" << nSocksPerThread << " exceeds maximum " << MAX_SOCKS_PER_THREAD << ". "
|
||||
<< "Please provide a reasonable value (e.g., NCCL_NSOCKS_PERTHREAD=4). "
|
||||
<< "Values too large may cause resource exhaustion.";
|
||||
return;
|
||||
}
|
||||
|
||||
// Check for potential overflow before multiplication
|
||||
if (nThreads > 0 && nSocksPerThread > INT_MAX / nThreads) {
|
||||
GTEST_SKIP() << "SKIPPING TEST: Configuration would cause integer overflow. "
|
||||
<< "NCCL_SOCKET_NTHREADS=" << nThreads << " * NCCL_NSOCKS_PERTHREAD=" << nSocksPerThread
|
||||
<< " exceeds maximum integer value. Please use smaller values.";
|
||||
return;
|
||||
}
|
||||
|
||||
int totalSockets = nThreads * nSocksPerThread;
|
||||
|
||||
INFO(NCCL_LOG_INFO, "Environment configuration found:");
|
||||
INFO(NCCL_LOG_INFO, " NCCL_SOCKET_NTHREADS=%d", nThreads);
|
||||
INFO(NCCL_LOG_INFO, " NCCL_NSOCKS_PERTHREAD=%d", nSocksPerThread);
|
||||
INFO(NCCL_LOG_INFO, " Total sockets=%d", totalSockets);
|
||||
|
||||
// Validate total sockets count
|
||||
if (totalSockets <= 0) {
|
||||
GTEST_SKIP() << "SKIPPING TEST: Invalid configuration - total sockets must be > 0. "
|
||||
<< "Current configuration: nThreads=" << nThreads << " * nSocksPerThread=" << nSocksPerThread
|
||||
<< " = " << totalSockets << ". "
|
||||
<< "Both NCCL_SOCKET_NTHREADS and NCCL_NSOCKS_PERTHREAD must be positive integers. "
|
||||
<< "Example: export NCCL_SOCKET_NTHREADS=2 && export NCCL_NSOCKS_PERTHREAD=2";
|
||||
return;
|
||||
}
|
||||
|
||||
if (totalSockets > MAX_TOTAL_SOCKETS) {
|
||||
GTEST_SKIP() << "SKIPPING TEST: Total sockets " << totalSockets << " exceeds maximum " << MAX_TOTAL_SOCKETS << ". "
|
||||
<< "Current configuration: nThreads=" << nThreads << " * nSocksPerThread=" << nSocksPerThread
|
||||
<< " = " << totalSockets << ". "
|
||||
<< "Please reduce either NCCL_SOCKET_NTHREADS or NCCL_NSOCKS_PERTHREAD. "
|
||||
<< "Example: export NCCL_SOCKET_NTHREADS=8 && export NCCL_NSOCKS_PERTHREAD=4";
|
||||
return;
|
||||
}
|
||||
|
||||
if (totalSockets > NCCL_NET_MAX_REQUESTS) {
|
||||
GTEST_SKIP() << "SKIPPING TEST: Total sockets " << totalSockets << " exceeds NCCL_NET_MAX_REQUESTS=" << NCCL_NET_MAX_REQUESTS << ". "
|
||||
<< "Current configuration: nThreads=" << nThreads << " * nSocksPerThread=" << nSocksPerThread
|
||||
<< " = " << totalSockets << ". "
|
||||
<< "NCCL network layer can handle at most " << NCCL_NET_MAX_REQUESTS << " concurrent requests. "
|
||||
<< "Please reduce configuration to stay within NCCL limits.";
|
||||
return;
|
||||
}
|
||||
|
||||
INFO(NCCL_LOG_INFO, "Configuration valid - proceeding with test to exercise "
|
||||
"ncclNetSocketGetTask");
|
||||
|
||||
// Test socket properties
|
||||
TestSocketProperties();
|
||||
|
||||
char handle[NCCL_NET_HANDLE_MAXSIZE];
|
||||
void *listenComm = nullptr;
|
||||
|
||||
ncclResult_t result = ncclNetSocket.listen(0, handle, &listenComm);
|
||||
ASSERT_EQ(result, ncclSuccess) << "Failed to establish listening socket for test execution. "
|
||||
<< "ncclNetSocket.listen() returned error code: " << result
|
||||
<< ". Verify network device availability and port accessibility.";
|
||||
|
||||
INFO(NCCL_LOG_INFO, "Testing task creation functionality - ensuring "
|
||||
"ncclNetSocketGetTask is called");
|
||||
|
||||
std::vector<void *> sendComms;
|
||||
std::vector<void *> recvComms;
|
||||
|
||||
// Establish connection
|
||||
void *sendComm = nullptr;
|
||||
void *recvComm = nullptr;
|
||||
bool connectionSuccess =
|
||||
EstablishConnectionPair(handle, listenComm, sendComm, recvComm);
|
||||
|
||||
if (connectionSuccess) {
|
||||
sendComms.push_back(sendComm);
|
||||
recvComms.push_back(recvComm);
|
||||
|
||||
// Test with buffer sizes that will trigger task subdivision
|
||||
std::vector<size_t> testSizes = GetTestSizes();
|
||||
|
||||
for (size_t testSize : testSizes) {
|
||||
INFO(NCCL_LOG_INFO,
|
||||
"\n=== Testing with buffer size: %zu bytes ===", testSize);
|
||||
INFO(NCCL_LOG_INFO, "This should trigger ncclNetSocketGetTask to create "
|
||||
"task subdivision");
|
||||
|
||||
std::vector<void *> sendMhandles;
|
||||
std::vector<void *> recvMhandles;
|
||||
std::vector<void *> sendRequests;
|
||||
std::vector<void *> recvRequests;
|
||||
std::vector<std::vector<char>> sendBuffers;
|
||||
std::vector<std::vector<char>> recvBuffers;
|
||||
|
||||
// Setup operations for this test size
|
||||
bool setupSuccess = SetupOperationsForSize(
|
||||
sendComm, recvComm, testSize, sendBuffers, recvBuffers, sendMhandles,
|
||||
recvMhandles, sendRequests, recvRequests, 0xAB);
|
||||
|
||||
if (setupSuccess) {
|
||||
// Progress operations with context about environment variables
|
||||
ProgressOperations(sendRequests[0], recvRequests[0], testSize,
|
||||
" (with nSocks > 0 from environment variables)");
|
||||
} else {
|
||||
INFO(NCCL_LOG_INFO,
|
||||
"No operations started - skipping progress testing for size %zu",
|
||||
testSize);
|
||||
}
|
||||
|
||||
// Deregister memory
|
||||
DeregisterMemory(sendComm, recvComm, sendMhandles, recvMhandles,
|
||||
testSize);
|
||||
|
||||
INFO(NCCL_LOG_INFO,
|
||||
"=== Completed testing for buffer size: %zu bytes ===", testSize);
|
||||
}
|
||||
|
||||
INFO(NCCL_LOG_INFO, "\n*** TEST SUCCESS: ncclNetSocketGetTask was "
|
||||
"successfully exercised! ***");
|
||||
} else {
|
||||
INFO(NCCL_LOG_INFO, "No connections established - test passed (network may "
|
||||
"not be available)");
|
||||
}
|
||||
|
||||
// Cleanup
|
||||
CleanupCommunicators(sendComms, recvComms, listenComm);
|
||||
INFO(NCCL_LOG_INFO,
|
||||
"TestConcurrentOperationsTaskCreation completed successfully");
|
||||
RUN_ISOLATED_TESTS_WITH_OPTIONS(options,
|
||||
ProcessIsolatedTestRunner::TestConfig(
|
||||
"TestConcurrentOperationsTaskCreation",
|
||||
[this]() { RunConcurrentOperationsTaskCreationWithEnvVars(); })
|
||||
.withEnvironment({{"NCCL_SOCKET_NTHREADS", "1"},
|
||||
{"NCCL_NSOCKS_PERTHREAD", "2"},
|
||||
{"NCCL_DEBUG", "TRACE"},
|
||||
{"NCCL_DEBUG_SUBSYS", "ALL"}})
|
||||
);
|
||||
}
|
||||
|
||||
// Test for invalid device index in listen function
|
||||
@@ -1079,158 +1095,239 @@ TEST_F(NetSocketTests, TestNonHostMemoryRegMr) {
|
||||
|
||||
// Test for excessive thread configuration warning
|
||||
TEST_F(NetSocketTests, TestExcessiveThreadConfig) {
|
||||
INFO(NCCL_LOG_INFO, "Testing excessive thread configuration warning");
|
||||
ProcessIsolatedTestRunner::ExecutionOptions options;
|
||||
options.stopOnFirstFailure = false; // Continue running all tests
|
||||
options.verboseLogging = true;
|
||||
|
||||
// Check if the required environment variables are set
|
||||
const char *nThreadsEnv = getenv("NCCL_SOCKET_NTHREADS");
|
||||
const char *nSocksPerThreadEnv = getenv("NCCL_NSOCKS_PERTHREAD");
|
||||
RUN_ISOLATED_TESTS_WITH_OPTIONS(options,
|
||||
ProcessIsolatedTestRunner::TestConfig(
|
||||
"TestExcessiveThreadConfig",
|
||||
[this]() {
|
||||
INFO(NCCL_LOG_INFO,
|
||||
"Testing excessive thread configuration warning");
|
||||
|
||||
if (!nThreadsEnv || !nSocksPerThreadEnv) {
|
||||
GTEST_SKIP() << "SKIPPING TEST: Required environment variables not set. "
|
||||
<< "This test requires NCCL_SOCKET_NTHREADS > NCCL_NET_MAX_REQUESTS (" << NCCL_NET_MAX_REQUESTS << ") and NCCL_NSOCKS_PERTHREAD = 1 to trigger warning. "
|
||||
<< "Environment variables NCCL_SOCKET_NTHREADS and NCCL_NSOCKS_PERTHREAD must be set";
|
||||
return;
|
||||
}
|
||||
// Check if the required environment variables are set
|
||||
const char *nThreadsEnv = getenv("NCCL_SOCKET_NTHREADS");
|
||||
const char *nSocksPerThreadEnv = getenv("NCCL_NSOCKS_PERTHREAD");
|
||||
|
||||
// Parse with validation - both must be positive
|
||||
int nThreads = ParseEnvVar(nThreadsEnv, "NCCL_SOCKET_NTHREADS", 0, 1);
|
||||
int nSocksPerThread = ParseEnvVar(nSocksPerThreadEnv, "NCCL_NSOCKS_PERTHREAD", 0, 1);
|
||||
if (!nThreadsEnv || !nSocksPerThreadEnv) {
|
||||
GTEST_SKIP()
|
||||
<< "SKIPPING TEST: Required environment variables not set. "
|
||||
<< "This test requires NCCL_SOCKET_NTHREADS > "
|
||||
"NCCL_NET_MAX_REQUESTS ("
|
||||
<< NCCL_NET_MAX_REQUESTS
|
||||
<< ") and NCCL_NSOCKS_PERTHREAD = 1 to trigger warning. "
|
||||
<< "Environment variables NCCL_SOCKET_NTHREADS and "
|
||||
"NCCL_NSOCKS_PERTHREAD must be set";
|
||||
return;
|
||||
}
|
||||
|
||||
// Check for potential overflow before multiplication
|
||||
if (nThreads > 0 && nSocksPerThread > INT_MAX / nThreads) {
|
||||
GTEST_SKIP() << "SKIPPING TEST: Configuration would cause integer overflow. "
|
||||
<< "NCCL_SOCKET_NTHREADS=" << nThreads << " * NCCL_NSOCKS_PERTHREAD=" << nSocksPerThread
|
||||
<< " exceeds maximum integer value. Please use smaller values.";
|
||||
return;
|
||||
}
|
||||
// Parse with validation - both must be positive
|
||||
int nThreads =
|
||||
ParseEnvVar(nThreadsEnv, "NCCL_SOCKET_NTHREADS", 0, 1);
|
||||
int nSocksPerThread =
|
||||
ParseEnvVar(nSocksPerThreadEnv, "NCCL_NSOCKS_PERTHREAD", 0, 1);
|
||||
|
||||
int totalSockets = nThreads * nSocksPerThread;
|
||||
// Check for potential overflow before multiplication
|
||||
if (nThreads > 0 && nSocksPerThread > INT_MAX / nThreads) {
|
||||
GTEST_SKIP() << "SKIPPING TEST: Configuration would cause "
|
||||
"integer overflow. "
|
||||
<< "NCCL_SOCKET_NTHREADS=" << nThreads
|
||||
<< " * NCCL_NSOCKS_PERTHREAD=" << nSocksPerThread
|
||||
<< " exceeds maximum integer value. Please use "
|
||||
"smaller values.";
|
||||
return;
|
||||
}
|
||||
|
||||
INFO(NCCL_LOG_INFO, "Environment configuration found:");
|
||||
INFO(NCCL_LOG_INFO, " NCCL_SOCKET_NTHREADS=%d", nThreads);
|
||||
INFO(NCCL_LOG_INFO, " NCCL_NSOCKS_PERTHREAD=%d", nSocksPerThread);
|
||||
INFO(NCCL_LOG_INFO, " Total sockets=%d", totalSockets);
|
||||
int totalSockets = nThreads * nSocksPerThread;
|
||||
|
||||
// Check if configuration is set to trigger the excessive threads warning
|
||||
// Use NCCL_NET_MAX_REQUESTS instead of arbitrary MAX_THREADS
|
||||
if (nThreads <= NCCL_NET_MAX_REQUESTS) {
|
||||
GTEST_SKIP() << "SKIPPING TEST: NCCL_SOCKET_NTHREADS must be > " << NCCL_NET_MAX_REQUESTS << " to test excessive thread warning. "
|
||||
<< "Current NCCL_SOCKET_NTHREADS=" << nThreads << ". "
|
||||
<< "Please set: export NCCL_SOCKET_NTHREADS=" << (NCCL_NET_MAX_REQUESTS + 1) << ". "
|
||||
<< "NCCL_SOCKET_NTHREADS must be > NCCL_NET_MAX_REQUESTS (" << NCCL_NET_MAX_REQUESTS << ") to trigger warning";
|
||||
return;
|
||||
}
|
||||
INFO(NCCL_LOG_INFO, "Environment configuration found:");
|
||||
INFO(NCCL_LOG_INFO, " NCCL_SOCKET_NTHREADS=%d", nThreads);
|
||||
INFO(NCCL_LOG_INFO, " NCCL_NSOCKS_PERTHREAD=%d", nSocksPerThread);
|
||||
INFO(NCCL_LOG_INFO, " Total sockets=%d", totalSockets);
|
||||
|
||||
if (totalSockets > NCCL_NET_MAX_REQUESTS * 10) { // Allow 10x for testing excessive config
|
||||
GTEST_SKIP() << "SKIPPING TEST: Total sockets=" << totalSockets << " is unreasonably large (> " << (NCCL_NET_MAX_REQUESTS * 10) << "). "
|
||||
<< "Please use more reasonable values for testing. NCCL_NET_MAX_REQUESTS=" << NCCL_NET_MAX_REQUESTS << ". "
|
||||
<< "Example: export NCCL_SOCKET_NTHREADS=" << (NCCL_NET_MAX_REQUESTS + 1) << " && export NCCL_NSOCKS_PERTHREAD=1";
|
||||
return;
|
||||
}
|
||||
// Check if configuration is set to trigger the excessive threads
|
||||
// warning Use NCCL_NET_MAX_REQUESTS instead of arbitrary
|
||||
// MAX_THREADS
|
||||
if (nThreads <= NCCL_NET_MAX_REQUESTS) {
|
||||
GTEST_SKIP()
|
||||
<< "SKIPPING TEST: NCCL_SOCKET_NTHREADS must be > "
|
||||
<< NCCL_NET_MAX_REQUESTS
|
||||
<< " to test excessive thread warning. "
|
||||
<< "Current NCCL_SOCKET_NTHREADS=" << nThreads << ". "
|
||||
<< "Please set: export NCCL_SOCKET_NTHREADS="
|
||||
<< (NCCL_NET_MAX_REQUESTS + 1) << ". "
|
||||
<< "NCCL_SOCKET_NTHREADS must be > NCCL_NET_MAX_REQUESTS ("
|
||||
<< NCCL_NET_MAX_REQUESTS << ") to trigger warning";
|
||||
return;
|
||||
}
|
||||
|
||||
INFO(NCCL_LOG_INFO,
|
||||
"Configuration valid for testing excessive threads warning");
|
||||
INFO(NCCL_LOG_INFO, "NCCL_SOCKET_NTHREADS=%d > NCCL_NET_MAX_REQUESTS=%d", nThreads, NCCL_NET_MAX_REQUESTS);
|
||||
if (totalSockets >
|
||||
NCCL_NET_MAX_REQUESTS *
|
||||
10) { // Allow 10x for testing excessive config
|
||||
GTEST_SKIP() << "SKIPPING TEST: Total sockets=" << totalSockets
|
||||
<< " is unreasonably large (> "
|
||||
<< (NCCL_NET_MAX_REQUESTS * 10) << "). "
|
||||
<< "Please use more reasonable values for testing. "
|
||||
"NCCL_NET_MAX_REQUESTS="
|
||||
<< NCCL_NET_MAX_REQUESTS << ". "
|
||||
<< "Example: export NCCL_SOCKET_NTHREADS="
|
||||
<< (NCCL_NET_MAX_REQUESTS + 1)
|
||||
<< " && export NCCL_NSOCKS_PERTHREAD=1";
|
||||
return;
|
||||
}
|
||||
|
||||
// Test socket properties
|
||||
TestSocketProperties();
|
||||
INFO(NCCL_LOG_INFO,
|
||||
"Configuration valid for testing excessive threads warning");
|
||||
INFO(NCCL_LOG_INFO,
|
||||
"NCCL_SOCKET_NTHREADS=%d > NCCL_NET_MAX_REQUESTS=%d", nThreads,
|
||||
NCCL_NET_MAX_REQUESTS);
|
||||
|
||||
// Initialize to trigger the warning logic
|
||||
char handle[NCCL_NET_HANDLE_MAXSIZE];
|
||||
void *listenComm = nullptr;
|
||||
ncclResult_t result = ncclNetSocket.listen(0, handle, &listenComm);
|
||||
// Test socket properties
|
||||
TestSocketProperties();
|
||||
|
||||
if (result == ncclSuccess && listenComm) {
|
||||
// The implementation should have limited the threads to NCCL_NET_MAX_REQUESTS
|
||||
// internally
|
||||
INFO(NCCL_LOG_INFO,
|
||||
"*** SUCCESS: Listen succeeded with excessive NCCL_SOCKET_NTHREADS - "
|
||||
"limits enforced internally ***");
|
||||
ncclNetSocket.closeListen(listenComm);
|
||||
} else {
|
||||
INFO(NCCL_LOG_INFO, "Listen failed with result: %d", result);
|
||||
}
|
||||
// Initialize to trigger the warning logic
|
||||
char handle[NCCL_NET_HANDLE_MAXSIZE];
|
||||
void *listenComm = nullptr;
|
||||
ncclResult_t result = ncclNetSocket.listen(0, handle, &listenComm);
|
||||
|
||||
INFO(NCCL_LOG_INFO, "TestExcessiveThreadConfig completed");
|
||||
if (result == ncclSuccess && listenComm) {
|
||||
// The implementation should have limited the threads to
|
||||
// NCCL_NET_MAX_REQUESTS internally
|
||||
INFO(NCCL_LOG_INFO, "*** SUCCESS: Listen succeeded with "
|
||||
"excessive NCCL_SOCKET_NTHREADS - "
|
||||
"limits enforced internally ***");
|
||||
ncclNetSocket.closeListen(listenComm);
|
||||
} else {
|
||||
INFO(NCCL_LOG_INFO, "Listen failed with result: %d", result);
|
||||
}
|
||||
|
||||
INFO(NCCL_LOG_INFO, "TestExcessiveThreadConfig completed");
|
||||
})
|
||||
.withEnvironment({{"NCCL_SOCKET_NTHREADS", "33"},
|
||||
{"NCCL_NSOCKS_PERTHREAD", "1"},
|
||||
{"NCCL_DEBUG", "TRACE"},
|
||||
{"NCCL_DEBUG_SUBSYS", "ALL"}})
|
||||
);
|
||||
}
|
||||
|
||||
// Test for excessive socket configuration warning
|
||||
TEST_F(NetSocketTests, TestExcessiveSocketConfig) {
|
||||
INFO(NCCL_LOG_INFO, "Testing excessive socket configuration warning");
|
||||
ProcessIsolatedTestRunner::ExecutionOptions options;
|
||||
options.stopOnFirstFailure = false; // Continue running all tests
|
||||
options.verboseLogging = true;
|
||||
|
||||
// Check if the required environment variables are set
|
||||
const char *nThreadsEnv = getenv("NCCL_SOCKET_NTHREADS");
|
||||
const char *nSocksPerThreadEnv = getenv("NCCL_NSOCKS_PERTHREAD");
|
||||
RUN_ISOLATED_TESTS_WITH_OPTIONS(options,
|
||||
ProcessIsolatedTestRunner::TestConfig(
|
||||
"TestExcessiveThreadConfig",
|
||||
[this]() {
|
||||
INFO(NCCL_LOG_INFO,
|
||||
"Testing excessive socket configuration warning");
|
||||
|
||||
if (!nThreadsEnv || !nSocksPerThreadEnv) {
|
||||
GTEST_SKIP() << "SKIPPING TEST: Required environment variables not set. "
|
||||
<< "This test requires total sockets (nThreads * nSocksPerThread) > MAX_SOCKETS (64). "
|
||||
<< "Environment variables NCCL_SOCKET_NTHREADS and NCCL_NSOCKS_PERTHREAD must be set";
|
||||
return;
|
||||
}
|
||||
// Check if the required environment variables are set
|
||||
const char *nThreadsEnv = getenv("NCCL_SOCKET_NTHREADS");
|
||||
const char *nSocksPerThreadEnv = getenv("NCCL_NSOCKS_PERTHREAD");
|
||||
|
||||
// Parse with validation - both must be positive
|
||||
int nThreads = ParseEnvVar(nThreadsEnv, "NCCL_SOCKET_NTHREADS", 0, 1);
|
||||
int nSocksPerThread = ParseEnvVar(nSocksPerThreadEnv, "NCCL_NSOCKS_PERTHREAD", 0, 1);
|
||||
if (!nThreadsEnv || !nSocksPerThreadEnv) {
|
||||
GTEST_SKIP()
|
||||
<< "SKIPPING TEST: Required environment variables not set. "
|
||||
<< "This test requires total sockets (nThreads * "
|
||||
"nSocksPerThread) > MAX_SOCKETS (64). "
|
||||
<< "Environment variables NCCL_SOCKET_NTHREADS and "
|
||||
"NCCL_NSOCKS_PERTHREAD must be set";
|
||||
return;
|
||||
}
|
||||
|
||||
// Check for potential overflow before multiplication
|
||||
if (nThreads > 0 && nSocksPerThread > INT_MAX / nThreads) {
|
||||
GTEST_SKIP() << "SKIPPING TEST: Configuration would cause integer overflow. "
|
||||
<< "NCCL_SOCKET_NTHREADS=" << nThreads << " * NCCL_NSOCKS_PERTHREAD=" << nSocksPerThread
|
||||
<< " exceeds maximum integer value. Please use smaller values.";
|
||||
return;
|
||||
}
|
||||
// Parse with validation - both must be positive
|
||||
int nThreads =
|
||||
ParseEnvVar(nThreadsEnv, "NCCL_SOCKET_NTHREADS", 0, 1);
|
||||
int nSocksPerThread =
|
||||
ParseEnvVar(nSocksPerThreadEnv, "NCCL_NSOCKS_PERTHREAD", 0, 1);
|
||||
|
||||
int totalSockets = nThreads * nSocksPerThread;
|
||||
// Check for potential overflow before multiplication
|
||||
if (nThreads > 0 && nSocksPerThread > INT_MAX / nThreads) {
|
||||
GTEST_SKIP() << "SKIPPING TEST: Configuration would cause "
|
||||
"integer overflow. "
|
||||
<< "NCCL_SOCKET_NTHREADS=" << nThreads
|
||||
<< " * NCCL_NSOCKS_PERTHREAD=" << nSocksPerThread
|
||||
<< " exceeds maximum integer value. Please use "
|
||||
"smaller values.";
|
||||
return;
|
||||
}
|
||||
|
||||
INFO(NCCL_LOG_INFO, "Environment configuration found:");
|
||||
INFO(NCCL_LOG_INFO, " NCCL_SOCKET_NTHREADS=%d", nThreads);
|
||||
INFO(NCCL_LOG_INFO, " NCCL_NSOCKS_PERTHREAD=%d", nSocksPerThread);
|
||||
INFO(NCCL_LOG_INFO, " Total sockets=%d", totalSockets);
|
||||
int totalSockets = nThreads * nSocksPerThread;
|
||||
|
||||
// Check if configuration is set to trigger the excessive sockets warning
|
||||
const int MAX_SOCKETS = 64;
|
||||
if (totalSockets <= MAX_SOCKETS) {
|
||||
GTEST_SKIP() << "SKIPPING TEST: Total sockets must be > " << MAX_SOCKETS << " to test excessive socket warning. "
|
||||
<< "Current total sockets=" << totalSockets
|
||||
<< " (nThreads=" << nThreads << " * nSocksPerThread=" << nSocksPerThread << "). "
|
||||
<< "Please set environment variables such that total > " << MAX_SOCKETS << ", e.g.: "
|
||||
<< "export NCCL_SOCKET_NTHREADS=9 && export NCCL_NSOCKS_PERTHREAD=8. "
|
||||
<< "Total sockets must be > MAX_SOCKETS (" << MAX_SOCKETS << ") to trigger warning";
|
||||
return;
|
||||
}
|
||||
INFO(NCCL_LOG_INFO, "Environment configuration found:");
|
||||
INFO(NCCL_LOG_INFO, " NCCL_SOCKET_NTHREADS=%d", nThreads);
|
||||
INFO(NCCL_LOG_INFO, " NCCL_NSOCKS_PERTHREAD=%d", nSocksPerThread);
|
||||
INFO(NCCL_LOG_INFO, " Total sockets=%d", totalSockets);
|
||||
|
||||
// Additional validation against NCCL_NET_MAX_REQUESTS for reasonable upper bounds
|
||||
if (totalSockets > NCCL_NET_MAX_REQUESTS * 10) { // Allow 10x for testing excessive config
|
||||
GTEST_SKIP() << "SKIPPING TEST: Total sockets=" << totalSockets << " is unreasonably large (> " << (NCCL_NET_MAX_REQUESTS * 10) << "). "
|
||||
<< "Please use more reasonable values for testing. NCCL_NET_MAX_REQUESTS=" << NCCL_NET_MAX_REQUESTS << ". "
|
||||
<< "Example: export NCCL_SOCKET_NTHREADS=10 && export NCCL_NSOCKS_PERTHREAD=10";
|
||||
return;
|
||||
}
|
||||
// Check if configuration is set to trigger the excessive sockets
|
||||
// warning
|
||||
const int MAX_SOCKETS = 64;
|
||||
if (totalSockets <= MAX_SOCKETS) {
|
||||
GTEST_SKIP()
|
||||
<< "SKIPPING TEST: Total sockets must be > " << MAX_SOCKETS
|
||||
<< " to test excessive socket warning. "
|
||||
<< "Current total sockets=" << totalSockets
|
||||
<< " (nThreads=" << nThreads
|
||||
<< " * nSocksPerThread=" << nSocksPerThread << "). "
|
||||
<< "Please set environment variables such that total > "
|
||||
<< MAX_SOCKETS << ", e.g.: "
|
||||
<< "export NCCL_SOCKET_NTHREADS=9 && export "
|
||||
"NCCL_NSOCKS_PERTHREAD=8. "
|
||||
<< "Total sockets must be > MAX_SOCKETS (" << MAX_SOCKETS
|
||||
<< ") to trigger warning";
|
||||
return;
|
||||
}
|
||||
|
||||
INFO(NCCL_LOG_INFO,
|
||||
"Configuration valid for testing excessive sockets warning");
|
||||
INFO(NCCL_LOG_INFO, "Total sockets=%d > MAX_SOCKETS=64", totalSockets);
|
||||
// Additional validation against NCCL_NET_MAX_REQUESTS for
|
||||
// reasonable upper bounds
|
||||
if (totalSockets >
|
||||
NCCL_NET_MAX_REQUESTS *
|
||||
10) { // Allow 10x for testing excessive config
|
||||
GTEST_SKIP() << "SKIPPING TEST: Total sockets=" << totalSockets
|
||||
<< " is unreasonably large (> "
|
||||
<< (NCCL_NET_MAX_REQUESTS * 10) << "). "
|
||||
<< "Please use more reasonable values for testing. "
|
||||
"NCCL_NET_MAX_REQUESTS="
|
||||
<< NCCL_NET_MAX_REQUESTS << ". "
|
||||
<< "Example: export NCCL_SOCKET_NTHREADS=10 && "
|
||||
"export NCCL_NSOCKS_PERTHREAD=10";
|
||||
return;
|
||||
}
|
||||
|
||||
// Test socket properties
|
||||
TestSocketProperties();
|
||||
INFO(NCCL_LOG_INFO,
|
||||
"Configuration valid for testing excessive sockets warning");
|
||||
INFO(NCCL_LOG_INFO, "Total sockets=%d > MAX_SOCKETS=64",
|
||||
totalSockets);
|
||||
|
||||
// Initialize to trigger the warning logic
|
||||
char handle[NCCL_NET_HANDLE_MAXSIZE];
|
||||
void *listenComm = nullptr;
|
||||
ncclResult_t result = ncclNetSocket.listen(0, handle, &listenComm);
|
||||
// Test socket properties
|
||||
TestSocketProperties();
|
||||
|
||||
if (result == ncclSuccess && listenComm) {
|
||||
// The implementation should have limited the sockets to MAX_SOCKETS
|
||||
// internally
|
||||
INFO(NCCL_LOG_INFO, "*** SUCCESS: Listen succeeded with excessive total "
|
||||
"sockets - limits enforced internally ***");
|
||||
ncclNetSocket.closeListen(listenComm);
|
||||
} else {
|
||||
INFO(NCCL_LOG_INFO, "Listen failed with result: %d", result);
|
||||
}
|
||||
// Initialize to trigger the warning logic
|
||||
char handle[NCCL_NET_HANDLE_MAXSIZE];
|
||||
void *listenComm = nullptr;
|
||||
ncclResult_t result = ncclNetSocket.listen(0, handle, &listenComm);
|
||||
|
||||
INFO(NCCL_LOG_INFO, "TestExcessiveSocketConfig completed");
|
||||
if (result == ncclSuccess && listenComm) {
|
||||
// The implementation should have limited the sockets to
|
||||
// MAX_SOCKETS internally
|
||||
INFO(NCCL_LOG_INFO,
|
||||
"*** SUCCESS: Listen succeeded with excessive total "
|
||||
"sockets - limits enforced internally ***");
|
||||
ncclNetSocket.closeListen(listenComm);
|
||||
} else {
|
||||
INFO(NCCL_LOG_INFO, "Listen failed with result: %d", result);
|
||||
}
|
||||
|
||||
INFO(NCCL_LOG_INFO, "TestExcessiveSocketConfig completed");
|
||||
})
|
||||
.withEnvironment({{"NCCL_SOCKET_NTHREADS", "10"},
|
||||
{"NCCL_NSOCKS_PERTHREAD", "10"},
|
||||
{"NCCL_DEBUG", "TRACE"},
|
||||
{"NCCL_DEBUG_SUBSYS", "ALL"}})
|
||||
);
|
||||
}
|
||||
|
||||
// Test to trigger request allocation failure scenario
|
||||
|
||||
@@ -3,20 +3,14 @@
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
#include "collectives.h"
|
||||
#include "comm.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "info.h"
|
||||
#include "profiler.h"
|
||||
#include "shmutils.h"
|
||||
#include "socket.h"
|
||||
#define ENABLE_TIMER 0
|
||||
#include "profiler.h"
|
||||
#include "proxy.h"
|
||||
#include "timer.h"
|
||||
#include "transport.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <poll.h>
|
||||
#include <sched.h>
|
||||
@@ -25,409 +19,467 @@
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "common/ErrCode.hpp"
|
||||
#include "common/ProcessIsolatedTestRunner.hpp"
|
||||
#include "profiler.h"
|
||||
#include "proxy.h"
|
||||
#include "timer.h"
|
||||
#include "transport.h"
|
||||
|
||||
#define NCCL_MAX_OPS (2048)
|
||||
#define OP_INDEX(op) ((op) ? (op) - state->pools->elems : -1)
|
||||
#define OP_SEEN 0x100000
|
||||
|
||||
ncclResult_t getOpIndex(struct ncclProxyArgs *op,
|
||||
struct ncclProxyProgressState *state, int *poolIndex,
|
||||
int *opIndex);
|
||||
ncclResult_t dumpProxyState(struct ncclProxyProgressState *state);
|
||||
ncclResult_t printProxyOp(struct ncclProxyArgs *op, int poolIndex, int opIndex);
|
||||
ncclResult_t dumpProxyState(struct ncclProxyProgressState *state);
|
||||
ncclResult_t ncclProxyCallBlockingUDS(struct ncclComm *comm,
|
||||
struct ncclProxyConnector *proxyConn,
|
||||
int type, void *reqBuff, int reqSize,
|
||||
void *respBuff, int respSize, int *reqFd,
|
||||
int *respFd);
|
||||
ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm *comm, int proxyRank,
|
||||
void *handle, int *convertedFd);
|
||||
ncclResult_t
|
||||
ncclProxyClientQueryFdBlocking(struct ncclComm *comm,
|
||||
struct ncclProxyConnector *proxyConn,
|
||||
int localFd, int *rmtFd);
|
||||
ncclResult_t getOpIndex(
|
||||
struct ncclProxyArgs* op, struct ncclProxyProgressState* state, int* poolIndex, int* opIndex
|
||||
);
|
||||
ncclResult_t dumpProxyState(struct ncclProxyProgressState* state);
|
||||
ncclResult_t printProxyOp(struct ncclProxyArgs* op, int poolIndex, int opIndex);
|
||||
ncclResult_t dumpProxyState(struct ncclProxyProgressState* state);
|
||||
ncclResult_t ncclProxyCallBlockingUDS(
|
||||
struct ncclComm* comm,
|
||||
struct ncclProxyConnector* proxyConn,
|
||||
int type,
|
||||
void* reqBuff,
|
||||
int reqSize,
|
||||
void* respBuff,
|
||||
int respSize,
|
||||
int* reqFd,
|
||||
int* respFd
|
||||
);
|
||||
ncclResult_t ncclProxyClientGetFdBlocking(
|
||||
struct ncclComm* comm, int proxyRank, void* handle, int* convertedFd
|
||||
);
|
||||
ncclResult_t ncclProxyClientQueryFdBlocking(
|
||||
struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int localFd, int* rmtFd
|
||||
);
|
||||
|
||||
void ncclDumpProxyState(int signal);
|
||||
|
||||
#define PROXYARGS_ALLOCATE_SIZE NCCL_MAX_OPS
|
||||
struct ncclProxyPool {
|
||||
struct ncclProxyPool *next;
|
||||
struct ncclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE];
|
||||
|
||||
struct ncclProxyPool
|
||||
{
|
||||
struct ncclProxyPool* next;
|
||||
struct ncclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE];
|
||||
};
|
||||
|
||||
void init_ncclProxyArgs_struct(ncclProxyArgs *pool_ptr) {
|
||||
// init pool_ptr
|
||||
pool_ptr->send = 2;
|
||||
pool_ptr->nextRank = 4;
|
||||
pool_ptr->prevRank = 5;
|
||||
pool_ptr->pattern = ncclPatternRing;
|
||||
pool_ptr->nsubs = 1;
|
||||
pool_ptr->state = ncclProxyOpNone;
|
||||
pool_ptr->retry_total = 2;
|
||||
void init_ncclProxyArgs_struct(ncclProxyArgs* pool_ptr)
|
||||
{
|
||||
// init pool_ptr
|
||||
pool_ptr->send = 2;
|
||||
pool_ptr->nextRank = 4;
|
||||
pool_ptr->prevRank = 5;
|
||||
pool_ptr->pattern = ncclPatternRing;
|
||||
pool_ptr->nsubs = 1;
|
||||
pool_ptr->state = ncclProxyOpNone;
|
||||
pool_ptr->retry_total = 2;
|
||||
}
|
||||
namespace RcclUnitTesting {
|
||||
TEST(ProxyTests,
|
||||
getOpIndex) { // Tests what is the index of the pool being passed within
|
||||
// the known valid pools in state ptr
|
||||
INFO(NCCL_LOG_INFO, "[ProxyTests] Test Start \n");
|
||||
|
||||
// Init Dummy structs
|
||||
struct ncclProxyArgs *pool_ptr = new ncclProxyArgs;
|
||||
struct ncclProxyPool *pools_ptr = new ncclProxyPool;
|
||||
struct ncclProxyPool *pools2_ptr = new ncclProxyPool;
|
||||
struct ncclProxyProgressState *state_ptr = new ncclProxyProgressState;
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
TEST(ProxyTests, getOpIndex)
|
||||
{ // Tests what is the index of the pool being passed within
|
||||
// the known valid pools in state ptr
|
||||
INFO("[ProxyTests] Test Start \n");
|
||||
|
||||
// state_ptr = &state;
|
||||
state_ptr->active = &pools_ptr->elems[1]; // chk
|
||||
state_ptr->pool = pool_ptr;
|
||||
state_ptr->pools = pools_ptr;
|
||||
// Init Dummy structs
|
||||
struct ncclProxyArgs* pool_ptr = new ncclProxyArgs;
|
||||
struct ncclProxyPool* pools_ptr = new ncclProxyPool;
|
||||
struct ncclProxyPool* pools2_ptr = new ncclProxyPool;
|
||||
struct ncclProxyProgressState* state_ptr = new ncclProxyProgressState;
|
||||
|
||||
pools_ptr->next = pools2_ptr;
|
||||
// state_ptr = &state;
|
||||
state_ptr->active = &pools_ptr->elems[1]; // chk
|
||||
state_ptr->pool = pool_ptr;
|
||||
state_ptr->pools = pools_ptr;
|
||||
|
||||
struct ncclProxyArgs *x =
|
||||
&pools_ptr->elems[5]; // Passing the 5th element of the pool
|
||||
struct ncclProxyProgressState *y = state_ptr;
|
||||
y->pools->next = y->pools; // next points to self
|
||||
pools_ptr->next = pools2_ptr;
|
||||
|
||||
INFO(NCCL_LOG_INFO, "[ProxyTests] x=%u y->pools=%u x-y=%u \n", x,
|
||||
y->pools->elems, x - y->pools->elems);
|
||||
struct ncclProxyArgs* x = &pools_ptr->elems[5]; // Passing the 5th element of the pool
|
||||
struct ncclProxyProgressState* y = state_ptr;
|
||||
y->pools->next = y->pools; // next points to self
|
||||
|
||||
int pool_idx, opIndex;
|
||||
ncclResult_t res = getOpIndex(x, y, &pool_idx, &opIndex);
|
||||
INFO(
|
||||
"[ProxyTests] x=%p y->pools=%p x-y=%ld \n",
|
||||
(void*)x,
|
||||
(void*)y->pools->elems,
|
||||
x - y->pools->elems
|
||||
);
|
||||
|
||||
ASSERT_EQ(pool_idx, 0);
|
||||
ASSERT_EQ(opIndex, 5);
|
||||
int pool_idx, opIndex;
|
||||
ncclResult_t res = getOpIndex(x, y, &pool_idx, &opIndex);
|
||||
|
||||
INFO(NCCL_LOG_INFO, "[ProxyTests] pool_idx %d opIndex %d \n", pool_idx,
|
||||
opIndex);
|
||||
INFO(NCCL_LOG_INFO, "[ProxyTests] res %u \n", res);
|
||||
assert(res == ncclSuccess);
|
||||
ASSERT_EQ(pool_idx, 0);
|
||||
ASSERT_EQ(opIndex, 5);
|
||||
|
||||
INFO("[ProxyTests] pool_idx %d opIndex %d \n", pool_idx, opIndex);
|
||||
INFO("[ProxyTests] res %u \n", res);
|
||||
assert(res == ncclSuccess);
|
||||
|
||||
delete pool_ptr;
|
||||
delete pools_ptr;
|
||||
delete pools2_ptr;
|
||||
delete state_ptr;
|
||||
INFO(NCCL_LOG_INFO, "[ProxyTests] Test Complete \n");
|
||||
INFO("[ProxyTests] Test Complete \n");
|
||||
}
|
||||
|
||||
TEST(ProxyTests, printProxyOp) {
|
||||
INFO(NCCL_LOG_INFO, "[ProxyTests] Test Start \n");
|
||||
// Init Dummy structs
|
||||
TEST(ProxyTests, printProxyOp)
|
||||
{
|
||||
INFO("[ProxyTests] Test Start \n");
|
||||
// Init Dummy structs
|
||||
|
||||
struct ncclProxyArgs *pool_ptr = new ncclProxyArgs;
|
||||
struct ncclProxyArgs* pool_ptr = new ncclProxyArgs;
|
||||
|
||||
struct ncclProxyPool *pools_ptr = new ncclProxyPool;
|
||||
struct ncclProxyPool *pools2_ptr = new ncclProxyPool;
|
||||
struct ncclProxyPool* pools_ptr = new ncclProxyPool;
|
||||
struct ncclProxyPool* pools2_ptr = new ncclProxyPool;
|
||||
|
||||
struct ncclProxyProgressState *state_ptr = new ncclProxyProgressState;
|
||||
struct ncclProxyProgressState* state_ptr = new ncclProxyProgressState;
|
||||
|
||||
// state_ptr = &state;
|
||||
state_ptr->active = &pools_ptr->elems[1]; // chk
|
||||
state_ptr->pool = pool_ptr;
|
||||
state_ptr->pools = pools_ptr;
|
||||
// state_ptr = &state;
|
||||
state_ptr->active = &pools_ptr->elems[1]; // chk
|
||||
state_ptr->pool = pool_ptr;
|
||||
state_ptr->pools = pools_ptr;
|
||||
|
||||
pools_ptr->next = pools2_ptr;
|
||||
pools_ptr->next = pools2_ptr;
|
||||
|
||||
struct ncclProxyArgs *x = &pools_ptr->elems[5];
|
||||
struct ncclProxyProgressState *y = state_ptr;
|
||||
y->pools->next = y->pools; // next points to self
|
||||
struct ncclProxyArgs* x = &pools_ptr->elems[5];
|
||||
struct ncclProxyProgressState* y = state_ptr;
|
||||
y->pools->next = y->pools; // next points to self
|
||||
|
||||
INFO(NCCL_LOG_INFO, "[ProxyTests] x=%u y->pools=%u x-y=%u \n", x,
|
||||
y->pools->elems, x - y->pools->elems);
|
||||
INFO(
|
||||
"[ProxyTests] x=%p y->pools=%p x-y=%ld \n",
|
||||
(void*)x,
|
||||
(void*)y->pools->elems,
|
||||
x - y->pools->elems
|
||||
);
|
||||
|
||||
init_ncclProxyArgs_struct(pool_ptr);
|
||||
init_ncclProxyArgs_struct(pool_ptr);
|
||||
|
||||
int pool_idx = 2, opIndex = 3; // random vals
|
||||
ncclResult_t res = printProxyOp(pool_ptr, pool_idx, opIndex);
|
||||
int pool_idx = 2, opIndex = 3; // random vals
|
||||
ncclResult_t res = printProxyOp(pool_ptr, pool_idx, opIndex);
|
||||
|
||||
INFO(NCCL_LOG_INFO, "[ProxyTests] res %u \n", res);
|
||||
assert(res == ncclSuccess);
|
||||
INFO("[ProxyTests] res %u \n", res);
|
||||
assert(res == ncclSuccess);
|
||||
|
||||
delete pools_ptr;
|
||||
delete pools2_ptr;
|
||||
delete pool_ptr;
|
||||
delete state_ptr;
|
||||
INFO(NCCL_LOG_INFO, "[ProxyTests] Test Complete \n");
|
||||
INFO("[ProxyTests] Test Complete \n");
|
||||
}
|
||||
|
||||
TEST(ProxyTests, dumpProxyState) {
|
||||
INFO(NCCL_LOG_INFO, "[ProxyTests] Test Start \n");
|
||||
TEST(ProxyTests, dumpProxyState)
|
||||
{
|
||||
INFO("[ProxyTests] Test Start \n");
|
||||
|
||||
// Init Dummy structs
|
||||
struct ncclProxyArgs *pool_ptr;
|
||||
struct ncclProxyPool *pools_ptr = new ncclProxyPool;
|
||||
struct ncclProxyPool *pools2_ptr = new ncclProxyPool;
|
||||
// Init Dummy structs
|
||||
struct ncclProxyArgs* pool_ptr;
|
||||
struct ncclProxyPool* pools_ptr = new ncclProxyPool;
|
||||
struct ncclProxyPool* pools2_ptr = new ncclProxyPool;
|
||||
|
||||
struct ncclProxyProgressState *state_ptr = new ncclProxyProgressState;
|
||||
struct ncclProxyProgressState* state_ptr = new ncclProxyProgressState;
|
||||
|
||||
state_ptr->active = &pools_ptr->elems[1];
|
||||
pool_ptr = &pools_ptr->elems[4];
|
||||
pool_ptr->next = NULL;
|
||||
pool_ptr->nextPeer = NULL;
|
||||
state_ptr->active = &pools_ptr->elems[1];
|
||||
pool_ptr = &pools_ptr->elems[4];
|
||||
pool_ptr->next = NULL;
|
||||
pool_ptr->nextPeer = NULL;
|
||||
|
||||
state_ptr->pool = pool_ptr;
|
||||
state_ptr->pool->next = NULL;
|
||||
state_ptr->pool->nextPeer = NULL;
|
||||
state_ptr->pool->state = OP_SEEN;
|
||||
state_ptr->pools = pools_ptr;
|
||||
state_ptr->pools->next = NULL;
|
||||
state_ptr->pool = pool_ptr;
|
||||
state_ptr->pool->next = NULL;
|
||||
state_ptr->pool->nextPeer = NULL;
|
||||
state_ptr->pool->state = OP_SEEN;
|
||||
state_ptr->pools = pools_ptr;
|
||||
state_ptr->pools->next = NULL;
|
||||
|
||||
struct ncclProxyArgs *op = state_ptr->active;
|
||||
op->state = OP_SEEN;
|
||||
op->nextPeer = NULL;
|
||||
op->next = NULL;
|
||||
struct ncclProxyArgs* op = state_ptr->active;
|
||||
op->state = OP_SEEN;
|
||||
op->nextPeer = NULL;
|
||||
op->next = NULL;
|
||||
|
||||
pools_ptr->next = NULL;
|
||||
pools_ptr->next = NULL;
|
||||
|
||||
init_ncclProxyArgs_struct(pool_ptr);
|
||||
init_ncclProxyArgs_struct(pool_ptr);
|
||||
|
||||
int pool_idx = 2, opIndex = 3; // random vals
|
||||
ncclResult_t res = dumpProxyState(state_ptr);
|
||||
int pool_idx = 2, opIndex = 3; // random vals
|
||||
ncclResult_t res = dumpProxyState(state_ptr);
|
||||
|
||||
INFO(NCCL_LOG_INFO, "[ProxyTests] res %u \n", res);
|
||||
ASSERT_EQ(res, ncclSuccess);
|
||||
INFO("[ProxyTests] res %u \n", res);
|
||||
ASSERT_EQ(res, ncclSuccess);
|
||||
|
||||
delete pools_ptr;
|
||||
|
||||
|
||||
delete pools2_ptr;
|
||||
|
||||
|
||||
|
||||
delete state_ptr;
|
||||
INFO(NCCL_LOG_INFO, "[ProxyTests] Test Complete \n");
|
||||
INFO("[ProxyTests] Test Complete \n");
|
||||
}
|
||||
|
||||
TEST(ProxyTests, ncclProxyCallBlockingUDS) {
|
||||
INFO(NCCL_LOG_INFO, "[ProxyTests] Test Start \n");
|
||||
TEST(ProxyTests, ncclProxyCallBlockingUDS)
|
||||
{
|
||||
INFO("[ProxyTests] Test Start \n");
|
||||
|
||||
// Init Dummy structs
|
||||
struct ncclComm *comm = new ncclComm;
|
||||
int *arr = new int[100];
|
||||
for (int i = 0; i < 100; i++) {
|
||||
arr[i] = i;
|
||||
}
|
||||
// Init Dummy structs
|
||||
struct ncclComm* comm = new ncclComm;
|
||||
int* arr = new int[100];
|
||||
for(int i = 0; i < 100; i++)
|
||||
{
|
||||
arr[i] = i;
|
||||
}
|
||||
|
||||
comm->topParentLocalRanks = arr;
|
||||
comm->localRank = 10;
|
||||
comm->topParentLocalRanks = arr;
|
||||
comm->localRank = 10;
|
||||
|
||||
int *arr_x = new int[20];
|
||||
for (int i = 0; i < 20; i++) {
|
||||
arr_x[i] = i;
|
||||
}
|
||||
comm->topParentRanks = arr_x;
|
||||
int* arr_x = new int[20];
|
||||
for(int i = 0; i < 20; i++)
|
||||
{
|
||||
arr_x[i] = i;
|
||||
}
|
||||
comm->topParentRanks = arr_x;
|
||||
|
||||
struct ncclProxyState *sharedProxyState = new ncclProxyState;
|
||||
uint64_t *arr2 = new uint64_t[10];
|
||||
for (int i = 0; i < 10; i++) {
|
||||
arr2[i] = 122567 + i; // random
|
||||
}
|
||||
struct ncclProxyState* sharedProxyState = new ncclProxyState;
|
||||
uint64_t* arr2 = new uint64_t[10];
|
||||
for(int i = 0; i < 10; i++)
|
||||
{
|
||||
arr2[i] = 122567 + i; // random
|
||||
}
|
||||
|
||||
INFO(NCCL_LOG_INFO, "[ProxyTests] sizeof(ncclProxyConnector) = %u\n",
|
||||
sizeof(ncclProxyConnector));
|
||||
struct ncclProxyConnector *proxyConn =
|
||||
new (std::nothrow) ncclProxyConnector[20];
|
||||
if (proxyConn == nullptr) {
|
||||
// Handle allocation failure
|
||||
INFO(NCCL_LOG_INFO, "[ProxyTests] Allocation failed\n");
|
||||
ASSERT_NE(proxyConn, nullptr);
|
||||
}
|
||||
INFO("[ProxyTests] sizeof(ncclProxyConnector) = %zu\n", sizeof(ncclProxyConnector));
|
||||
struct ncclProxyConnector* proxyConn = new(std::nothrow) ncclProxyConnector[20];
|
||||
if(proxyConn == nullptr)
|
||||
{
|
||||
// Handle allocation failure
|
||||
INFO("[ProxyTests] Allocation failed\n");
|
||||
ASSERT_NE(proxyConn, nullptr);
|
||||
}
|
||||
|
||||
proxyConn->tpRank = 2;
|
||||
proxyConn->tpRank = 2;
|
||||
|
||||
comm->proxyState = sharedProxyState;
|
||||
comm->proxyState = sharedProxyState;
|
||||
|
||||
comm->proxyState->peerAddressesUDS = arr2;
|
||||
comm->proxyState->peerAddressesUDS = arr2;
|
||||
|
||||
comm->abortFlag = NULL;
|
||||
comm->abortFlag = NULL;
|
||||
|
||||
int rank = comm->topParentLocalRanks[comm->localRank];
|
||||
INFO(NCCL_LOG_INFO, "[ProxyTests] rank %d\n", rank);
|
||||
uint64_t pidHash = sharedProxyState->peerAddressesUDS[proxyConn->tpRank];
|
||||
INFO(NCCL_LOG_INFO, "[ProxyTests] pidHash %u \n", pidHash);
|
||||
int rank = comm->topParentLocalRanks[comm->localRank];
|
||||
INFO("[ProxyTests] rank %d\n", rank);
|
||||
uint64_t pidHash = sharedProxyState->peerAddressesUDS[proxyConn->tpRank];
|
||||
INFO("[ProxyTests] pidHash %lu \n", pidHash);
|
||||
|
||||
int type = ncclProxyMsgGetFd;
|
||||
// some memory on stack for storing request and response buffers
|
||||
uint64_t *x_mem = new uint64_t[10];
|
||||
uint64_t *x_mem2 = new uint64_t[10];
|
||||
void *reqBuff = (void *)x_mem;
|
||||
int reqSize = sizeof(uint64_t) * 5;
|
||||
void *respBuff = NULL;
|
||||
int respSize = 0;
|
||||
int *reqFd = NULL;
|
||||
int *respFd = (int *)x_mem2;
|
||||
int type = ncclProxyMsgGetFd;
|
||||
// some memory on stack for storing request and response buffers
|
||||
uint64_t* x_mem = new uint64_t[10];
|
||||
uint64_t* x_mem2 = new uint64_t[10];
|
||||
void* reqBuff = (void*)x_mem;
|
||||
int reqSize = sizeof(uint64_t) * 5;
|
||||
void* respBuff = NULL;
|
||||
int respSize = 0;
|
||||
int* reqFd = NULL;
|
||||
int* respFd = (int*)x_mem2;
|
||||
|
||||
ncclResult_t res =
|
||||
ncclProxyCallBlockingUDS(comm, proxyConn, type, reqBuff, reqSize,
|
||||
respBuff, respSize, reqFd, respFd);
|
||||
ncclResult_t res = ncclProxyCallBlockingUDS(
|
||||
comm,
|
||||
proxyConn,
|
||||
type,
|
||||
reqBuff,
|
||||
reqSize,
|
||||
respBuff,
|
||||
respSize,
|
||||
reqFd,
|
||||
respFd
|
||||
);
|
||||
|
||||
bool bool_res = (res >= ncclSuccess && res <= ncclRemoteError);
|
||||
INFO(NCCL_LOG_INFO, "[ProxyTests] res %u \n", bool_res);
|
||||
ASSERT_EQ(bool_res, true);
|
||||
bool bool_res = (res >= ncclSuccess && res <= ncclRemoteError);
|
||||
INFO("[ProxyTests] res %u \n", bool_res);
|
||||
ASSERT_EQ(bool_res, true);
|
||||
delete comm;
|
||||
delete sharedProxyState;
|
||||
delete proxyConn;
|
||||
delete[] proxyConn;
|
||||
delete[] arr_x;
|
||||
delete[] arr;
|
||||
delete[] arr2;
|
||||
delete[] x_mem;
|
||||
delete[] x_mem2;
|
||||
|
||||
INFO(NCCL_LOG_INFO, "[ProxyTests] Test Complete \n");
|
||||
INFO("[ProxyTests] Test Complete \n");
|
||||
}
|
||||
|
||||
TEST(ProxyTests, ncclProxyClientGetFdBlocking) {
|
||||
INFO(NCCL_LOG_INFO, "[ProxyTests] Test Start \n");
|
||||
TEST(ProxyTests, ncclProxyClientGetFdBlocking)
|
||||
{
|
||||
RUN_ISOLATED_TEST(
|
||||
"ncclProxyClientGetFdBlocking",
|
||||
[]()
|
||||
{
|
||||
INFO("[ProxyTests] Test Start \n");
|
||||
|
||||
// Init Dummy structs
|
||||
struct ncclComm *comm = new ncclComm;
|
||||
int *arr = new int[100];
|
||||
for (int i = 0; i < 100; i++) {
|
||||
arr[i] = i;
|
||||
}
|
||||
// Init Dummy structs
|
||||
struct ncclComm* comm = new ncclComm;
|
||||
int* arr = new int[100];
|
||||
for(int i = 0; i < 100; i++)
|
||||
{
|
||||
arr[i] = i;
|
||||
}
|
||||
|
||||
comm->topParentLocalRanks = arr;
|
||||
comm->localRank = 10;
|
||||
struct ncclProxyState *sharedProxyState = new ncclProxyState;
|
||||
comm->topParentLocalRanks = arr;
|
||||
comm->localRank = 10;
|
||||
struct ncclProxyState* sharedProxyState = new ncclProxyState;
|
||||
|
||||
int *arr_x = new int[20];
|
||||
for (int i = 0; i < 20; i++) {
|
||||
arr_x[i] = i;
|
||||
}
|
||||
comm->topParentRanks = arr_x;
|
||||
int* arr_x = new int[20];
|
||||
for(int i = 0; i < 20; i++)
|
||||
{
|
||||
arr_x[i] = i;
|
||||
}
|
||||
comm->topParentRanks = arr_x;
|
||||
|
||||
uint64_t *arr2 = new uint64_t[10];
|
||||
for (int i = 0; i < 10; i++) {
|
||||
arr2[i] = 122567 + i; // random
|
||||
}
|
||||
uint64_t* arr2 = new uint64_t[10];
|
||||
for(int i = 0; i < 10; i++)
|
||||
{
|
||||
arr2[i] = 122567 + i; // random
|
||||
}
|
||||
|
||||
struct ncclProxyConnector *proxyConn =
|
||||
new (std::nothrow) ncclProxyConnector[20];
|
||||
if (proxyConn == nullptr) {
|
||||
// Handle allocation failure
|
||||
INFO(NCCL_LOG_INFO, "[ProxyTests] Allocation failed\n");
|
||||
ASSERT_NE(proxyConn, nullptr);
|
||||
}
|
||||
struct ncclProxyConnector* proxyConn = new(std::nothrow) ncclProxyConnector[20];
|
||||
if(proxyConn == nullptr)
|
||||
{
|
||||
// Handle allocation failure
|
||||
INFO("[ProxyTests] Allocation failed\n");
|
||||
ASSERT_NE(proxyConn, nullptr);
|
||||
}
|
||||
|
||||
proxyConn->tpRank = 2;
|
||||
comm->proxyState = sharedProxyState;
|
||||
comm->proxyState->peerAddressesUDS = arr2;
|
||||
comm->abortFlag = NULL;
|
||||
proxyConn->tpRank = 2;
|
||||
comm->proxyState = sharedProxyState;
|
||||
comm->proxyState->peerAddressesUDS = arr2;
|
||||
comm->abortFlag = NULL;
|
||||
|
||||
int rank = comm->topParentLocalRanks[comm->localRank];
|
||||
INFO(NCCL_LOG_INFO, "[ProxyTests] rank %d\n", rank);
|
||||
uint64_t pidHash = sharedProxyState->peerAddressesUDS[proxyConn->tpRank];
|
||||
INFO(NCCL_LOG_INFO, "[ProxyTests] pidHash %u \n", pidHash);
|
||||
int rank = comm->topParentLocalRanks[comm->localRank];
|
||||
INFO("[ProxyTests] rank %d\n", rank);
|
||||
uint64_t pidHash = sharedProxyState->peerAddressesUDS[proxyConn->tpRank];
|
||||
INFO("[ProxyTests] pidHash %lu \n", pidHash);
|
||||
|
||||
int type = ncclProxyMsgGetFd;
|
||||
// some memory on stack for storing request and response buffers
|
||||
uint64_t *x_mem = new uint64_t[10];
|
||||
uint64_t *x_mem2 = new uint64_t[10];
|
||||
void *reqBuff = (void *)x_mem;
|
||||
int reqSize = sizeof(uint64_t) * 5;
|
||||
void *respBuff = NULL;
|
||||
int respSize = 0;
|
||||
int *reqFd = NULL;
|
||||
int *respFd = (int *)x_mem2;
|
||||
int type = ncclProxyMsgGetFd;
|
||||
// some memory on stack for storing request and response buffers
|
||||
uint64_t* x_mem = new uint64_t[10];
|
||||
uint64_t* x_mem2 = new uint64_t[10];
|
||||
void* reqBuff = (void*)x_mem;
|
||||
int reqSize = sizeof(uint64_t) * 5;
|
||||
void* respBuff = NULL;
|
||||
int respSize = 0;
|
||||
int* reqFd = NULL;
|
||||
int* respFd = (int*)x_mem2;
|
||||
|
||||
comm->gproxyConn = proxyConn;
|
||||
comm->gproxyConn[rank].initialized = true;
|
||||
comm->gproxyConn = proxyConn;
|
||||
comm->gproxyConn[rank].initialized = true;
|
||||
|
||||
ncclResult_t res = ncclProxyClientGetFdBlocking(comm, rank, reqBuff, respFd);
|
||||
ncclResult_t res = ncclProxyClientGetFdBlocking(comm, rank, reqBuff, respFd);
|
||||
|
||||
bool bool_res = (res >= ncclSuccess && res <= ncclRemoteError);
|
||||
INFO(NCCL_LOG_INFO, "[ProxyTests] res %u \n", bool_res);
|
||||
ASSERT_EQ(bool_res, true);
|
||||
bool bool_res = (res >= ncclSuccess && res <= ncclRemoteError);
|
||||
INFO("[ProxyTests] res %u \n", bool_res);
|
||||
ASSERT_EQ(bool_res, true);
|
||||
|
||||
delete comm;
|
||||
delete sharedProxyState;
|
||||
delete proxyConn;
|
||||
delete[] arr_x;
|
||||
delete[] arr;
|
||||
delete[] arr2;
|
||||
delete[] x_mem;
|
||||
delete[] x_mem2;
|
||||
INFO(NCCL_LOG_INFO, "[ProxyTests] Test Complete \n");
|
||||
delete comm;
|
||||
delete sharedProxyState;
|
||||
delete[] proxyConn;
|
||||
delete[] arr_x;
|
||||
delete[] arr;
|
||||
delete[] arr2;
|
||||
delete[] x_mem;
|
||||
delete[] x_mem2;
|
||||
INFO("[ProxyTests] Test Complete \n");
|
||||
INFO("Test 'ncclProxyClientGetFdBlocking' PASSED\n");
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
TEST(ProxyTests, ncclProxyClientQueryFdBlocking) {
|
||||
INFO(NCCL_LOG_INFO, "[ProxyTests] Test Start \n");
|
||||
TEST(ProxyTests, ncclProxyClientQueryFdBlocking)
|
||||
{
|
||||
RUN_ISOLATED_TEST(
|
||||
"ncclProxyClientQueryFdBlocking",
|
||||
[]()
|
||||
{
|
||||
INFO("[ProxyTests] Test Start \n");
|
||||
|
||||
// Init Dummy structs
|
||||
struct ncclComm *comm = new ncclComm;
|
||||
int *arr = new int[100];
|
||||
for (int i = 0; i < 5; i++) {
|
||||
arr[i] = i;
|
||||
}
|
||||
// Init Dummy structs
|
||||
struct ncclComm* comm = new ncclComm;
|
||||
int* arr = new int[100];
|
||||
for(int i = 0; i < 5; i++)
|
||||
{
|
||||
arr[i] = i;
|
||||
}
|
||||
|
||||
comm->topParentLocalRanks = arr;
|
||||
comm->localRank = 0;
|
||||
comm->topParentLocalRanks = arr;
|
||||
comm->localRank = 0;
|
||||
|
||||
int *arr_x = new int[20];
|
||||
for (int i = 0; i < 20; i++) {
|
||||
arr_x[i] = i;
|
||||
}
|
||||
comm->topParentRanks = arr_x;
|
||||
int* arr_x = new int[20];
|
||||
for(int i = 0; i < 20; i++)
|
||||
{
|
||||
arr_x[i] = i;
|
||||
}
|
||||
comm->topParentRanks = arr_x;
|
||||
|
||||
struct ncclProxyState *sharedProxyState = new ncclProxyState;
|
||||
struct ncclProxyState* sharedProxyState = new ncclProxyState;
|
||||
|
||||
uint64_t *arr2 = new uint64_t[10];
|
||||
for (int i = 0; i < 10; i++) {
|
||||
arr2[i] = 122567 + i; // random
|
||||
}
|
||||
uint64_t* arr2 = new uint64_t[10];
|
||||
for(int i = 0; i < 10; i++)
|
||||
{
|
||||
arr2[i] = 122567 + i; // random
|
||||
}
|
||||
|
||||
struct ncclProxyConnector *proxyConn =
|
||||
new (std::nothrow) ncclProxyConnector[20];
|
||||
if (proxyConn == nullptr) {
|
||||
// Handle allocation failure
|
||||
INFO(NCCL_LOG_INFO, "[ProxyTests] Allocation failed\n");
|
||||
ASSERT_NE(proxyConn, nullptr);
|
||||
}
|
||||
struct ncclProxyConnector* proxyConn = new(std::nothrow) ncclProxyConnector[20];
|
||||
if(proxyConn == nullptr)
|
||||
{
|
||||
// Handle allocation failure
|
||||
INFO("[ProxyTests] Allocation failed\n");
|
||||
ASSERT_NE(proxyConn, nullptr);
|
||||
}
|
||||
|
||||
proxyConn->tpRank = 2;
|
||||
proxyConn->tpRank = 2;
|
||||
|
||||
comm->proxyState = sharedProxyState;
|
||||
comm->proxyState = sharedProxyState;
|
||||
|
||||
comm->proxyState->peerAddressesUDS = arr2;
|
||||
comm->proxyState->peerAddressesUDS = arr2;
|
||||
|
||||
comm->abortFlag = NULL;
|
||||
comm->abortFlag = NULL;
|
||||
|
||||
int rank = comm->topParentLocalRanks[comm->localRank];
|
||||
INFO(NCCL_LOG_INFO, "[ProxyTests] rank %d\n", rank);
|
||||
uint64_t pidHash = sharedProxyState->peerAddressesUDS[proxyConn->tpRank];
|
||||
INFO(NCCL_LOG_INFO, "[ProxyTests] pidHash %u \n", pidHash);
|
||||
int rank = comm->topParentLocalRanks[comm->localRank];
|
||||
INFO("[ProxyTests] rank %d\n", rank);
|
||||
uint64_t pidHash = sharedProxyState->peerAddressesUDS[proxyConn->tpRank];
|
||||
INFO("[ProxyTests] pidHash %lu \n", pidHash);
|
||||
|
||||
int type = ncclProxyMsgGetFd;
|
||||
// some memory on stack for storing request and response buffers
|
||||
uint64_t *x_mem = new uint64_t[10];
|
||||
uint64_t *x_mem2 = new uint64_t[10];
|
||||
void *reqBuff = (void *)x_mem;
|
||||
int reqSize = sizeof(uint64_t) * 5;
|
||||
void *respBuff = NULL;
|
||||
int respSize = 0;
|
||||
int *reqFd = NULL;
|
||||
int *respFd = (int *)x_mem2;
|
||||
int type = ncclProxyMsgGetFd;
|
||||
// some memory on stack for storing request and response buffers
|
||||
uint64_t* x_mem = new uint64_t[10];
|
||||
uint64_t* x_mem2 = new uint64_t[10];
|
||||
void* reqBuff = (void*)x_mem;
|
||||
int reqSize = sizeof(uint64_t) * 5;
|
||||
void* respBuff = NULL;
|
||||
int respSize = 0;
|
||||
int* reqFd = NULL;
|
||||
int* respFd = (int*)x_mem2;
|
||||
|
||||
comm->gproxyConn = proxyConn;
|
||||
comm->gproxyConn[rank].initialized = true;
|
||||
comm->gproxyConn = proxyConn;
|
||||
comm->gproxyConn[rank].initialized = true;
|
||||
|
||||
int localFd = 0;
|
||||
int dummy_int = 20;
|
||||
respBuff = &dummy_int;
|
||||
ncclResult_t res =
|
||||
ncclProxyClientQueryFdBlocking(comm, proxyConn, localFd, (int *)respBuff);
|
||||
int localFd = 0;
|
||||
int dummy_int = 20;
|
||||
respBuff = &dummy_int;
|
||||
ncclResult_t res
|
||||
= ncclProxyClientQueryFdBlocking(comm, proxyConn, localFd, (int*)respBuff);
|
||||
|
||||
bool bool_res = (res >= ncclSuccess && res <= ncclRemoteError);
|
||||
INFO(NCCL_LOG_INFO, "[ProxyTests] res %u \n", bool_res);
|
||||
ASSERT_EQ(bool_res, true);
|
||||
bool bool_res = (res >= ncclSuccess && res <= ncclRemoteError);
|
||||
INFO("[ProxyTests] res %u \n", bool_res);
|
||||
ASSERT_EQ(bool_res, true);
|
||||
|
||||
delete comm;
|
||||
delete sharedProxyState;
|
||||
delete proxyConn;
|
||||
delete[] arr_x;
|
||||
delete[] arr;
|
||||
delete[] arr2;
|
||||
delete[] x_mem;
|
||||
delete[] x_mem2;
|
||||
INFO(NCCL_LOG_INFO, "[ProxyTests] Test Complete \n");
|
||||
delete comm;
|
||||
delete sharedProxyState;
|
||||
delete[] proxyConn;
|
||||
delete[] arr_x;
|
||||
delete[] arr;
|
||||
delete[] arr2;
|
||||
delete[] x_mem;
|
||||
delete[] x_mem2;
|
||||
INFO("[ProxyTests] Test Complete \n");
|
||||
INFO("Test 'ncclProxyClientQueryFdBlocking' PASSED\n");
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
} // namespace RcclUnitTesting
|
||||
|
||||
@@ -14,7 +14,7 @@ The RCCL test suite provides following frameworks along with the existing rccl-U
|
||||
|
||||
## Testing Frameworks
|
||||
|
||||
Following are two new complementary testing frameworks for different testing needs:
|
||||
Following is a new testing framework for running single node & single process test in isolation:
|
||||
|
||||
### 1. Process Isolated Test Runner
|
||||
Run tests in isolated processes with clean environment settings.
|
||||
|
||||
+1073
-2083
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@@ -0,0 +1,696 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "ProcessIsolatedTestRunner.hpp"
|
||||
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#include <gtest/gtest.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <atomic>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
#include <thread>
|
||||
|
||||
#include "ErrCode.hpp"
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
|
||||
// Exit codes for test process results
|
||||
enum RcclTestCode
|
||||
{
|
||||
RCCL_TEST_INVALID = -1,
|
||||
RCCL_TEST_SUCCESS = 0,
|
||||
RCCL_TEST_FAILURE = 1,
|
||||
RCCL_TEST_UNKNOWN_EXCEPTION = 2,
|
||||
RCCL_TEST_TIMEOUT = 3,
|
||||
RCCL_TEST_SKIPPED = 4
|
||||
};
|
||||
|
||||
// Define static members
|
||||
std::mutex ProcessIsolatedTestRunner::testConfigsMutex_;
|
||||
std::vector<ProcessIsolatedTestRunner::TestConfig> ProcessIsolatedTestRunner::testConfigs_;
|
||||
std::mutex ProcessIsolatedTestRunner::resultsMutex_;
|
||||
std::vector<ProcessIsolatedTestRunner::TestResult> ProcessIsolatedTestRunner::testResults_;
|
||||
|
||||
// TestResult implementation
|
||||
ProcessIsolatedTestRunner::TestResult::TestResult()
|
||||
: passed(false), skipped(false), exitCode(-1), processId(-1), duration(0)
|
||||
{}
|
||||
|
||||
// TestConfig implementation
|
||||
ProcessIsolatedTestRunner::TestConfig::TestConfig(
|
||||
const std::string& testName, std::function<void()> logic
|
||||
)
|
||||
: name(testName), testLogic(logic), timeout(30), inheritParentEnv(true)
|
||||
{}
|
||||
|
||||
ProcessIsolatedTestRunner::TestConfig& ProcessIsolatedTestRunner::TestConfig::withEnvironment(
|
||||
const std::unordered_map<std::string, std::string>& env
|
||||
)
|
||||
{
|
||||
environmentVariables = env;
|
||||
return *this;
|
||||
}
|
||||
|
||||
ProcessIsolatedTestRunner::TestConfig&
|
||||
ProcessIsolatedTestRunner::TestConfig::withTimeout(std::chrono::seconds timeoutSeconds)
|
||||
{
|
||||
timeout = timeoutSeconds;
|
||||
return *this;
|
||||
}
|
||||
|
||||
ProcessIsolatedTestRunner::TestConfig&
|
||||
ProcessIsolatedTestRunner::TestConfig::withCleanEnvironment(bool inherit)
|
||||
{
|
||||
inheritParentEnv = inherit;
|
||||
return *this;
|
||||
}
|
||||
|
||||
ProcessIsolatedTestRunner::TestConfig&
|
||||
ProcessIsolatedTestRunner::TestConfig::clearVariable(const std::string& varName)
|
||||
{
|
||||
clearEnvVars.push_back(varName);
|
||||
return *this;
|
||||
}
|
||||
|
||||
ProcessIsolatedTestRunner::TestConfig& ProcessIsolatedTestRunner::TestConfig::setVariable(
|
||||
const std::string& name, const std::string& value
|
||||
)
|
||||
{
|
||||
environmentVariables[name] = value;
|
||||
return *this;
|
||||
}
|
||||
|
||||
// ExecutionOptions implementation
|
||||
ProcessIsolatedTestRunner::ExecutionOptions::ExecutionOptions()
|
||||
: stopOnFirstFailure(false), verboseLogging(true)
|
||||
{}
|
||||
|
||||
// Apply environment variables to current process
|
||||
void ProcessIsolatedTestRunner::applyEnvironmentVariables(const TestConfig& config)
|
||||
{
|
||||
// Clear specified environment variables first
|
||||
for(const auto& varName : config.clearEnvVars)
|
||||
{
|
||||
unsetenv(varName.c_str());
|
||||
}
|
||||
|
||||
// If not inheriting parent environment, clear all environment variables
|
||||
if(!config.inheritParentEnv)
|
||||
{
|
||||
// Clear all existing environment variables
|
||||
if(clearenv() != 0)
|
||||
{
|
||||
std::cerr << "Warning: Failed to clear environment variables" << std::endl;
|
||||
}
|
||||
|
||||
// Set only the specified variables
|
||||
for(const auto& [name, value] : config.environmentVariables)
|
||||
{
|
||||
setenv(name.c_str(), value.c_str(), 1);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Just set/override the specified variables
|
||||
for(const auto& [name, value] : config.environmentVariables)
|
||||
{
|
||||
setenv(name.c_str(), value.c_str(), 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Execute a single test in a separate process
|
||||
int ProcessIsolatedTestRunner::runTestInProcess(const TestConfig& config)
|
||||
{
|
||||
pid_t processId = getpid();
|
||||
|
||||
if(config.name.empty())
|
||||
{
|
||||
std::cerr << "Error: Test name is empty for process " << processId << std::endl;
|
||||
return RCCL_TEST_FAILURE;
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
// Apply environment variables
|
||||
applyEnvironmentVariables(config);
|
||||
|
||||
// Thread-safe test execution with timeout protection
|
||||
std::atomic<bool> testCompleted{false};
|
||||
std::exception_ptr testException = nullptr;
|
||||
bool testPassed = true;
|
||||
bool testSkipped = false;
|
||||
|
||||
// Run test in a separate thread to allow timeout handling
|
||||
std::thread testThread(
|
||||
[&]()
|
||||
{
|
||||
try
|
||||
{
|
||||
// Get initial test state
|
||||
const ::testing::UnitTest* unitTest = ::testing::UnitTest::GetInstance();
|
||||
size_t initialFailureCount = unitTest->failed_test_count();
|
||||
size_t initialSkippedCount = unitTest->skipped_test_count();
|
||||
|
||||
// Execute the test logic
|
||||
config.testLogic();
|
||||
|
||||
// Check if any new test failures occurred
|
||||
size_t finalFailureCount = unitTest->failed_test_count();
|
||||
size_t finalSkippedCount = unitTest->skipped_test_count();
|
||||
|
||||
testPassed = (finalFailureCount == initialFailureCount);
|
||||
testSkipped = (finalSkippedCount > initialSkippedCount);
|
||||
|
||||
testCompleted = true;
|
||||
}
|
||||
catch(...)
|
||||
{
|
||||
testException = std::current_exception();
|
||||
testPassed = false;
|
||||
testCompleted = true;
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
// Wait for test completion with timeout
|
||||
auto start = std::chrono::steady_clock::now();
|
||||
const auto timeout = config.timeout;
|
||||
|
||||
while(!testCompleted.load())
|
||||
{
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
||||
if(std::chrono::steady_clock::now() - start > timeout)
|
||||
{
|
||||
// Test timed out
|
||||
INFO(
|
||||
"Test '%s' TIMED OUT after %ld seconds\n",
|
||||
config.name.c_str(),
|
||||
timeout.count()
|
||||
);
|
||||
fflush(NULL);
|
||||
testThread.detach();
|
||||
return RCCL_TEST_TIMEOUT;
|
||||
}
|
||||
}
|
||||
|
||||
// Wait for thread completion
|
||||
if(testThread.joinable())
|
||||
{
|
||||
testThread.join();
|
||||
}
|
||||
|
||||
// Check if test threw an exception
|
||||
if(testException)
|
||||
{
|
||||
std::rethrow_exception(testException);
|
||||
}
|
||||
|
||||
// Flush output before returning (needed before _exit())
|
||||
fflush(NULL);
|
||||
|
||||
// Return appropriate exit code based on test result
|
||||
if(testSkipped)
|
||||
{
|
||||
return RCCL_TEST_SKIPPED;
|
||||
}
|
||||
else if(testPassed)
|
||||
{
|
||||
return RCCL_TEST_SUCCESS;
|
||||
}
|
||||
else
|
||||
{
|
||||
return RCCL_TEST_FAILURE;
|
||||
}
|
||||
}
|
||||
catch(const std::exception& e)
|
||||
{
|
||||
INFO("Test '%s' FAILED with exception: %s\n", config.name.c_str(), e.what());
|
||||
std::cerr << "Exception in test '" << config.name << "': " << e.what() << std::endl;
|
||||
fflush(NULL);
|
||||
return RCCL_TEST_FAILURE;
|
||||
}
|
||||
catch(...)
|
||||
{
|
||||
INFO("Test '%s' FAILED with unknown exception\n", config.name.c_str());
|
||||
std::cerr << "Unknown exception in test '" << config.name << "'" << std::endl;
|
||||
fflush(NULL);
|
||||
return RCCL_TEST_UNKNOWN_EXCEPTION;
|
||||
}
|
||||
}
|
||||
|
||||
// Register a test configuration
|
||||
void ProcessIsolatedTestRunner::registerTest(const TestConfig& config)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(testConfigsMutex_);
|
||||
testConfigs_.push_back(config);
|
||||
}
|
||||
|
||||
// Register a simple test with just name and logic
|
||||
void ProcessIsolatedTestRunner::registerTest(
|
||||
const std::string& name, std::function<void()> testLogic
|
||||
)
|
||||
{
|
||||
registerTest(TestConfig(name, testLogic));
|
||||
}
|
||||
|
||||
// Register a test with environment variables
|
||||
void ProcessIsolatedTestRunner::registerTest(
|
||||
const std::string& name,
|
||||
std::function<void()> testLogic,
|
||||
const std::unordered_map<std::string, std::string>& env
|
||||
)
|
||||
{
|
||||
registerTest(TestConfig(name, testLogic).withEnvironment(env));
|
||||
}
|
||||
|
||||
// Record test result (thread-safe)
|
||||
void ProcessIsolatedTestRunner::recordTestResult(const TestResult& result)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(resultsMutex_);
|
||||
testResults_.push_back(result);
|
||||
}
|
||||
|
||||
// Helper method: Create pipes for capturing process output
|
||||
bool ProcessIsolatedTestRunner::createOutputPipes(int stdoutPipe[2], int stderrPipe[2])
|
||||
{
|
||||
// Create pipes for stdout and stderr
|
||||
// stdoutPipe[0] = read end, stdoutPipe[1] = write end
|
||||
if(pipe(stdoutPipe) == -1)
|
||||
{
|
||||
std::cerr << "Failed to create stdout pipe: " << strerror(errno) << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
if(pipe(stderrPipe) == -1)
|
||||
{
|
||||
std::cerr << "Failed to create stderr pipe: " << strerror(errno) << std::endl;
|
||||
close(stdoutPipe[0]);
|
||||
close(stdoutPipe[1]);
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Helper method: Redirect child process output to pipes
|
||||
void ProcessIsolatedTestRunner::redirectOutputToPipes(int stdoutPipe[2], int stderrPipe[2])
|
||||
{
|
||||
// Close read ends of pipes in child process (not needed)
|
||||
close(stdoutPipe[0]);
|
||||
close(stderrPipe[0]);
|
||||
|
||||
// Redirect stdout and stderr to write ends of pipes
|
||||
dup2(stdoutPipe[1], STDOUT_FILENO);
|
||||
dup2(stderrPipe[1], STDERR_FILENO);
|
||||
|
||||
// Close the original write end file descriptors after duplication
|
||||
// The duplicated descriptors (STDOUT_FILENO, STDERR_FILENO) will be closed by _exit()
|
||||
close(stdoutPipe[1]);
|
||||
close(stderrPipe[1]);
|
||||
}
|
||||
|
||||
// Helper method: Capture output from child process pipes
|
||||
ProcessIsolatedTestRunner::CapturedOutput ProcessIsolatedTestRunner::captureProcessOutput(
|
||||
int stdoutPipe[2], int stderrPipe[2], pid_t pid, int* status
|
||||
)
|
||||
{
|
||||
// Close write ends of pipes in parent process (not needed)
|
||||
close(stdoutPipe[1]);
|
||||
close(stderrPipe[1]);
|
||||
|
||||
CapturedOutput output;
|
||||
char buffer[4096];
|
||||
ssize_t count;
|
||||
|
||||
// Read from stdout pipe
|
||||
while((count = read(stdoutPipe[0], buffer, sizeof(buffer) - 1)) > 0)
|
||||
{
|
||||
buffer[count] = '\0';
|
||||
output.stdoutContent += buffer;
|
||||
}
|
||||
close(stdoutPipe[0]);
|
||||
|
||||
// Read from stderr pipe
|
||||
while((count = read(stderrPipe[0], buffer, sizeof(buffer) - 1)) > 0)
|
||||
{
|
||||
buffer[count] = '\0';
|
||||
output.stderrContent += buffer;
|
||||
}
|
||||
close(stderrPipe[0]);
|
||||
|
||||
// Wait for child to exit (blocking)
|
||||
waitpid(pid, status, 0);
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
// Helper method: Display captured output
|
||||
void ProcessIsolatedTestRunner::displayCapturedOutput(
|
||||
const CapturedOutput& output, const std::string& testName
|
||||
)
|
||||
{
|
||||
if(!output.stdoutContent.empty())
|
||||
{
|
||||
std::cout << output.stdoutContent;
|
||||
if(output.stdoutContent.back() != '\n')
|
||||
std::cout << '\n';
|
||||
}
|
||||
if(!output.stderrContent.empty())
|
||||
{
|
||||
std::cerr << output.stderrContent;
|
||||
if(output.stderrContent.back() != '\n')
|
||||
std::cerr << '\n';
|
||||
}
|
||||
}
|
||||
|
||||
// Execute all registered tests (simplified sequential execution only)
|
||||
bool ProcessIsolatedTestRunner::executeAllTests(const ExecutionOptions& options)
|
||||
{
|
||||
|
||||
// Get test configurations to run
|
||||
std::vector<TestConfig> testsToRun;
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(testConfigsMutex_);
|
||||
testsToRun = testConfigs_;
|
||||
}
|
||||
|
||||
// Clear previous results
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(resultsMutex_);
|
||||
testResults_.clear();
|
||||
}
|
||||
|
||||
// Sequential execution
|
||||
for(const auto& testConfig : testsToRun)
|
||||
{
|
||||
auto startTime = std::chrono::steady_clock::now();
|
||||
|
||||
int stdout_fd[2], stderr_fd[2];
|
||||
if(!createOutputPipes(stdout_fd, stderr_fd))
|
||||
{
|
||||
std::cerr << "Failed to create output files for test '" << testConfig.name << "'"
|
||||
<< std::endl;
|
||||
continue;
|
||||
}
|
||||
|
||||
pid_t pid = fork();
|
||||
|
||||
if(pid == 0)
|
||||
{
|
||||
redirectOutputToPipes(stdout_fd, stderr_fd);
|
||||
int result = runTestInProcess(testConfig);
|
||||
// Use _exit() instead of exit() to avoid atexit handlers
|
||||
// This prevents GPU runtime cleanup issues after fork
|
||||
_exit(result);
|
||||
}
|
||||
else if(pid > 0)
|
||||
{
|
||||
// Log test start with environment variables if any
|
||||
if(!testConfig.environmentVariables.empty())
|
||||
{
|
||||
std::string envVars;
|
||||
for(const auto& [name, value] : testConfig.environmentVariables)
|
||||
{
|
||||
if(!envVars.empty())
|
||||
envVars += ", ";
|
||||
envVars += name + "=" + value;
|
||||
}
|
||||
INFO(
|
||||
"Running isolated test '%s' (PID: %d) with env: %s\n",
|
||||
testConfig.name.c_str(),
|
||||
pid,
|
||||
envVars.c_str()
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
INFO("Running isolated test '%s' (PID: %d)\n", testConfig.name.c_str(), pid);
|
||||
}
|
||||
int status;
|
||||
CapturedOutput output = captureProcessOutput(stdout_fd, stderr_fd, pid, &status);
|
||||
|
||||
auto endTime = std::chrono::steady_clock::now();
|
||||
auto duration
|
||||
= std::chrono::duration_cast<std::chrono::milliseconds>(endTime - startTime);
|
||||
|
||||
TestResult testResult;
|
||||
testResult.testName = testConfig.name;
|
||||
testResult.processId = pid;
|
||||
testResult.duration = duration;
|
||||
|
||||
if(WIFEXITED(status))
|
||||
{
|
||||
int exitCode = WEXITSTATUS(status);
|
||||
testResult.exitCode = exitCode;
|
||||
testResult.passed = (exitCode == RCCL_TEST_SUCCESS);
|
||||
testResult.skipped = (exitCode == RCCL_TEST_SKIPPED);
|
||||
|
||||
if(exitCode == RCCL_TEST_SUCCESS)
|
||||
{
|
||||
INFO("Test '%s' PASSED (%ld ms)\n", testConfig.name.c_str(), duration.count());
|
||||
}
|
||||
else if(exitCode == RCCL_TEST_TIMEOUT)
|
||||
{
|
||||
INFO(
|
||||
"Test '%s' (PID: %d) TIMED OUT after %ld ms\n",
|
||||
testConfig.name.c_str(),
|
||||
pid,
|
||||
duration.count()
|
||||
);
|
||||
testResult.errorMessage = "Test timed out";
|
||||
}
|
||||
else if(exitCode == RCCL_TEST_SKIPPED)
|
||||
{
|
||||
INFO(
|
||||
"Test '%s' (PID: %d) SKIPPED in %ld ms\n",
|
||||
testConfig.name.c_str(),
|
||||
pid,
|
||||
duration.count()
|
||||
);
|
||||
testResult.errorMessage = "Test skipped";
|
||||
}
|
||||
else
|
||||
{
|
||||
INFO(
|
||||
"Test '%s' (PID: %d) FAILED with exit code %d after %ld ms\n",
|
||||
testConfig.name.c_str(),
|
||||
pid,
|
||||
exitCode,
|
||||
duration.count()
|
||||
);
|
||||
testResult.errorMessage
|
||||
= "Test failed with exit code " + std::to_string(exitCode);
|
||||
}
|
||||
}
|
||||
else if(WIFSIGNALED(status))
|
||||
{
|
||||
int signal = WTERMSIG(status);
|
||||
|
||||
// Check if test reported success before signal termination
|
||||
bool testPassed = (output.stdoutContent.find("PASSED") != std::string::npos);
|
||||
|
||||
if(testPassed)
|
||||
{
|
||||
// Test completed successfully before signal (e.g., GPU runtime cleanup)
|
||||
testResult.passed = true;
|
||||
testResult.skipped = false;
|
||||
testResult.exitCode = RCCL_TEST_SUCCESS;
|
||||
INFO("Test '%s' PASSED (%ld ms)\n", testConfig.name.c_str(), duration.count());
|
||||
}
|
||||
else
|
||||
{
|
||||
// Test terminated by signal before completion (crash)
|
||||
testResult.passed = false;
|
||||
testResult.skipped = false;
|
||||
testResult.exitCode = -signal;
|
||||
testResult.errorMessage = "Terminated by signal " + std::to_string(signal);
|
||||
INFO(
|
||||
"Test '%s' (PID: %d) terminated by signal %d after %ld ms\n",
|
||||
testConfig.name.c_str(),
|
||||
pid,
|
||||
signal,
|
||||
duration.count()
|
||||
);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
testResult.passed = false;
|
||||
testResult.skipped = false;
|
||||
testResult.exitCode = RCCL_TEST_INVALID;
|
||||
testResult.errorMessage = "Failed to wait for process";
|
||||
}
|
||||
|
||||
displayCapturedOutput(output, testConfig.name);
|
||||
|
||||
recordTestResult(testResult);
|
||||
|
||||
// Stop on first failure if requested
|
||||
if(options.stopOnFirstFailure && !testResult.passed && !testResult.skipped)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Fork failed
|
||||
TestResult testResult;
|
||||
testResult.testName = testConfig.name;
|
||||
testResult.passed = false;
|
||||
testResult.skipped = false;
|
||||
testResult.exitCode = RCCL_TEST_INVALID;
|
||||
testResult.processId = RCCL_TEST_INVALID;
|
||||
testResult.duration = std::chrono::milliseconds(0);
|
||||
testResult.errorMessage = "Failed to fork process";
|
||||
|
||||
recordTestResult(testResult);
|
||||
INFO("Failed to fork process for test '%s'\n", testConfig.name.c_str());
|
||||
|
||||
if(options.stopOnFirstFailure)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool result = generateReport(options);
|
||||
|
||||
// Automatically clear test configurations and results after execution
|
||||
// This ensures a clean state for the next test suite without requiring
|
||||
// explicit clear() calls from test cases
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(testConfigsMutex_);
|
||||
testConfigs_.clear();
|
||||
}
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(resultsMutex_);
|
||||
testResults_.clear();
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// Generate and display test report
|
||||
bool ProcessIsolatedTestRunner::generateReport(const ExecutionOptions& options)
|
||||
{
|
||||
int totalTests = 0;
|
||||
int passedTests = 0;
|
||||
int failedTests = 0;
|
||||
int skippedTests = 0;
|
||||
std::chrono::milliseconds totalDuration{0};
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(resultsMutex_);
|
||||
totalTests = testResults_.size();
|
||||
|
||||
for(const auto& result : testResults_)
|
||||
{
|
||||
if(result.skipped)
|
||||
{
|
||||
skippedTests++;
|
||||
}
|
||||
else if(result.passed)
|
||||
{
|
||||
passedTests++;
|
||||
}
|
||||
else
|
||||
{
|
||||
failedTests++;
|
||||
}
|
||||
totalDuration += result.duration;
|
||||
}
|
||||
}
|
||||
|
||||
// Report summary only if there are failures or multiple tests
|
||||
if(failedTests > 0 || totalTests > 1)
|
||||
{
|
||||
INFO(
|
||||
"Process-Isolated Tests: %d passed, %d failed, %d skipped (%ld ms total)\n",
|
||||
passedTests,
|
||||
failedTests,
|
||||
skippedTests,
|
||||
totalDuration.count()
|
||||
);
|
||||
|
||||
if(failedTests > 0)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(resultsMutex_);
|
||||
for(const auto& result : testResults_)
|
||||
{
|
||||
if(!result.passed && !result.skipped)
|
||||
{
|
||||
INFO(
|
||||
" Failed: %s - %s\n",
|
||||
result.testName.c_str(),
|
||||
result.errorMessage.c_str()
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return failedTests == 0;
|
||||
}
|
||||
|
||||
// Get detailed test results (thread-safe)
|
||||
std::vector<ProcessIsolatedTestRunner::TestResult> ProcessIsolatedTestRunner::getTestResults()
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(resultsMutex_);
|
||||
return testResults_;
|
||||
}
|
||||
|
||||
// Clear test registry and results (thread-safe)
|
||||
void ProcessIsolatedTestRunner::clear()
|
||||
{
|
||||
size_t registeredCount = 0;
|
||||
size_t executedCount = 0;
|
||||
|
||||
// Check for unexecuted tests before clearing
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(testConfigsMutex_);
|
||||
registeredCount = testConfigs_.size();
|
||||
}
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(resultsMutex_);
|
||||
executedCount = testResults_.size();
|
||||
}
|
||||
|
||||
// Warn if tests were registered but not all executed
|
||||
if(registeredCount > 0 && executedCount < registeredCount)
|
||||
{
|
||||
std::cerr << "\n⚠️ WARNING: ProcessIsolatedTestRunner::clear() called with "
|
||||
<< (registeredCount - executedCount) << " unexecuted test(s)!\n"
|
||||
<< " Registered: " << registeredCount << " test(s)\n"
|
||||
<< " Executed: " << executedCount << " test(s)\n"
|
||||
<< " Did you forget to call executeAllTests()?\n"
|
||||
<< std::endl;
|
||||
}
|
||||
|
||||
// Clear the registrations and results
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(testConfigsMutex_);
|
||||
testConfigs_.clear();
|
||||
}
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(resultsMutex_);
|
||||
testResults_.clear();
|
||||
}
|
||||
}
|
||||
|
||||
// Get number of registered tests
|
||||
size_t ProcessIsolatedTestRunner::getTestCount()
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(testConfigsMutex_);
|
||||
return testConfigs_.size();
|
||||
}
|
||||
|
||||
} // namespace RcclUnitTesting
|
||||
@@ -0,0 +1,365 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#pragma once
|
||||
|
||||
#include <sys/wait.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <chrono>
|
||||
#include <cstdlib>
|
||||
#include <functional>
|
||||
#include <mutex>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
|
||||
/**
|
||||
* @brief Generic thread-safe process isolated test runner
|
||||
*
|
||||
* This class provides a framework for running tests in isolated processes
|
||||
* with clean environment settings and sequential execution.
|
||||
*
|
||||
*/
|
||||
class ProcessIsolatedTestRunner
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* @brief Test execution result structure
|
||||
*/
|
||||
struct TestResult
|
||||
{
|
||||
std::string testName; ///< Name of the test
|
||||
bool passed; ///< Whether the test passed
|
||||
bool skipped; ///< Whether the test skipped
|
||||
int exitCode; ///< Process exit code
|
||||
pid_t processId; ///< Process ID that ran the test
|
||||
std::chrono::milliseconds duration; ///< Test execution duration
|
||||
std::string errorMessage; ///< Error message if test failed
|
||||
std::unordered_map<std::string, std::string> environment; ///< Environment variables used
|
||||
|
||||
/**
|
||||
* @brief Default constructor
|
||||
*/
|
||||
TestResult();
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Test configuration structure
|
||||
*/
|
||||
struct TestConfig
|
||||
{
|
||||
std::string name; ///< Test name
|
||||
std::function<void()> testLogic; ///< Test function to execute
|
||||
std::unordered_map<std::string, std::string>
|
||||
environmentVariables; ///< Environment variables to set
|
||||
std::chrono::seconds timeout; ///< Test timeout
|
||||
bool inheritParentEnv; ///< Whether to inherit parent environment
|
||||
std::vector<std::string> clearEnvVars; ///< Environment variables to explicitly clear
|
||||
|
||||
/**
|
||||
* @brief Constructor
|
||||
* @param testName Name of the test
|
||||
* @param logic Test function to execute
|
||||
*/
|
||||
TestConfig(const std::string& testName, std::function<void()> logic);
|
||||
|
||||
/**
|
||||
* @brief Set environment variables for this test
|
||||
* @param env Map of environment variable name-value pairs
|
||||
* @return Reference to this TestConfig for method chaining
|
||||
*/
|
||||
TestConfig& withEnvironment(const std::unordered_map<std::string, std::string>& env);
|
||||
|
||||
/**
|
||||
* @brief Set timeout for this test
|
||||
* @param timeoutSeconds Timeout in seconds
|
||||
* @return Reference to this TestConfig for method chaining
|
||||
*/
|
||||
TestConfig& withTimeout(std::chrono::seconds timeoutSeconds);
|
||||
|
||||
/**
|
||||
* @brief Configure environment inheritance
|
||||
* @param inherit Whether to inherit parent environment variables
|
||||
* @return Reference to this TestConfig for method chaining
|
||||
*/
|
||||
TestConfig& withCleanEnvironment(bool inherit = false);
|
||||
|
||||
/**
|
||||
* @brief Clear a specific environment variable
|
||||
* @param varName Name of the variable to clear
|
||||
* @return Reference to this TestConfig for method chaining
|
||||
*/
|
||||
TestConfig& clearVariable(const std::string& varName);
|
||||
|
||||
/**
|
||||
* @brief Set a specific environment variable
|
||||
* @param name Variable name
|
||||
* @param value Variable value
|
||||
* @return Reference to this TestConfig for method chaining
|
||||
*/
|
||||
TestConfig& setVariable(const std::string& name, const std::string& value);
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Execution options for test runner
|
||||
*/
|
||||
struct ExecutionOptions
|
||||
{
|
||||
bool stopOnFirstFailure; ///< Stop execution on first test failure
|
||||
bool verboseLogging; ///< Enable verbose logging
|
||||
|
||||
/**
|
||||
* @brief Default constructor with sensible defaults
|
||||
*/
|
||||
ExecutionOptions();
|
||||
};
|
||||
|
||||
private:
|
||||
/**
|
||||
* @brief Structure to hold captured process output
|
||||
*/
|
||||
struct CapturedOutput
|
||||
{
|
||||
std::string stdoutContent; ///< Captured stdout content
|
||||
std::string stderrContent; ///< Captured stderr content
|
||||
};
|
||||
|
||||
// Thread-safe static members for test management
|
||||
static std::mutex testConfigsMutex_;
|
||||
static std::vector<TestConfig> testConfigs_;
|
||||
static std::mutex resultsMutex_;
|
||||
static std::vector<TestResult> testResults_;
|
||||
|
||||
/**
|
||||
* @brief Apply environment variables to current process
|
||||
* @param config Test configuration containing environment settings
|
||||
*/
|
||||
static void applyEnvironmentVariables(const TestConfig& config);
|
||||
|
||||
/**
|
||||
* @brief Execute a single test in the child process
|
||||
* @param config Test configuration
|
||||
* @return Exit code (0 for success, non-zero for failure)
|
||||
*/
|
||||
static int runTestInProcess(const TestConfig& config);
|
||||
|
||||
/**
|
||||
* @brief Create pipes for capturing process output
|
||||
* @param stdoutPipe Array to hold stdout pipe file descriptors [read, write]
|
||||
* @param stderrPipe Array to hold stderr pipe file descriptors [read, write]
|
||||
* @return True if pipes were created successfully, false otherwise
|
||||
*/
|
||||
static bool createOutputPipes(int stdoutPipe[2], int stderrPipe[2]);
|
||||
|
||||
/**
|
||||
* @brief Redirect child process output to pipes
|
||||
* @param stdoutPipe Stdout pipe file descriptors [read, write]
|
||||
* @param stderrPipe Stderr pipe file descriptors [read, write]
|
||||
*/
|
||||
static void redirectOutputToPipes(int stdoutPipe[2], int stderrPipe[2]);
|
||||
|
||||
/**
|
||||
* @brief Capture output from child process via pipes
|
||||
* @param stdoutPipe Stdout pipe file descriptors [read, write]
|
||||
* @param stderrPipe Stderr pipe file descriptors [read, write]
|
||||
* @param pid Child process ID to monitor
|
||||
* @param status Pointer to status variable for waitpid
|
||||
* @return Captured output from stdout and stderr
|
||||
*/
|
||||
static CapturedOutput
|
||||
captureProcessOutput(int stdoutPipe[2], int stderrPipe[2], pid_t pid, int* status);
|
||||
|
||||
/**
|
||||
* @brief Display captured output with formatted delimiters
|
||||
* @param output Captured output to display
|
||||
* @param testName Name of the test for context
|
||||
*/
|
||||
static void displayCapturedOutput(const CapturedOutput& output, const std::string& testName);
|
||||
|
||||
public:
|
||||
/**
|
||||
* @brief Register a test configuration
|
||||
* @param config Complete test configuration
|
||||
*/
|
||||
static void registerTest(const TestConfig& config);
|
||||
|
||||
/**
|
||||
* @brief Register a simple test with just name and logic
|
||||
* @param name Test name
|
||||
* @param testLogic Test function to execute
|
||||
*/
|
||||
static void registerTest(const std::string& name, std::function<void()> testLogic);
|
||||
|
||||
/**
|
||||
* @brief Register a test with environment variables
|
||||
* @param name Test name
|
||||
* @param testLogic Test function to execute
|
||||
* @param env Environment variables to set for this test
|
||||
*/
|
||||
static void registerTest(
|
||||
const std::string& name,
|
||||
std::function<void()> testLogic,
|
||||
const std::unordered_map<std::string, std::string>& env
|
||||
);
|
||||
|
||||
/**
|
||||
* @brief Record a test result (thread-safe)
|
||||
* @param result Test result to record
|
||||
*/
|
||||
static void recordTestResult(const TestResult& result);
|
||||
|
||||
/**
|
||||
* @brief Execute all registered tests sequentially
|
||||
* @param options Execution options (defaults to continue on failure)
|
||||
* @return True if all tests passed, false otherwise
|
||||
* @note This method automatically clears all test registrations and results
|
||||
* after execution, ensuring a clean state for the next test suite.
|
||||
*/
|
||||
static bool executeAllTests(const ExecutionOptions& options = ExecutionOptions());
|
||||
|
||||
/**
|
||||
* @brief Generate and display test report
|
||||
* @param options Execution options used for the test run
|
||||
* @return True if all tests passed, false otherwise
|
||||
*/
|
||||
static bool generateReport(const ExecutionOptions& options);
|
||||
|
||||
/**
|
||||
* @brief Get detailed test results (thread-safe)
|
||||
* @return Vector of all test results
|
||||
*/
|
||||
static std::vector<TestResult> getTestResults();
|
||||
|
||||
/**
|
||||
* @brief Clear test registry and results (thread-safe)
|
||||
* @note Calling this method manually is typically not necessary, as
|
||||
* executeAllTests() automatically clears registrations after execution.
|
||||
* This method is primarily useful for advanced use cases or when tests
|
||||
* are registered but not executed.
|
||||
*/
|
||||
static void clear();
|
||||
|
||||
/**
|
||||
* @brief Get number of registered tests
|
||||
* @return Number of registered tests
|
||||
*/
|
||||
static size_t getTestCount();
|
||||
};
|
||||
|
||||
// Macros for Simplified Usage
|
||||
|
||||
/**
|
||||
* @brief Register and execute a single isolated test with minimal boilerplate
|
||||
*
|
||||
* Uses variadic macros to automatically handle commas in lambda bodies
|
||||
*
|
||||
* @param test_name Name of the test (string)
|
||||
* @param ... Lambda containing test logic (variadic to handle internal commas)
|
||||
*
|
||||
* Example:
|
||||
* RUN_ISOLATED_TEST("MyTest", []() {
|
||||
* EXPECT_TRUE(someFunction());
|
||||
* });
|
||||
*/
|
||||
#define RUN_ISOLATED_TEST(test_name, ...) \
|
||||
do \
|
||||
{ \
|
||||
::RcclUnitTesting::ProcessIsolatedTestRunner::registerTest(test_name, __VA_ARGS__); \
|
||||
bool passed_ = ::RcclUnitTesting::ProcessIsolatedTestRunner::executeAllTests(); \
|
||||
EXPECT_TRUE(passed_) << "Isolated test '" << test_name << "' failed"; \
|
||||
} \
|
||||
while(0)
|
||||
|
||||
/**
|
||||
* @brief Register and execute a single isolated test with environment variables
|
||||
*
|
||||
* Uses variadic macros to automatically handle environment variable initializer lists
|
||||
*
|
||||
* @param test_name Name of the test (string)
|
||||
* @param test_body Lambda containing test logic
|
||||
* @param ... Environment variables as initializer list
|
||||
*
|
||||
* Example:
|
||||
* RUN_ISOLATED_TEST_WITH_ENV("MyTest",
|
||||
* []() { EXPECT_TRUE(someFunction()); },
|
||||
* {{"VAR1", "value1"}, {"VAR2", "value2"}});
|
||||
*
|
||||
* Note: Uses __VA_ARGS__ to capture environment variables, which automatically
|
||||
* handles commas in the initializer list without requiring extra parentheses.
|
||||
*/
|
||||
#define RUN_ISOLATED_TEST_WITH_ENV(test_name, test_body, ...) \
|
||||
do \
|
||||
{ \
|
||||
::RcclUnitTesting::ProcessIsolatedTestRunner::registerTest( \
|
||||
test_name, \
|
||||
test_body, \
|
||||
__VA_ARGS__ \
|
||||
); \
|
||||
bool passed_ = ::RcclUnitTesting::ProcessIsolatedTestRunner::executeAllTests(); \
|
||||
EXPECT_TRUE(passed_) << "Isolated test '" << test_name << "' failed"; \
|
||||
} \
|
||||
while(0)
|
||||
|
||||
/**
|
||||
* @brief Register and execute multiple isolated tests with default options
|
||||
*
|
||||
* This macro takes multiple TestConfig objects and executes them all.
|
||||
* Tests are automatically cleaned up after execution.
|
||||
*
|
||||
* Example:
|
||||
* RUN_ISOLATED_TESTS(
|
||||
* ProcessIsolatedTestRunner::TestConfig("Test1", []() { ... }),
|
||||
* ProcessIsolatedTestRunner::TestConfig("Test2", []() { ... })
|
||||
* .withEnvironment({{"VAR", "value"}}),
|
||||
* ProcessIsolatedTestRunner::TestConfig("Test3", []() { ... })
|
||||
* .withTimeout(std::chrono::seconds(60))
|
||||
* );
|
||||
*/
|
||||
#define RUN_ISOLATED_TESTS(...) \
|
||||
do \
|
||||
{ \
|
||||
::RcclUnitTesting::ProcessIsolatedTestRunner::TestConfig configs_[] = {__VA_ARGS__}; \
|
||||
for(const auto& config_ : configs_) \
|
||||
{ \
|
||||
::RcclUnitTesting::ProcessIsolatedTestRunner::registerTest(config_); \
|
||||
} \
|
||||
bool passed_ = ::RcclUnitTesting::ProcessIsolatedTestRunner::executeAllTests(); \
|
||||
EXPECT_TRUE(passed_) << "One or more isolated tests failed"; \
|
||||
} \
|
||||
while(0)
|
||||
|
||||
/**
|
||||
* @brief Register and execute multiple isolated tests with custom options
|
||||
*
|
||||
* This macro takes execution options and multiple TestConfig objects.
|
||||
*
|
||||
* Example:
|
||||
* ProcessIsolatedTestRunner::ExecutionOptions opts;
|
||||
* opts.stopOnFirstFailure = true;
|
||||
* opts.verboseLogging = true;
|
||||
*
|
||||
* RUN_ISOLATED_TESTS_WITH_OPTIONS(opts,
|
||||
* ProcessIsolatedTestRunner::TestConfig("Test1", []() { ... }),
|
||||
* ProcessIsolatedTestRunner::TestConfig("Test2", []() { ... })
|
||||
* );
|
||||
*/
|
||||
#define RUN_ISOLATED_TESTS_WITH_OPTIONS(options, ...) \
|
||||
do \
|
||||
{ \
|
||||
::RcclUnitTesting::ProcessIsolatedTestRunner::TestConfig configs_[] = {__VA_ARGS__}; \
|
||||
for(const auto& config_ : configs_) \
|
||||
{ \
|
||||
::RcclUnitTesting::ProcessIsolatedTestRunner::registerTest(config_); \
|
||||
} \
|
||||
bool passed_ = ::RcclUnitTesting::ProcessIsolatedTestRunner::executeAllTests(options); \
|
||||
EXPECT_TRUE(passed_) << "One or more isolated tests failed"; \
|
||||
} \
|
||||
while(0)
|
||||
|
||||
} // namespace RcclUnitTesting
|
||||
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Ссылка в новой задаче
Block a user