diff --git a/projects/rccl/CMakeLists.txt b/projects/rccl/CMakeLists.txt
index ddc3f26bfa..9de9ad0211 100644
--- a/projects/rccl/CMakeLists.txt
+++ b/projects/rccl/CMakeLists.txt
@@ -40,7 +40,6 @@ option(TRACE "Enable additional tracing"
 option(FAULT_INJECTION "Enable fault injection" ON)
 option(FORCE_REDUCE_PIPELINING "Force reduce pipelining" OFF)
 option(DISABLE_CHEAP_THREADFENCE "Compile-time killswitch for simpler fence" OFF)
-option(RCCL_EXPOSE_STATIC "Expose internal static functions for testing" OFF)
 
 # Default GPU architectures to build
 #==================================================================================================
@@ -372,9 +371,6 @@ if(ROCTX)
   endif()
 endif()
 
-if(RCCL_EXPOSE_STATIC)
-  add_definitions(-DRCCL_EXPOSE_STATIC)
-endif()
 # Determine version from makefiles/version.mk and fill in templates
 #==================================================================================================
 ## parse version from Makefile NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH must exist
@@ -739,6 +735,9 @@ add_custom_target(hipify_all DEPENDS ${HIP_SOURCES})
 
 if (BUILD_TESTS)
   if (ROCM_VERSION VERSION_GREATER_EQUAL "60400" AND CMAKE_BUILD_TYPE MATCHES "Debug")
+    ## Set definition for exposing rccl static function
+    add_definitions(-DRCCL_EXPOSE_STATIC)
+
     set(HIPIFY_SRC_DIR "${PROJECT_BINARY_DIR}/hipify/src")
     set(REPLACE_SCRIPT "${CMAKE_SOURCE_DIR}/tools/scripts/replace_static.sh")
     message ("Replacing static functions in ${HIPIFY_SRC_DIR} with ${REPLACE_SCRIPT} for unit tests")
diff --git a/projects/rccl/test/CMakeLists.txt b/projects/rccl/test/CMakeLists.txt
index a3facb42c0..86347ff36a 100644
--- a/projects/rccl/test/CMakeLists.txt
+++ b/projects/rccl/test/CMakeLists.txt
@@ -143,6 +143,7 @@ if(BUILD_TESTS)
     ArgCheckTests.cpp
     IpcsocketTests.cpp
     CollRegTests.cpp
+    RcclWrapTests.cpp
     ShmTests.cpp
     P2pTests.cpp
     BitOpsTests.cpp
diff --git a/projects/rccl/test/RcclWrapTests.cpp b/projects/rccl/test/RcclWrapTests.cpp
new file mode 100644
index 0000000000..99324a1ca7
--- /dev/null
+++ b/projects/rccl/test/RcclWrapTests.cpp
@@ -0,0 +1,149 @@
+/*************************************************************************
+ * Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+// NOTE(review): the four angle-bracket header names in this patch were lost in
+// extraction. The list below is reconstructed from the symbols this file uses
+// (setenv/unsetenv, strncpy, std::unique_ptr, TEST/EXPECT_*) - confirm it, and
+// restore any project header that declares ncclComm, ncclTopoSystem or the
+// rccl* helpers if TestBed.hpp does not already provide them.
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <gtest/gtest.h>
+
+#include "TestBed.hpp"
+
+namespace RcclUnitTesting
+{
+  namespace
+  {
+    // Hand-built communicator and topology shared by the protocol-selection tests.
+    // Both objects are value-initialized, so any field not assigned here is
+    // zero/default. unique_ptr ownership means tests never call delete themselves.
+    struct FakeComm
+    {
+      // Declared first so the topology outlives the communicator that points at it.
+      std::unique_ptr<ncclTopoSystem> topo = std::make_unique<ncclTopoSystem>();
+      std::unique_ptr<ncclComm>       comm = std::make_unique<ncclComm>();
+
+      FakeComm()
+      {
+        comm->nRanks = 1;
+        comm->nNodes = 2; // triggers inter-node logic
+        comm->rank   = 0;
+        comm->topo   = topo.get();
+
+        topo->ll128Enabled = true;
+        // Copy at most size-1 bytes: the zeroed last byte keeps the string terminated.
+        strncpy(topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942",
+                sizeof(topo->nodes[GPU].nodes[0].gpu.gcn) - 1);
+      }
+    };
+  } // namespace
+
+  // Helper function to test the static expose check
+  ncclResult_t testStaticExposeCheck() {
+    RCCL_STATIC_EXPOSE_CHECK();
+    return ncclSuccess;
+  }
+
+  TEST(Rcclwrap, RcclFuncMaxSendRecvCount) {
+    const ncclResult_t staticCheckResult = testStaticExposeCheck();
+  #ifdef RCCL_EXPOSE_STATIC
+    EXPECT_EQ(staticCheckResult, ncclSuccess);
+  #else
+    EXPECT_EQ(staticCheckResult, ncclInvalidUsage);
+  #endif
+
+    size_t maxCount = 0;
+    const ncclResult_t result = rcclFuncMaxSendRecvCount(ncclFuncAllReduce, 4, 1024, maxCount);
+    EXPECT_EQ(result, ncclSuccess);
+    // size_t literal: avoids a signed/unsigned comparison inside EXPECT_EQ.
+    EXPECT_EQ(maxCount, size_t{1024});
+  }
+
+  TEST(Rcclwrap, RcclUpdateCollectiveProtocol_UsesLL128WhenInRange) {
+    unsetenv("NCCL_PROTO"); // Trigger auto selection mode
+
+    FakeComm fake;
+    fake.topo->nodes[GPU].count = 1;
+
+    int idx = rcclGetTunableIndex(ncclFuncAllReduce);
+    fake.comm->minMaxLLRange[idx][NCCL_PROTO_LL][RCCL_PROTOCOL_MIN_IDX] = 512;
+    fake.comm->minMaxLLRange[idx][NCCL_PROTO_LL][RCCL_PROTOCOL_MAX_IDX] = 1024;
+    fake.comm->minMaxLLRange[idx][NCCL_PROTO_LL128][RCCL_PROTOCOL_MIN_IDX] = 256;
+    fake.comm->minMaxLLRange[idx][NCCL_PROTO_LL128][RCCL_PROTOCOL_MAX_IDX] = 2048;
+    fake.comm->minMaxLLRange[idx][NCCL_PROTO_LL128][RCCL_PROTOCOL_FACTOR_IDX] = 1;
+
+    ncclTaskColl info = {};
+    info.func = ncclFuncAllReduce;
+    info.protocol = NCCL_PROTO_UNDEF;
+    size_t nBytes = 1024; // equals the LL maximum and lies between the LL128 bounds set above
+
+    rcclUpdateCollectiveProtocol(fake.comm.get(), nBytes, &info);
+    EXPECT_TRUE(info.protocol == NCCL_PROTO_LL128 || info.protocol == NCCL_PROTO_LL);
+  }
+
+  TEST(Rcclwrap, RcclUpdateCollectiveProtocol_WarnsOnGfx942Arch) {
+    unsetenv("NCCL_PROTO");
+
+    FakeComm fake;
+
+    int idx = rcclGetTunableIndex(ncclFuncAllReduce);
+    fake.comm->minMaxLLRange[idx][NCCL_PROTO_LL][RCCL_PROTOCOL_MIN_IDX] = RCCL_LL_LIMITS_UNDEFINED;
+    fake.comm->minMaxLLRange[idx][NCCL_PROTO_LL][RCCL_PROTOCOL_MAX_IDX] = RCCL_LL_LIMITS_UNDEFINED;
+    fake.comm->minMaxLLRange[idx][NCCL_PROTO_LL128][RCCL_PROTOCOL_MIN_IDX] = RCCL_LL_LIMITS_UNDEFINED;
+    fake.comm->minMaxLLRange[idx][NCCL_PROTO_LL128][RCCL_PROTOCOL_MAX_IDX] = RCCL_LL_LIMITS_UNDEFINED;
+    fake.comm->minMaxLLRange[idx][NCCL_PROTO_LL128][RCCL_PROTOCOL_FACTOR_IDX] = RCCL_LL_LIMITS_UNDEFINED;
+
+    ncclTaskColl info = {};
+    info.func = ncclFuncAllReduce;
+    info.protocol = NCCL_PROTO_UNDEF;
+    size_t nBytes = 1024;
+
+    rcclUpdateCollectiveProtocol(fake.comm.get(), nBytes, &info);
+    EXPECT_EQ(info.protocol, NCCL_PROTO_UNDEF);
+  }
+
+  // NOTE(review): the original author noted that this test passes without
+  // reaching the user-override branch. TODO confirm whether NCCL_PROTO is read
+  // only once per process, which would make this test order-dependent.
+  TEST(Rcclwrap, RcclUpdateCollectiveProtocol_HonorsUserProtocolEnv) {
+    setenv("NCCL_PROTO", "1", 1); // Simulate manual override
+
+    FakeComm fake;
+
+    ncclTaskColl info = {};
+    info.func = ncclFuncAllReduce;
+    info.protocol = NCCL_PROTO_UNDEF;
+    size_t nBytes = 1024;
+
+    rcclUpdateCollectiveProtocol(fake.comm.get(), nBytes, &info);
+    EXPECT_EQ(info.protocol, NCCL_PROTO_UNDEF);
+
+    // Do not leak the override into tests that run after this one.
+    unsetenv("NCCL_PROTO");
+  }
+
+  TEST(Rcclwrap, RcclUpdateCollectiveProtocol_SimpleFallbackWhenNoRanges) {
+    unsetenv("NCCL_PROTO"); // Trigger auto selection mode
+
+    FakeComm fake;
+    fake.topo->nodes[GPU].count = 1;
+
+    int idx = rcclGetTunableIndex(ncclFuncAllReduce);
+    fake.comm->minMaxLLRange[idx][NCCL_PROTO_LL][RCCL_PROTOCOL_MIN_IDX] = 512;
+    fake.comm->minMaxLLRange[idx][NCCL_PROTO_LL][RCCL_PROTOCOL_MAX_IDX] = 1024;
+
+    ncclTaskColl info = {};
+    info.func = ncclFuncAllReduce;
+    info.protocol = NCCL_PROTO_UNDEF;
+    size_t nBytes = 2048; // above the LL maximum set above; the LL128 range is left unset
+
+    rcclUpdateCollectiveProtocol(fake.comm.get(), nBytes, &info);
+    EXPECT_EQ(info.protocol, NCCL_PROTO_SIMPLE);
+  }
+} //RcclUnitTesting