diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 602d44fd4e..7e139215df 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -145,6 +145,7 @@ if(BUILD_TESTS) ShmTests.cpp P2pTests.cpp BitOpsTests.cpp + TransportTests.cpp common/main_fixtures.cpp common/EnvVars.cpp ) diff --git a/test/TransportTests.cpp b/test/TransportTests.cpp new file mode 100644 index 0000000000..a7c673fba6 --- /dev/null +++ b/test/TransportTests.cpp @@ -0,0 +1,243 @@ +/************************************************************************* + * Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include +#include +#include "TestBed.hpp" +#include "TransportUtils.hpp" + +namespace RcclUnitTesting +{ + +TEST(TransportTest, CollNetRecvSetup) { + constexpr int nranks = 2; + constexpr int nNodes = 2; + + // --- Setup comm --- + ncclComm comm = {}; + comm.rank = 0; + comm.nRanks = nranks; + comm.nNodes = nNodes; + comm.node = 0; + ncclPeerInfo peerInfo[3] = {}; + comm.peerInfo = peerInfo; + + // --- Setup channel --- + ncclChannel channel = {}; + static ncclChannelPeer peerArray[3]; + static ncclChannelPeer* peerPtrs[3] = { + &peerArray[0], &peerArray[1], &peerArray[2] + }; + channel.peers = peerPtrs; + + // Step 1: Allocate device-side array of ncclDevChannelPeer + ncclDevChannelPeer* devPeerArrayDevice; + hipMalloc(&devPeerArrayDevice, sizeof(ncclDevChannelPeer) * 3); + + // Step 2: Create host-side array of device pointers + ncclDevChannelPeer* devPeerPtrsHost[3] = { + devPeerArrayDevice + 0, + devPeerArrayDevice + 1, + devPeerArrayDevice + 2 + }; + + // Step 3: Allocate device-side array of device pointers + ncclDevChannelPeer** devPeerPtrsDevice; + hipMalloc(&devPeerPtrsDevice, sizeof(ncclDevChannelPeer*) * 3); + + // Step 4: Copy host-side array of device pointers to device + hipMemcpy(devPeerPtrsDevice, devPeerPtrsHost, sizeof(ncclDevChannelPeer*) * 3, hipMemcpyHostToDevice); + + // Step 5: Set in channel + channel.devPeers = devPeerPtrsDevice; + // --- Setup transportComm --- + static struct ncclTransportComm dummyTransport = { + .setup = mockSetup, + .connect = mockConnect, + }; + collNetTransport.recv = dummyTransport; + + // --- Dummy inputs --- + ncclTopoGraph topoGraph = {}; + ncclConnect connect = {}; + int masterRank = 0; + int masterPeer = 1; + int channelId = 0; + int type = collNetRecv; + + // --- Run the function --- + bool failed = ncclTransportCollNetSetup(&comm, &topoGraph, &channel, masterRank, masterPeer, channelId, type, &connect); + + // --- Assert: function should succeed (return false) --- + ASSERT_FALSE(failed); + + // --- Cleanup --- + hipFree(devPeerArrayDevice); + hipFree(devPeerPtrsDevice); +} + +TEST(TransportTest, CollNetSendSetup) { + constexpr int nranks = 2; + constexpr int nNodes = 2; + + // --- Setup comm --- + ncclComm comm = {}; + comm.rank = 0; + comm.nRanks = nranks; + comm.nNodes = nNodes; + comm.node = 0; + + ncclPeerInfo peerInfo[3] = {}; + comm.peerInfo = peerInfo; + + // --- Setup channel --- + ncclChannel channel = {}; + static ncclChannelPeer peerArray[3]; + static ncclChannelPeer* peerPtrs[3] = { + &peerArray[0], &peerArray[1], &peerArray[2] + }; + channel.peers = peerPtrs; + + // Step 1: Allocate device-side array of ncclDevChannelPeer + ncclDevChannelPeer* devPeerArrayDevice; + hipMalloc(&devPeerArrayDevice, sizeof(ncclDevChannelPeer) * 3); + + // Step 2: Create host-side array of device pointers + ncclDevChannelPeer* devPeerPtrsHost[3] = { + devPeerArrayDevice + 0, + devPeerArrayDevice + 1, + devPeerArrayDevice + 2 + }; + + // Step 3: Allocate device-side array of device pointers + ncclDevChannelPeer** devPeerPtrsDevice; + hipMalloc(&devPeerPtrsDevice, sizeof(ncclDevChannelPeer*) * 3); + + // Step 4: Copy host-side array of device pointers to device + hipMemcpy(devPeerPtrsDevice, devPeerPtrsHost, sizeof(ncclDevChannelPeer*) * 3, hipMemcpyHostToDevice); + + // Step 5: Set in channel + channel.devPeers = devPeerPtrsDevice; + + // --- Setup transportComm --- + static struct ncclTransportComm dummyTransport = { + .setup = mockSetup, + .connect = mockConnect, + }; + collNetTransport.send = dummyTransport; + + // --- Dummy inputs --- + ncclTopoGraph topoGraph = {}; + ncclConnect connect = {}; // IMPORTANT: non-null since this is memcpy’d into masterConnects + int masterRank = 0; + int masterPeer = 1; + int channelId = 0; + int type = collNetSend; + + // --- Run the function --- + bool failed = ncclTransportCollNetSetup(&comm, &topoGraph, &channel, masterRank, masterPeer, channelId, type, &connect); + + // --- Assert: function should succeed (return false) --- + ASSERT_FALSE(failed); + + // --- Cleanup --- + hipFree(devPeerArrayDevice); + hipFree(devPeerPtrsDevice); +} + + + TEST(TransportTest, NcclTransportCollNetCheckTestSuccess) { + struct ncclComm comm = {}; + struct ncclBootstrap dummyBootstrap; + + int rankMap[1] = {0}; + comm.localRank = 0; + comm.localRanks = 1; + comm.localRankToRank = rankMap; + comm.bootstrap = &dummyBootstrap; + int collNetSetupFail = 0; + ncclResult_t result = ncclTransportCollNetCheck(&comm, collNetSetupFail); + EXPECT_EQ(result, ncclSuccess); + } + + TEST(TransportTest, NcclTransportCollNetCheckTestFails) { + ncclComm comm; + int rankMap[1] = {0}; + ncclBootstrap bootstrap; + + comm.localRank = 0; + comm.localRanks = 1; + comm.localRankToRank = rankMap; + comm.bootstrap = &bootstrap; + + int collNetSetupFail = 1; // simulate failure on this rank + ncclResult_t result = ncclTransportCollNetCheck(&comm, collNetSetupFail); + EXPECT_EQ(result, ncclSystemError); +} + +// Test for ncclTransportCollNetFree +TEST(TransportTest, CollNetFreeTest) { + ncclComm comm = {}; + comm.nChannels = 1; + comm.nRanks = 0; // So comm.channels[0].peers[0] is accessed + + // Access embedded array directly (don't assign to comm.channels) + ncclChannel* channel = &comm.channels[0]; + + // Allocate peer array for the channel (comm.nRanks + 1 for dummy peer) + channel->peers = new ncclChannelPeer*[comm.nRanks + 1]; + for (int r = 0; r <= comm.nRanks; ++r) { + channel->peers[r] = nullptr; + } + + // Create dummy peer at index nRanks + ncclChannelPeer* peer = new ncclChannelPeer(); + channel->peers[comm.nRanks] = peer; + peer->refCount = 1; + + // Setup dummy ncclTransportComm with only `free` implemented + static ncclTransportComm dummyTransportComm = {}; + dummyTransportComm.free = [](ncclConnector* conn) -> ncclResult_t { + if (conn && conn->transportResources) { + free(conn->transportResources); + conn->transportResources = nullptr; + } + return ncclSuccess; + }; + + // Fill in dummy send and recv connectors + for (int b = 0; b < NCCL_MAX_CONNS; ++b) { + peer->send[b].transportResources = malloc(1); + peer->send[b].transportComm = &dummyTransportComm; + + peer->recv[b].transportResources = malloc(1); + peer->recv[b].transportComm = &dummyTransportComm; + } + + // Call the function under test + ncclResult_t result = ncclTransportCollNetFree(&comm); + ASSERT_EQ(result, ncclSuccess); + + // Clean up + delete peer; + delete[] channel->peers; + channel->peers = nullptr; +} + +TEST(TransportTest, DumpDataTest) { + ncclConnect conn; + memset(&conn, 0xAB, sizeof(conn)); // Fill with a known byte pattern + + auto output = captureStdout([&]() { + dumpData(&conn, 1); + }); + + // Basic checks on output + EXPECT_TRUE(output.find("[0]") != std::string::npos); // Label exists + EXPECT_GT(output.length(), 10); // Should print something + EXPECT_NE(output.find("ab"), std::string::npos); // Should include 0xAB hex +} +} diff --git a/test/common/TransportUtils.hpp b/test/common/TransportUtils.hpp new file mode 100644 index 0000000000..ad1572dc65 --- /dev/null +++ b/test/common/TransportUtils.hpp @@ -0,0 +1,77 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ +#ifndef TRANSPORT_UTILS_H +#define TRANSPORT_UTILS_H + +#include +#include +#include +#include "TestBed.hpp" + +void dumpData(struct ncclConnect* data, int ndata); +ncclResult_t bootstrapAllGather(void* bootstrap, void* data, int size) { + memcpy((char*)data + size, data, size); // Simulate copying rank 0 connect to rank 1 + return ncclSuccess; +} + +namespace RcclUnitTesting +{ +//Mock functions for CollNetRecvSetup and CollNetSendSetup +ncclResult_t mockSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, + struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, + struct ncclConnect* connect, struct ncclConnector* connector, + int channelId, int type) { + memset(connect, 42, sizeof(struct ncclConnect)); // dummy data + return ncclSuccess; +} + +ncclResult_t mockConnect(struct ncclComm* comm, struct ncclConnect* connect, + int nranks, int rank, struct ncclConnector* connector) { + memset(&connector->conn, 99, sizeof(connector->conn)); // dummy + return ncclSuccess; +} + +// Dummy bootstrap implementation for testing NcclTransportCollNetCheckTestSuccess and NcclTransportCollNetCheckTestFails +struct ncclBootstrap {}; + +ncclResult_t bootstrapIntraNodeAllGather( +struct ncclBootstrap* bootstrap, +int* localRankToRank, +int localRank, +int localRanks, +int* data, +size_t size +) { + data[0] = 0; // Rank 0 is fine + data[1] = 1; // Rank 1 reports failure + return ncclSuccess; +} + +//Helper function for capturing the output for DumpDataTest +std::string captureStdout(std::function func) { + int pipefd[2]; + pipe(pipefd); + + int saved_stdout = dup(STDOUT_FILENO); + dup2(pipefd[1], STDOUT_FILENO); + close(pipefd[1]); + + func(); + + fflush(stdout); + dup2(saved_stdout, STDOUT_FILENO); + close(saved_stdout); + + char buffer[4096]; + ssize_t count = read(pipefd[0], buffer, sizeof(buffer) - 1); + close(pipefd[0]); + buffer[count] = '\0'; + + return std::string(buffer); +} + +} //namespace RcclUnitTesting +#endif