Added tests for transport.cc (#1725)
Co-authored-by: Welling <awelling@ctr2-alola-login-01.amd.com>
This commit is contained in:
@@ -145,6 +145,7 @@ if(BUILD_TESTS)
|
||||
ShmTests.cpp
|
||||
P2pTests.cpp
|
||||
BitOpsTests.cpp
|
||||
TransportTests.cpp
|
||||
common/main_fixtures.cpp
|
||||
common/EnvVars.cpp
|
||||
)
|
||||
|
||||
@@ -0,0 +1,243 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <rccl/rccl.h>
|
||||
#include "TestBed.hpp"
|
||||
#include "TransportUtils.hpp"
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
|
||||
TEST(TransportTest, CollNetRecvSetup) {
|
||||
constexpr int nranks = 2;
|
||||
constexpr int nNodes = 2;
|
||||
|
||||
// --- Setup comm ---
|
||||
ncclComm comm = {};
|
||||
comm.rank = 0;
|
||||
comm.nRanks = nranks;
|
||||
comm.nNodes = nNodes;
|
||||
comm.node = 0;
|
||||
ncclPeerInfo peerInfo[3] = {};
|
||||
comm.peerInfo = peerInfo;
|
||||
|
||||
// --- Setup channel ---
|
||||
ncclChannel channel = {};
|
||||
static ncclChannelPeer peerArray[3];
|
||||
static ncclChannelPeer* peerPtrs[3] = {
|
||||
&peerArray[0], &peerArray[1], &peerArray[2]
|
||||
};
|
||||
channel.peers = peerPtrs;
|
||||
|
||||
// Step 1: Allocate device-side array of ncclDevChannelPeer
|
||||
ncclDevChannelPeer* devPeerArrayDevice;
|
||||
hipMalloc(&devPeerArrayDevice, sizeof(ncclDevChannelPeer) * 3);
|
||||
|
||||
// Step 2: Create host-side array of device pointers
|
||||
ncclDevChannelPeer* devPeerPtrsHost[3] = {
|
||||
devPeerArrayDevice + 0,
|
||||
devPeerArrayDevice + 1,
|
||||
devPeerArrayDevice + 2
|
||||
};
|
||||
|
||||
// Step 3: Allocate device-side array of device pointers
|
||||
ncclDevChannelPeer** devPeerPtrsDevice;
|
||||
hipMalloc(&devPeerPtrsDevice, sizeof(ncclDevChannelPeer*) * 3);
|
||||
|
||||
// Step 4: Copy host-side array of device pointers to device
|
||||
hipMemcpy(devPeerPtrsDevice, devPeerPtrsHost, sizeof(ncclDevChannelPeer*) * 3, hipMemcpyHostToDevice);
|
||||
|
||||
// Step 5: Set in channel
|
||||
channel.devPeers = devPeerPtrsDevice;
|
||||
// --- Setup transportComm ---
|
||||
static struct ncclTransportComm dummyTransport = {
|
||||
.setup = mockSetup,
|
||||
.connect = mockConnect,
|
||||
};
|
||||
collNetTransport.recv = dummyTransport;
|
||||
|
||||
// --- Dummy inputs ---
|
||||
ncclTopoGraph topoGraph = {};
|
||||
ncclConnect connect = {};
|
||||
int masterRank = 0;
|
||||
int masterPeer = 1;
|
||||
int channelId = 0;
|
||||
int type = collNetRecv;
|
||||
|
||||
// --- Run the function ---
|
||||
bool failed = ncclTransportCollNetSetup(&comm, &topoGraph, &channel, masterRank, masterPeer, channelId, type, &connect);
|
||||
|
||||
// --- Assert: function should succeed (return false) ---
|
||||
ASSERT_FALSE(failed);
|
||||
|
||||
// --- Cleanup ---
|
||||
hipFree(devPeerArrayDevice);
|
||||
hipFree(devPeerPtrsDevice);
|
||||
}
|
||||
|
||||
TEST(TransportTest, CollNetSendSetup) {
|
||||
constexpr int nranks = 2;
|
||||
constexpr int nNodes = 2;
|
||||
|
||||
// --- Setup comm ---
|
||||
ncclComm comm = {};
|
||||
comm.rank = 0;
|
||||
comm.nRanks = nranks;
|
||||
comm.nNodes = nNodes;
|
||||
comm.node = 0;
|
||||
|
||||
ncclPeerInfo peerInfo[3] = {};
|
||||
comm.peerInfo = peerInfo;
|
||||
|
||||
// --- Setup channel ---
|
||||
ncclChannel channel = {};
|
||||
static ncclChannelPeer peerArray[3];
|
||||
static ncclChannelPeer* peerPtrs[3] = {
|
||||
&peerArray[0], &peerArray[1], &peerArray[2]
|
||||
};
|
||||
channel.peers = peerPtrs;
|
||||
|
||||
// Step 1: Allocate device-side array of ncclDevChannelPeer
|
||||
ncclDevChannelPeer* devPeerArrayDevice;
|
||||
hipMalloc(&devPeerArrayDevice, sizeof(ncclDevChannelPeer) * 3);
|
||||
|
||||
// Step 2: Create host-side array of device pointers
|
||||
ncclDevChannelPeer* devPeerPtrsHost[3] = {
|
||||
devPeerArrayDevice + 0,
|
||||
devPeerArrayDevice + 1,
|
||||
devPeerArrayDevice + 2
|
||||
};
|
||||
|
||||
// Step 3: Allocate device-side array of device pointers
|
||||
ncclDevChannelPeer** devPeerPtrsDevice;
|
||||
hipMalloc(&devPeerPtrsDevice, sizeof(ncclDevChannelPeer*) * 3);
|
||||
|
||||
// Step 4: Copy host-side array of device pointers to device
|
||||
hipMemcpy(devPeerPtrsDevice, devPeerPtrsHost, sizeof(ncclDevChannelPeer*) * 3, hipMemcpyHostToDevice);
|
||||
|
||||
// Step 5: Set in channel
|
||||
channel.devPeers = devPeerPtrsDevice;
|
||||
|
||||
// --- Setup transportComm ---
|
||||
static struct ncclTransportComm dummyTransport = {
|
||||
.setup = mockSetup,
|
||||
.connect = mockConnect,
|
||||
};
|
||||
collNetTransport.send = dummyTransport;
|
||||
|
||||
// --- Dummy inputs ---
|
||||
ncclTopoGraph topoGraph = {};
|
||||
ncclConnect connect = {}; // IMPORTANT: non-null since this is memcpy’d into masterConnects
|
||||
int masterRank = 0;
|
||||
int masterPeer = 1;
|
||||
int channelId = 0;
|
||||
int type = collNetSend;
|
||||
|
||||
// --- Run the function ---
|
||||
bool failed = ncclTransportCollNetSetup(&comm, &topoGraph, &channel, masterRank, masterPeer, channelId, type, &connect);
|
||||
|
||||
// --- Assert: function should succeed (return false) ---
|
||||
ASSERT_FALSE(failed);
|
||||
|
||||
// --- Cleanup ---
|
||||
hipFree(devPeerArrayDevice);
|
||||
hipFree(devPeerPtrsDevice);
|
||||
}
|
||||
|
||||
|
||||
TEST(TransportTest, NcclTransportCollNetCheckTestSuccess) {
|
||||
struct ncclComm comm = {};
|
||||
struct ncclBootstrap dummyBootstrap;
|
||||
|
||||
int rankMap[1] = {0};
|
||||
comm.localRank = 0;
|
||||
comm.localRanks = 1;
|
||||
comm.localRankToRank = rankMap;
|
||||
comm.bootstrap = &dummyBootstrap;
|
||||
int collNetSetupFail = 0;
|
||||
ncclResult_t result = ncclTransportCollNetCheck(&comm, collNetSetupFail);
|
||||
EXPECT_EQ(result, ncclSuccess);
|
||||
}
|
||||
|
||||
TEST(TransportTest, NcclTransportCollNetCheckTestFails) {
|
||||
ncclComm comm;
|
||||
int rankMap[1] = {0};
|
||||
ncclBootstrap bootstrap;
|
||||
|
||||
comm.localRank = 0;
|
||||
comm.localRanks = 1;
|
||||
comm.localRankToRank = rankMap;
|
||||
comm.bootstrap = &bootstrap;
|
||||
|
||||
int collNetSetupFail = 1; // simulate failure on this rank
|
||||
ncclResult_t result = ncclTransportCollNetCheck(&comm, collNetSetupFail);
|
||||
EXPECT_EQ(result, ncclSystemError);
|
||||
}
|
||||
|
||||
// Test for ncclTransportCollNetFree
|
||||
TEST(TransportTest, CollNetFreeTest) {
|
||||
ncclComm comm = {};
|
||||
comm.nChannels = 1;
|
||||
comm.nRanks = 0; // So comm.channels[0].peers[0] is accessed
|
||||
|
||||
// Access embedded array directly (don't assign to comm.channels)
|
||||
ncclChannel* channel = &comm.channels[0];
|
||||
|
||||
// Allocate peer array for the channel (comm.nRanks + 1 for dummy peer)
|
||||
channel->peers = new ncclChannelPeer*[comm.nRanks + 1];
|
||||
for (int r = 0; r <= comm.nRanks; ++r) {
|
||||
channel->peers[r] = nullptr;
|
||||
}
|
||||
|
||||
// Create dummy peer at index nRanks
|
||||
ncclChannelPeer* peer = new ncclChannelPeer();
|
||||
channel->peers[comm.nRanks] = peer;
|
||||
peer->refCount = 1;
|
||||
|
||||
// Setup dummy ncclTransportComm with only `free` implemented
|
||||
static ncclTransportComm dummyTransportComm = {};
|
||||
dummyTransportComm.free = [](ncclConnector* conn) -> ncclResult_t {
|
||||
if (conn && conn->transportResources) {
|
||||
free(conn->transportResources);
|
||||
conn->transportResources = nullptr;
|
||||
}
|
||||
return ncclSuccess;
|
||||
};
|
||||
|
||||
// Fill in dummy send and recv connectors
|
||||
for (int b = 0; b < NCCL_MAX_CONNS; ++b) {
|
||||
peer->send[b].transportResources = malloc(1);
|
||||
peer->send[b].transportComm = &dummyTransportComm;
|
||||
|
||||
peer->recv[b].transportResources = malloc(1);
|
||||
peer->recv[b].transportComm = &dummyTransportComm;
|
||||
}
|
||||
|
||||
// Call the function under test
|
||||
ncclResult_t result = ncclTransportCollNetFree(&comm);
|
||||
ASSERT_EQ(result, ncclSuccess);
|
||||
|
||||
// Clean up
|
||||
delete peer;
|
||||
delete[] channel->peers;
|
||||
channel->peers = nullptr;
|
||||
}
|
||||
|
||||
TEST(TransportTest, DumpDataTest) {
|
||||
ncclConnect conn;
|
||||
memset(&conn, 0xAB, sizeof(conn)); // Fill with a known byte pattern
|
||||
|
||||
auto output = captureStdout([&]() {
|
||||
dumpData(&conn, 1);
|
||||
});
|
||||
|
||||
// Basic checks on output
|
||||
EXPECT_TRUE(output.find("[0]") != std::string::npos); // Label exists
|
||||
EXPECT_GT(output.length(), 10); // Should print something
|
||||
EXPECT_NE(output.find("ab"), std::string::npos); // Should include 0xAB hex
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,77 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#ifndef TRANSPORT_UTILS_H
|
||||
#define TRANSPORT_UTILS_H
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <rccl/rccl.h>
|
||||
#include <transport.h>
|
||||
#include "TestBed.hpp"
|
||||
|
||||
void dumpData(struct ncclConnect* data, int ndata);
|
||||
ncclResult_t bootstrapAllGather(void* bootstrap, void* data, int size) {
|
||||
memcpy((char*)data + size, data, size); // Simulate copying rank 0 connect to rank 1
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
//Mock functions for CollNetRecvSetup and CollNetSendSetup
|
||||
ncclResult_t mockSetup(struct ncclComm* comm, struct ncclTopoGraph* graph,
|
||||
struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
|
||||
struct ncclConnect* connect, struct ncclConnector* connector,
|
||||
int channelId, int type) {
|
||||
memset(connect, 42, sizeof(struct ncclConnect)); // dummy data
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t mockConnect(struct ncclComm* comm, struct ncclConnect* connect,
|
||||
int nranks, int rank, struct ncclConnector* connector) {
|
||||
memset(&connector->conn, 99, sizeof(connector->conn)); // dummy
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Dummy bootstrap implementation for testing NcclTransportCollNetCheckTestSuccess and NcclTransportCollNetCheckTestFails
|
||||
struct ncclBootstrap {};
|
||||
|
||||
ncclResult_t bootstrapIntraNodeAllGather(
|
||||
struct ncclBootstrap* bootstrap,
|
||||
int* localRankToRank,
|
||||
int localRank,
|
||||
int localRanks,
|
||||
int* data,
|
||||
size_t size
|
||||
) {
|
||||
data[0] = 0; // Rank 0 is fine
|
||||
data[1] = 1; // Rank 1 reports failure
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
//Helper function for capturing the output for DumpDataTest
|
||||
std::string captureStdout(std::function<void()> func) {
|
||||
int pipefd[2];
|
||||
pipe(pipefd);
|
||||
|
||||
int saved_stdout = dup(STDOUT_FILENO);
|
||||
dup2(pipefd[1], STDOUT_FILENO);
|
||||
close(pipefd[1]);
|
||||
|
||||
func();
|
||||
|
||||
fflush(stdout);
|
||||
dup2(saved_stdout, STDOUT_FILENO);
|
||||
close(saved_stdout);
|
||||
|
||||
char buffer[4096];
|
||||
ssize_t count = read(pipefd[0], buffer, sizeof(buffer) - 1);
|
||||
close(pipefd[0]);
|
||||
buffer[count] = '\0';
|
||||
|
||||
return std::string(buffer);
|
||||
}
|
||||
|
||||
} //namespace RcclUnitTesting
|
||||
#endif
|
||||
Reference in New Issue
Block a user