Files
rocm-systems/test/StandaloneTests.cpp
T
2025-04-30 18:01:11 -05:00

374 rivejä
13 KiB
C++

/*************************************************************************
* Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <gtest/gtest.h>
#include <rccl/rccl.h>
#include "TestBed.hpp"
#include "StandaloneUtils.hpp"
namespace RcclUnitTesting
{
/**
* \brief Verify that each device is assigned to the right rank using ncclCommSplit API.
* ******************************************************************************************/
TEST(Standalone, SplitComms_RankCheck)
{
// Check for multi-gpu
int numDevices;
HIPCALL(hipGetDeviceCount(&numDevices));
if (numDevices < 2) {
GTEST_SKIP() << "This test requires at least 2 devices.";
}
// Initialize the original comms
std::vector<ncclComm_t> comms(numDevices);
NCCLCHECK(ncclCommInitAll(comms.data(), numDevices, nullptr));
// Split into new comms (round-robin)
std::vector<ncclComm_t> subComms(numDevices);
int numSubComms = 2;
std::map<int, int> mapCounter;
NCCLCHECK(ncclGroupStart());
for (int localRank = 0; localRank < numDevices; localRank++) {
NCCLCHECK(ncclCommSplit(comms[localRank], localRank % numSubComms, localRank, &subComms[localRank], NULL));
mapCounter[localRank % numSubComms]++;
}
NCCLCHECK(ncclGroupEnd());
// Check that new comms have correct subranks / ranks
for (int i = 0; i < numDevices; i++) {
int subCommRank, subCommNRank;
NCCLCHECK(ncclCommUserRank(subComms[i], &subCommRank));
NCCLCHECK(ncclCommCount(subComms[i], &subCommNRank));
ASSERT_EQ(subCommRank, i / numSubComms);
ASSERT_EQ(subCommNRank, mapCounter[i % numSubComms]);
}
// Clean up comms
for (auto& subComm : subComms)
NCCLCHECK(ncclCommDestroy(subComm));
for (auto& comm : comms)
NCCLCHECK(ncclCommDestroy(comm));
}
/**
* \brief Creates a communicator for each device and gathers them all in one rank.
* ******************************************************************************************/
TEST(Standalone, SplitComms_OneColor)
{
// Check for multi-gpu
int numDevices;
HIPCALL(hipGetDeviceCount(&numDevices));
if (numDevices < 2) {
GTEST_SKIP() << "This test requires at least 2 devices.";
}
// Initialize the original comms
std::vector<ncclComm_t> comms(numDevices);
NCCLCHECK(ncclCommInitAll(comms.data(), numDevices, nullptr));
// Split into new comms (all of the same color)
std::vector<ncclComm_t> subComms(numDevices);
NCCLCHECK(ncclGroupStart());
for (int localRank = 0; localRank < numDevices; localRank++)
NCCLCHECK(ncclCommSplit(comms[localRank], 0, localRank, &subComms[localRank], NULL));
NCCLCHECK(ncclGroupEnd());
// Validate results
for (int i = 0; i < numDevices; i++) {
int originalRank, originalNRank;
NCCLCHECK(ncclCommUserRank(comms[i], &originalRank));
NCCLCHECK(ncclCommCount(comms[i], &originalNRank));
int subCommRank, subCommNRank;
NCCLCHECK(ncclCommUserRank(subComms[i], &subCommRank));
NCCLCHECK(ncclCommCount(subComms[i], &subCommNRank));
ASSERT_EQ(originalRank, subCommRank);
ASSERT_EQ(originalNRank, subCommNRank);
}
// Clean up comms
for (auto& subComm : subComms)
NCCLCHECK(ncclCommDestroy(subComm));
for (auto& comm : comms)
NCCLCHECK(ncclCommDestroy(comm));
}
/**
* \brief Creates a communicator for each device and reduces them into (numDevices / 2) ranks.
* ******************************************************************************************/
TEST(Standalone, SplitComms_Reduce)
{
// Check for multi-gpu
int numDevices;
HIPCALL(hipGetDeviceCount(&numDevices));
if (numDevices < 2) {
GTEST_SKIP() << "This test requires at least 2 devices.";
}
// Initialize the original comms
std::vector<ncclComm_t> comms(numDevices);
NCCLCHECK(ncclCommInitAll(comms.data(), numDevices, nullptr));
// Split into new comms
int numReducedRanks = numDevices / 2;
std::vector<ncclComm_t> subComms(numDevices);
NCCLCHECK(ncclGroupStart());
for (int localRank = 0; localRank < numDevices; localRank++)
NCCLCHECK(ncclCommSplit(comms[localRank],
localRank < numReducedRanks ? 0 : NCCL_SPLIT_NOCOLOR,
localRank, &subComms[localRank], NULL));
NCCLCHECK(ncclGroupEnd());
// Validate results
for (int i = 0; i < numDevices; i++) {
int originalRank, originalNRank;
NCCLCHECK(ncclCommUserRank(comms[i], &originalRank));
NCCLCHECK(ncclCommCount(comms[i], &originalNRank));
if (i < numReducedRanks) {
int subCommRank, subCommNRank;
NCCLCHECK(ncclCommUserRank(subComms[i], &subCommRank));
NCCLCHECK(ncclCommCount(subComms[i], &subCommNRank));
ASSERT_EQ(originalRank, subCommRank);
ASSERT_EQ(subCommNRank, numReducedRanks);
} else {
ASSERT_EQ(subComms[i], nullptr);
}
}
// Cleanup comms
for (auto& subComm : subComms)
NCCLCHECK(ncclCommDestroy(subComm));
for (auto& comm : comms)
NCCLCHECK(ncclCommDestroy(comm));
}
/**
* \brief Verify there is no regression in timing for each protocol [LL, LL128, Simple]
* ******************************************************************************************/
TEST(Standalone, RegressionTiming)
{
// timing
using namespace std::chrono;
using Clock = std::chrono::high_resolution_clock;
int usElapsed, numIterations = 20, numWarmups = 5;
// Check for 2 GPUs
int numGpus;
HIPCALL(hipGetDeviceCount(&numGpus));
if (numGpus < 2) {
GTEST_SKIP() << "This test requires at least 2 devices.";
}
hipDeviceProp_t devProp;
HIPCALL(hipGetDeviceProperties(&devProp, 0));
// Initialize RCCL
constexpr int numRanks = 2;
std::vector<ncclComm_t> comms(numRanks);
std::vector<int*> gpuInput(numRanks);
std::vector<int*> gpuOutput(numRanks);
std::vector<hipStream_t> stream(numRanks);
char *proto = std::getenv("NCCL_PROTO");
const char* protocolList[3] = {"LL", "LL128", "Simple"};
for (auto p : protocolList)
{
usElapsed = 0;
if(strncmp("gfx12",devProp.gcnArchName,5) == 0) {
setenv("NCCL_PROTO", "Simple", 1);
} else {
setenv("NCCL_PROTO", p, 1);
}
NCCLCHECK(ncclCommInitAll(comms.data(), numRanks, nullptr));
// Prepare CPU data arrays
int N = 1250;
std::vector<int> cpuInput(N);
std::vector<int> cpuExpected(N);
for (int i = 0; i < N; i++) {
cpuInput[i] = i;
cpuExpected[i] = 2 * i;
}
// Prepare GPU data arrays
for (int rank = 0; rank < numRanks; rank++) {
HIPCALL(hipSetDevice(rank));
HIPCALL(hipStreamCreate(&stream[rank]));
HIPCALL(hipMalloc((void**)&gpuInput[rank], N * sizeof(int)));
HIPCALL(hipMalloc((void**)&gpuOutput[rank], N * sizeof(int)));
HIPCALL(hipMemcpy(gpuInput[rank], cpuInput.data(), N * sizeof(int), hipMemcpyHostToDevice));
HIPCALL(hipMemset(gpuOutput[rank], 0, N * sizeof(int)));
HIPCALL(hipDeviceSynchronize());
}
for (int iter = -numWarmups; iter < numIterations; iter++) {
for (int rank = 0; rank < numRanks; rank++) {
HIPCALL(hipSetDevice(rank));
HIPCALL(hipMemset(gpuOutput[rank], 0, N * sizeof(int)));
HIPCALL(hipDeviceSynchronize());
}
// Initiate the allreduce
NCCLCHECK(ncclGroupStart());
for (int rank = 0; rank < numRanks; rank++)
NCCLCHECK(ncclAllReduce(gpuInput[rank], gpuOutput[rank], N, ncclInt, ncclSum, comms[rank], stream[rank]));
ncclResult_t res = ncclGroupEnd();
if (res != ncclSuccess) continue;
const auto start = Clock::now();
// Wait for completion
for (int rank = 0; rank < numRanks; rank++) {
HIPCALL(hipStreamSynchronize(stream[rank]));
}
if (iter >= 0)
usElapsed += duration_cast<microseconds>(Clock::now() - start).count();
// Check results
std::vector<int> cpuOutput(N);
for (int rank = 0; rank < numRanks; rank++) {
HIPCALL(hipMemcpy(cpuOutput.data(), gpuOutput[rank], N * sizeof(int), hipMemcpyDeviceToHost));
HIPCALL(hipDeviceSynchronize());
for (int i = 0; i < N; i++)
ASSERT_EQ(cpuOutput[i], cpuExpected[i]);
}
}
EXPECT_LT(usElapsed/(double)numIterations, 5000);
printf("[ INFO ] protocol: %s, average runtime: %f microseconds\n", p, usElapsed/(double)numIterations);
// Release resources
for (int rank = 0; rank < numRanks; rank++){
HIPCALL(hipFree(gpuInput[rank]));
HIPCALL(hipFree(gpuOutput[rank]));
HIPCALL(hipStreamDestroy(stream[rank]));
NCCLCHECK(ncclCommDestroy(comms[rank]));
}
}
if (proto)
setenv("NCCL_PROTO", proto, 1);
else
unsetenv("NCCL_PROTO");
}
/**
* \brief Verify rccl generic kernel stack size for each gfx architecture is less than the
* expected MAX_STACK_SIZE.
* ******************************************************************************************/
TEST(Standalone, StackSize) {
const char* mainKernel = "ncclDevKernel";
// Look for the .co files
std::vector<std::string> coFileList = splitString(executeCommand("find ../ -type f -name \"librccl*.co\""), '\n');
// Check if the .co files exist in the build directory
if (coFileList.empty())
GTEST_SKIP() << "Skipping... Could not found required files in the build directory.";
for (const auto& file : coFileList) {
// Store the output in a list
std::string cmd = std::string(ROCM_PATH) + "/llvm/bin/llvm-readelf --notes " + file;
std::vector<std::string> metadata = splitString(executeCommand(cmd.c_str()), '\n');
// Skip if llvm is not installed
if (metadata.empty())
GTEST_SKIP() << "Skipping... llvm is not found.";
// Parse metadata from file and store it for each arch
ArchInfo archInfo = parseMetadata(metadata);
// iterate over each archs kernels
for (const auto& kernel : archInfo.kernels) {
if (kernel.name.find(mainKernel) != std::string::npos) {
// Kernel stack size should be less than or equal to the maxStackSize value
printf("[ INFO ] Arch: %s Kernel: %s Size: %d\n", archInfo.archName.c_str(), kernel.name.c_str(), kernel.privateSegmentFixedSize);
EXPECT_LE(kernel.privateSegmentFixedSize, archInfo.archName == "gfx90a" ? MAX_STACK_SIZE_gfx90a : MAX_STACK_SIZE);
}
}
}
}
/**
* \brief Verify the device associated with communicator in both single and multi-device scenarios
* ******************************************************************************************/
TEST(Standalone, CommCuDevice_Check)
{
int numDevices;
HIPCALL(hipGetDeviceCount(&numDevices));
if (numDevices < 1) {
GTEST_SKIP() << "No devices available.";
}
// Test single comm initialization
ncclComm_t comm;
ncclUniqueId id;
NCCLCHECK(ncclGetUniqueId(&id));
HIPCALL(hipSetDevice(0));
NCCLCHECK(ncclCommInitRank(&comm, 1, id, 0));
// Verify device assignment
int device;
NCCLCHECK(ncclCommCuDevice(comm, &device));
ASSERT_EQ(device, 0);
NCCLCHECK(ncclCommDestroy(comm));
// Test multi-device scenario if available
if (numDevices > 1) {
std::vector<ncclComm_t> comms(numDevices);
// Initialize all communicators at once
NCCLCHECK(ncclCommInitAll(comms.data(), numDevices, nullptr));
// Verify device assignments
for (int i = 0; i < numDevices; i++) {
int assignedDevice;
NCCLCHECK(ncclCommCuDevice(comms[i], &assignedDevice));
ASSERT_EQ(assignedDevice, i);
}
// Clean up
for (int i = 0; i < numDevices; i++) {
NCCLCHECK(ncclCommDestroy(comms[i]));
}
}
}
/**
* \brief verifies that ncclCommUserRank correctly fails when provided with an invalid (null) communicator handle
* ******************************************************************************************/
TEST(Standalone, SplitComms_RankCheck_Basic_Failure) {
// Check for multi-gpu
int numDevices;
HIPCALL(hipGetDeviceCount(&numDevices));
if (numDevices < 2) {
GTEST_SKIP() << "This test requires at least 2 devices.";
}
// Initialize the original comms
std::vector<ncclComm_t> comms(numDevices);
NCCLCHECK(ncclCommInitAll(comms.data(), numDevices, nullptr));
// Create an invalid comm handle that will cause a failure
ncclComm_t invalidComm = nullptr;
// This NCCL_CHECK will fail because we're trying to query rank from a null communicator
int rank;
NCCLCHECK(ncclCommUserRank(invalidComm, &rank));
// Clean up comms
for (auto& comm : comms)
NCCLCHECK(ncclCommDestroy(comm));
}
}