5580cb7574
Signed-off-by: Deeksha Goplani <deeksha.goplani@amd.com>
[ROCm/rccl commit: 2486838465]
374 satır
13 KiB
C++
374 satır
13 KiB
C++
/*************************************************************************
|
|
* Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
|
*
|
|
* See LICENSE.txt for license information
|
|
************************************************************************/
|
|
|
|
#include <gtest/gtest.h>
|
|
#include <rccl/rccl.h>
|
|
|
|
#include "TestBed.hpp"
|
|
#include "StandaloneUtils.hpp"
|
|
|
|
namespace RcclUnitTesting
|
|
{
|
|
/**
|
|
* \brief Verify that each device is assigned to the right rank using ncclCommSplit API.
|
|
* ******************************************************************************************/
|
|
TEST(Standalone, SplitComms_RankCheck)
|
|
{
|
|
// Check for multi-gpu
|
|
int numDevices;
|
|
HIPCALL(hipGetDeviceCount(&numDevices));
|
|
if (numDevices < 2) {
|
|
GTEST_SKIP() << "This test requires at least 2 devices.";
|
|
}
|
|
|
|
// Initialize the original comms
|
|
std::vector<ncclComm_t> comms(numDevices);
|
|
NCCLCHECK(ncclCommInitAll(comms.data(), numDevices, nullptr));
|
|
|
|
// Split into new comms (round-robin)
|
|
std::vector<ncclComm_t> subComms(numDevices);
|
|
int numSubComms = 2;
|
|
|
|
std::map<int, int> mapCounter;
|
|
NCCLCHECK(ncclGroupStart());
|
|
for (int localRank = 0; localRank < numDevices; localRank++) {
|
|
NCCLCHECK(ncclCommSplit(comms[localRank], localRank % numSubComms, localRank, &subComms[localRank], NULL));
|
|
mapCounter[localRank % numSubComms]++;
|
|
}
|
|
NCCLCHECK(ncclGroupEnd());
|
|
|
|
// Check that new comms have correct subranks / ranks
|
|
for (int i = 0; i < numDevices; i++) {
|
|
int subCommRank, subCommNRank;
|
|
NCCLCHECK(ncclCommUserRank(subComms[i], &subCommRank));
|
|
NCCLCHECK(ncclCommCount(subComms[i], &subCommNRank));
|
|
|
|
ASSERT_EQ(subCommRank, i / numSubComms);
|
|
ASSERT_EQ(subCommNRank, mapCounter[i % numSubComms]);
|
|
}
|
|
|
|
// Clean up comms
|
|
for (auto& subComm : subComms)
|
|
NCCLCHECK(ncclCommDestroy(subComm));
|
|
for (auto& comm : comms)
|
|
NCCLCHECK(ncclCommDestroy(comm));
|
|
}
|
|
|
|
/**
|
|
* \brief Creates a communicator for each device and gathers them all in one rank.
|
|
* ******************************************************************************************/
|
|
TEST(Standalone, SplitComms_OneColor)
|
|
{
|
|
// Check for multi-gpu
|
|
int numDevices;
|
|
HIPCALL(hipGetDeviceCount(&numDevices));
|
|
if (numDevices < 2) {
|
|
GTEST_SKIP() << "This test requires at least 2 devices.";
|
|
}
|
|
|
|
// Initialize the original comms
|
|
std::vector<ncclComm_t> comms(numDevices);
|
|
NCCLCHECK(ncclCommInitAll(comms.data(), numDevices, nullptr));
|
|
|
|
// Split into new comms (all of the same color)
|
|
std::vector<ncclComm_t> subComms(numDevices);
|
|
NCCLCHECK(ncclGroupStart());
|
|
for (int localRank = 0; localRank < numDevices; localRank++)
|
|
NCCLCHECK(ncclCommSplit(comms[localRank], 0, localRank, &subComms[localRank], NULL));
|
|
NCCLCHECK(ncclGroupEnd());
|
|
|
|
// Validate results
|
|
for (int i = 0; i < numDevices; i++) {
|
|
int originalRank, originalNRank;
|
|
NCCLCHECK(ncclCommUserRank(comms[i], &originalRank));
|
|
NCCLCHECK(ncclCommCount(comms[i], &originalNRank));
|
|
|
|
int subCommRank, subCommNRank;
|
|
NCCLCHECK(ncclCommUserRank(subComms[i], &subCommRank));
|
|
NCCLCHECK(ncclCommCount(subComms[i], &subCommNRank));
|
|
|
|
ASSERT_EQ(originalRank, subCommRank);
|
|
ASSERT_EQ(originalNRank, subCommNRank);
|
|
}
|
|
|
|
// Clean up comms
|
|
for (auto& subComm : subComms)
|
|
NCCLCHECK(ncclCommDestroy(subComm));
|
|
for (auto& comm : comms)
|
|
NCCLCHECK(ncclCommDestroy(comm));
|
|
}
|
|
|
|
/**
|
|
* \brief Creates a communicator for each device and reduces them into (numDevices / 2) ranks.
|
|
* ******************************************************************************************/
|
|
TEST(Standalone, SplitComms_Reduce)
|
|
{
|
|
// Check for multi-gpu
|
|
int numDevices;
|
|
HIPCALL(hipGetDeviceCount(&numDevices));
|
|
if (numDevices < 2) {
|
|
GTEST_SKIP() << "This test requires at least 2 devices.";
|
|
}
|
|
|
|
// Initialize the original comms
|
|
std::vector<ncclComm_t> comms(numDevices);
|
|
NCCLCHECK(ncclCommInitAll(comms.data(), numDevices, nullptr));
|
|
|
|
// Split into new comms
|
|
int numReducedRanks = numDevices / 2;
|
|
std::vector<ncclComm_t> subComms(numDevices);
|
|
NCCLCHECK(ncclGroupStart());
|
|
for (int localRank = 0; localRank < numDevices; localRank++)
|
|
NCCLCHECK(ncclCommSplit(comms[localRank],
|
|
localRank < numReducedRanks ? 0 : NCCL_SPLIT_NOCOLOR,
|
|
localRank, &subComms[localRank], NULL));
|
|
NCCLCHECK(ncclGroupEnd());
|
|
|
|
// Validate results
|
|
for (int i = 0; i < numDevices; i++) {
|
|
int originalRank, originalNRank;
|
|
NCCLCHECK(ncclCommUserRank(comms[i], &originalRank));
|
|
NCCLCHECK(ncclCommCount(comms[i], &originalNRank));
|
|
|
|
if (i < numReducedRanks) {
|
|
int subCommRank, subCommNRank;
|
|
NCCLCHECK(ncclCommUserRank(subComms[i], &subCommRank));
|
|
NCCLCHECK(ncclCommCount(subComms[i], &subCommNRank));
|
|
|
|
ASSERT_EQ(originalRank, subCommRank);
|
|
ASSERT_EQ(subCommNRank, numReducedRanks);
|
|
} else {
|
|
ASSERT_EQ(subComms[i], nullptr);
|
|
}
|
|
}
|
|
|
|
// Cleanup comms
|
|
for (auto& subComm : subComms)
|
|
NCCLCHECK(ncclCommDestroy(subComm));
|
|
for (auto& comm : comms)
|
|
NCCLCHECK(ncclCommDestroy(comm));
|
|
}
|
|
|
|
/**
|
|
* \brief Verify there is no regression in timing for each protocol [LL, LL128, Simple]
|
|
* ******************************************************************************************/
|
|
TEST(Standalone, RegressionTiming)
|
|
{
|
|
// timing
|
|
using namespace std::chrono;
|
|
using Clock = std::chrono::high_resolution_clock;
|
|
int usElapsed, numIterations = 20, numWarmups = 5;
|
|
|
|
// Check for 2 GPUs
|
|
int numGpus;
|
|
HIPCALL(hipGetDeviceCount(&numGpus));
|
|
if (numGpus < 2) {
|
|
GTEST_SKIP() << "This test requires at least 2 devices.";
|
|
}
|
|
hipDeviceProp_t devProp;
|
|
HIPCALL(hipGetDeviceProperties(&devProp, 0));
|
|
// Initialize RCCL
|
|
constexpr int numRanks = 2;
|
|
std::vector<ncclComm_t> comms(numRanks);
|
|
std::vector<int*> gpuInput(numRanks);
|
|
std::vector<int*> gpuOutput(numRanks);
|
|
std::vector<hipStream_t> stream(numRanks);
|
|
|
|
char *proto = std::getenv("NCCL_PROTO");
|
|
const char* protocolList[3] = {"LL", "LL128", "Simple"};
|
|
|
|
for (auto p : protocolList)
|
|
{
|
|
usElapsed = 0;
|
|
if(strncmp("gfx12",devProp.gcnArchName,5) == 0) {
|
|
setenv("NCCL_PROTO", "Simple", 1);
|
|
} else {
|
|
setenv("NCCL_PROTO", p, 1);
|
|
}
|
|
|
|
NCCLCHECK(ncclCommInitAll(comms.data(), numRanks, nullptr));
|
|
|
|
// Prepare CPU data arrays
|
|
int N = 1250;
|
|
std::vector<int> cpuInput(N);
|
|
std::vector<int> cpuExpected(N);
|
|
for (int i = 0; i < N; i++) {
|
|
cpuInput[i] = i;
|
|
cpuExpected[i] = 2 * i;
|
|
}
|
|
|
|
// Prepare GPU data arrays
|
|
for (int rank = 0; rank < numRanks; rank++) {
|
|
HIPCALL(hipSetDevice(rank));
|
|
HIPCALL(hipStreamCreate(&stream[rank]));
|
|
HIPCALL(hipMalloc((void**)&gpuInput[rank], N * sizeof(int)));
|
|
HIPCALL(hipMalloc((void**)&gpuOutput[rank], N * sizeof(int)));
|
|
HIPCALL(hipMemcpy(gpuInput[rank], cpuInput.data(), N * sizeof(int), hipMemcpyHostToDevice));
|
|
HIPCALL(hipMemset(gpuOutput[rank], 0, N * sizeof(int)));
|
|
HIPCALL(hipDeviceSynchronize());
|
|
}
|
|
|
|
for (int iter = -numWarmups; iter < numIterations; iter++) {
|
|
|
|
for (int rank = 0; rank < numRanks; rank++) {
|
|
HIPCALL(hipSetDevice(rank));
|
|
HIPCALL(hipMemset(gpuOutput[rank], 0, N * sizeof(int)));
|
|
HIPCALL(hipDeviceSynchronize());
|
|
}
|
|
|
|
// Initiate the allreduce
|
|
NCCLCHECK(ncclGroupStart());
|
|
for (int rank = 0; rank < numRanks; rank++)
|
|
NCCLCHECK(ncclAllReduce(gpuInput[rank], gpuOutput[rank], N, ncclInt, ncclSum, comms[rank], stream[rank]));
|
|
ncclResult_t res = ncclGroupEnd();
|
|
|
|
if (res != ncclSuccess) continue;
|
|
|
|
const auto start = Clock::now();
|
|
|
|
// Wait for completion
|
|
for (int rank = 0; rank < numRanks; rank++) {
|
|
HIPCALL(hipStreamSynchronize(stream[rank]));
|
|
}
|
|
|
|
if (iter >= 0)
|
|
usElapsed += duration_cast<microseconds>(Clock::now() - start).count();
|
|
|
|
// Check results
|
|
std::vector<int> cpuOutput(N);
|
|
for (int rank = 0; rank < numRanks; rank++) {
|
|
HIPCALL(hipMemcpy(cpuOutput.data(), gpuOutput[rank], N * sizeof(int), hipMemcpyDeviceToHost));
|
|
HIPCALL(hipDeviceSynchronize());
|
|
for (int i = 0; i < N; i++)
|
|
ASSERT_EQ(cpuOutput[i], cpuExpected[i]);
|
|
}
|
|
}
|
|
|
|
EXPECT_LT(usElapsed/(double)numIterations, 5000);
|
|
printf("[ INFO ] protocol: %s, average runtime: %f microseconds\n", p, usElapsed/(double)numIterations);
|
|
// Release resources
|
|
for (int rank = 0; rank < numRanks; rank++){
|
|
HIPCALL(hipFree(gpuInput[rank]));
|
|
HIPCALL(hipFree(gpuOutput[rank]));
|
|
HIPCALL(hipStreamDestroy(stream[rank]));
|
|
NCCLCHECK(ncclCommDestroy(comms[rank]));
|
|
}
|
|
}
|
|
if (proto)
|
|
setenv("NCCL_PROTO", proto, 1);
|
|
else
|
|
unsetenv("NCCL_PROTO");
|
|
}
|
|
|
|
/**
|
|
* \brief Verify rccl generic kernel stack size for each gfx architecture is less than the
|
|
* expected MAX_STACK_SIZE.
|
|
* ******************************************************************************************/
|
|
TEST(Standalone, StackSize) {
|
|
const char* mainKernel = "ncclDevKernel";
|
|
|
|
// Look for the .co files
|
|
std::vector<std::string> coFileList = splitString(executeCommand("find ../ -type f -name \"librccl*.co\""), '\n');
|
|
|
|
// Check if the .co files exist in the build directory
|
|
if (coFileList.empty())
|
|
GTEST_SKIP() << "Skipping... Could not found required files in the build directory.";
|
|
|
|
for (const auto& file : coFileList) {
|
|
// Store the output in a list
|
|
std::string cmd = std::string(ROCM_PATH) + "/llvm/bin/llvm-readelf --notes " + file;
|
|
std::vector<std::string> metadata = splitString(executeCommand(cmd.c_str()), '\n');
|
|
|
|
// Skip if llvm is not installed
|
|
if (metadata.empty())
|
|
GTEST_SKIP() << "Skipping... llvm is not found.";
|
|
|
|
// Parse metadata from file and store it for each arch
|
|
ArchInfo archInfo = parseMetadata(metadata);
|
|
|
|
// iterate over each archs kernels
|
|
for (const auto& kernel : archInfo.kernels) {
|
|
if (kernel.name.find(mainKernel) != std::string::npos) {
|
|
// Kernel stack size should be less than or equal to the maxStackSize value
|
|
printf("[ INFO ] Arch: %s Kernel: %s Size: %d\n", archInfo.archName.c_str(), kernel.name.c_str(), kernel.privateSegmentFixedSize);
|
|
EXPECT_LE(kernel.privateSegmentFixedSize, archInfo.archName == "gfx90a" ? MAX_STACK_SIZE_gfx90a : MAX_STACK_SIZE);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
/**
|
|
* \brief Verify the device associated with communicator in both single and multi-device scenarios
|
|
* ******************************************************************************************/
|
|
TEST(Standalone, CommCuDevice_Check)
|
|
{
|
|
int numDevices;
|
|
HIPCALL(hipGetDeviceCount(&numDevices));
|
|
if (numDevices < 1) {
|
|
GTEST_SKIP() << "No devices available.";
|
|
}
|
|
|
|
// Test single comm initialization
|
|
ncclComm_t comm;
|
|
ncclUniqueId id;
|
|
NCCLCHECK(ncclGetUniqueId(&id));
|
|
HIPCALL(hipSetDevice(0));
|
|
NCCLCHECK(ncclCommInitRank(&comm, 1, id, 0));
|
|
|
|
// Verify device assignment
|
|
int device;
|
|
NCCLCHECK(ncclCommCuDevice(comm, &device));
|
|
ASSERT_EQ(device, 0);
|
|
NCCLCHECK(ncclCommDestroy(comm));
|
|
|
|
// Test multi-device scenario if available
|
|
if (numDevices > 1) {
|
|
std::vector<ncclComm_t> comms(numDevices);
|
|
|
|
// Initialize all communicators at once
|
|
NCCLCHECK(ncclCommInitAll(comms.data(), numDevices, nullptr));
|
|
|
|
// Verify device assignments
|
|
for (int i = 0; i < numDevices; i++) {
|
|
int assignedDevice;
|
|
NCCLCHECK(ncclCommCuDevice(comms[i], &assignedDevice));
|
|
ASSERT_EQ(assignedDevice, i);
|
|
}
|
|
|
|
// Clean up
|
|
for (int i = 0; i < numDevices; i++) {
|
|
NCCLCHECK(ncclCommDestroy(comms[i]));
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* \brief verifies that ncclCommUserRank correctly fails when provided with an invalid (null) communicator handle
|
|
* ******************************************************************************************/
|
|
TEST(Standalone, SplitComms_RankCheck_Basic_Failure) {
|
|
// Check for multi-gpu
|
|
int numDevices;
|
|
HIPCALL(hipGetDeviceCount(&numDevices));
|
|
if (numDevices < 2) {
|
|
GTEST_SKIP() << "This test requires at least 2 devices.";
|
|
}
|
|
|
|
// Initialize the original comms
|
|
std::vector<ncclComm_t> comms(numDevices);
|
|
NCCLCHECK(ncclCommInitAll(comms.data(), numDevices, nullptr));
|
|
|
|
// Create an invalid comm handle that will cause a failure
|
|
ncclComm_t invalidComm = nullptr;
|
|
|
|
// This NCCL_CHECK will fail because we're trying to query rank from a null communicator
|
|
int rank;
|
|
NCCLCHECK(ncclCommUserRank(invalidComm, &rank));
|
|
|
|
// Clean up comms
|
|
for (auto& comm : comms)
|
|
NCCLCHECK(ncclCommDestroy(comm));
|
|
}
|
|
}
|