Files
rocm-systems/test/AllReduceTests.cpp
T
2025-04-23 20:47:43 -07:00

253 lines
9.9 KiB
C++

/*************************************************************************
* Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "TestBed.hpp"
#include "CallCollectiveForked.hpp"
namespace RcclUnitTesting
{
TEST(AllReduce, OutOfPlace)
{
TestBed testBed;
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllReduce};
std::vector<ncclDataType_t> const dataTypes = {ncclFloat32, ncclFloat8e4m3, ncclFloat8e5m2};
std::vector<ncclRedOp_t> const redOps = {ncclSum};
std::vector<int> const roots = {0};
std::vector<int> const numElements = {393216, 384};
std::vector<bool> const inPlaceList = {false};
std::vector<bool> const managedMemList = {false};
std::vector<bool> const useHipGraphList = {false};
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements,
inPlaceList, managedMemList, useHipGraphList);
testBed.Finalize();
}
TEST(AllReduce, OutOfPlaceGraph)
{
TestBed testBed;
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllReduce};
std::vector<ncclDataType_t> const dataTypes = {ncclFloat16, ncclFloat64, ncclFloat8e4m3, ncclFloat8e5m2};
std::vector<ncclRedOp_t> const redOps = {ncclMin};
std::vector<int> const roots = {0};
std::vector<int> const numElements = {12888};
std::vector<bool> const inPlaceList = {false};
std::vector<bool> const managedMemList = {false};
std::vector<bool> const useHipGraphList = {true};
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements,
inPlaceList, managedMemList, useHipGraphList);
testBed.Finalize();
}
TEST(AllReduce, InPlace)
{
TestBed testBed;
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllReduce};
std::vector<ncclDataType_t> const dataTypes = {ncclInt32, ncclInt8};
std::vector<ncclRedOp_t> const redOps = {ncclProd};
std::vector<int> const roots = {0};
std::vector<int> const numElements = {384};
std::vector<bool> const inPlaceList = {true};
std::vector<bool> const managedMemList = {false};
std::vector<bool> const useHipGraphList = {false};
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements,
inPlaceList, managedMemList, useHipGraphList);
testBed.Finalize();
}
TEST(AllReduce, InPlaceGraph)
{
TestBed testBed;
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllReduce};
std::vector<ncclDataType_t> const dataTypes = {ncclInt32, ncclFloat8e4m3, ncclFloat8e5m2};
std::vector<ncclRedOp_t> const redOps = {ncclMax};
std::vector<int> const roots = {0};
std::vector<int> const numElements = {393216, 12888, 384};
std::vector<bool> const inPlaceList = {true};
std::vector<bool> const managedMemList = {false};
std::vector<bool> const useHipGraphList = {true};
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements,
inPlaceList, managedMemList, useHipGraphList);
testBed.Finalize();
}
TEST(AllReduce, ManagedMem)
{
TestBed testBed;
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllReduce};
std::vector<ncclDataType_t> const dataTypes = {ncclUint8, ncclUint64};
std::vector<ncclRedOp_t> const redOps = {ncclSum};
std::vector<int> const roots = {0};
std::vector<int> const numElements = {2500};
std::vector<bool> const inPlaceList = {false};
std::vector<bool> const managedMemList = {true};
std::vector<bool> const useHipGraphList = {false};
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements,
inPlaceList, managedMemList, useHipGraphList);
testBed.Finalize();
}
TEST(AllReduce, Channels)
{
TestBed testBed;
if(testBed.ev.maxGpus >= 8) {
if(testBed.ev.isGfx94) {
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllReduce};
std::vector<ncclDataType_t> const dataTypes = {ncclBfloat16};
std::vector<ncclRedOp_t> const redOps = {ncclSum};
std::vector<int> const roots = {0};
std::vector<int> const numElements = {64 * 1024 * 1024, 1024};
std::vector<bool> const inPlaceList = {false};
std::vector<bool> const managedMemList = {false};
std::vector<bool> const useHipGraphList = {false, true};
std::vector<const char *> const channelList = {"84", "112"};
bool const enableSweep = false;
for (auto channel : channelList) {
setenv("NCCL_MIN_NCHANNELS", channel, 1);
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements,
inPlaceList, managedMemList, useHipGraphList, enableSweep);
testBed.Finalize();
unsetenv("NCCL_MIN_NCHANNELS");
}
}
}
}
TEST(AllReduce, ManagedMemGraph)
{
TestBed testBed;
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllReduce};
std::vector<ncclDataType_t> const dataTypes = {ncclFloat64, ncclBfloat16};
std::vector<ncclRedOp_t> const redOps = {ncclSum};
std::vector<int> const roots = {0};
std::vector<int> const numElements = {4314};
std::vector<bool> const inPlaceList = {false};
std::vector<bool> const managedMemList = {true};
std::vector<bool> const useHipGraphList = {true};
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements,
inPlaceList, managedMemList, useHipGraphList);
testBed.Finalize();
}
// This tests using custom pre-mult scalars reductions
TEST(AllReduce, PreMultScalar)
{
TestBed testBed;
// Configuration
ncclFunc_t const funcType = ncclCollAllReduce;
std::vector<ncclDataType_t> const& dataTypes = {ncclFloat32};
ncclRedOp_t const redOp = ncclSum;
std::vector<int> const numElements = {384 * 1024, 384 * 32, 384};
bool const inPlace = false;
bool const useManagedMem = false;
OptionalColArgs options;
// Terminate the test as soon as first failure occurs
bool isCorrect = true;
for (int totalRanks : testBed.ev.GetNumGpusList())
for (int isMultiProcess : testBed.ev.GetIsMultiProcessList())
{
int const numProcesses = isMultiProcess ? totalRanks : 1;
const std::vector<int>& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder();
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, gpuPriorityOrder));
for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
{
ncclDataType_t const dataType = dataTypes[dataIdx];
// Set scalars per rank
PtrUnion scalarsPerRank;
scalarsPerRank.AllocateCpuMem(totalRanks * DataTypeToBytes(dataType));
for (int i = 0; i < totalRanks; i++)
{
double F = i;
scalarsPerRank.Set(dataType, i, i, F);
}
int const numBytes = totalRanks * DataTypeToBytes(dataType);
memcpy(options.scalarTransport.ptr, scalarsPerRank.ptr, numBytes);
// Test various scalar residence modes
for (int scalarMode = 0; scalarMode <= 1 && isCorrect; ++scalarMode)
{
if (testBed.ev.showNames)
INFO("%s %d-ranks AllReduce (custom-scalar Mode %d %s)\n",
isMultiProcess ? "MP" : "SP",
totalRanks, scalarMode, ncclDataTypeNames[dataType]);
for (int i = 0; i < numElements.size() && isCorrect; ++i)
{
options.scalarMode = scalarMode;
options.redOp = redOp;
testBed.SetCollectiveArgs(funcType, dataType,
numElements[i], numElements[i],
options);
// For performance, only allocate and prepare data on largest size
if (i == 0)
{
testBed.AllocateMem(inPlace, useManagedMem);
testBed.PrepareData();
}
testBed.ExecuteCollectives();
testBed.ValidateResults(isCorrect);
}
testBed.DeallocateMem();
}
}
testBed.DestroyComms();
}
testBed.Finalize();
}
TEST(AllReduce, UserBufferRegistration)
{
const int nranks = 8;
size_t count = 2048;
std::vector<int> sendBuff(count, 0);
std::vector<int> recvBuff(count, 0);
std::vector<int> expected(count, 0);
for (int i = 0; i < count; ++i){
sendBuff[i] = i;
expected[i] = i * nranks;
}
callCollectiveForked(nranks, ncclCollAllReduce, sendBuff, recvBuff, expected);
}
TEST(AllReduce, ManagedMemUserBufferRegistration)
{
const int nranks = 8;
size_t count = 2048;
std::vector<int> sendBuff(count, 0);
std::vector<int> recvBuff(count, 0);
std::vector<int> expected(count, 0);
const bool use_managed_mem = true;
for (int i = 0; i < count; ++i){
sendBuff[i] = i;
expected[i] = i * nranks;
}
callCollectiveForked(nranks, ncclCollAllReduce, sendBuff, recvBuff, expected, use_managed_mem);
}
}