Unit test refactor (#500)
Refactoring and consolidating single-process / multi-process unit testing
This commit is contained in:
@@ -21,7 +21,7 @@ def runTestCommand (platform, project, gfilter)
|
||||
def command = """#!/usr/bin/env bash
|
||||
set -x
|
||||
cd ${project.paths.project_build_prefix}/build/release/test
|
||||
${sudo} NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 ./UnitTests --gtest_filter=${gfilter} --gtest_output=xml --gtest_color=yes
|
||||
${sudo} UT_SHOW_NAMES=1 HSA_FORCE_FINE_GRAIN_PCIE=1 ./UnitTests --gtest_filter=${gfilter} --gtest_output=xml --gtest_color=yes
|
||||
"""
|
||||
|
||||
platform.runCommand(this, command)
|
||||
|
||||
@@ -9,12 +9,12 @@ import com.amd.project.*
|
||||
import com.amd.docker.*
|
||||
import java.nio.file.Path
|
||||
|
||||
def runCI =
|
||||
def runCI =
|
||||
{
|
||||
nodeDetails, jobName->
|
||||
|
||||
def prj = new rocProject('rccl', 'PreCheckin')
|
||||
|
||||
|
||||
prj.timeout.test = 1440
|
||||
prj.paths.build_command = './install.sh -t '
|
||||
|
||||
@@ -32,25 +32,25 @@ def runCI =
|
||||
commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
|
||||
commonGroovy.runCompileCommand(platform, project, jobName)
|
||||
}
|
||||
|
||||
|
||||
def testCommand =
|
||||
{
|
||||
platform, project->
|
||||
|
||||
commonGroovy.runTestCommand(platform, project, "*sum_float32*")
|
||||
commonGroovy.runTestCommand(platform, project, "*")
|
||||
}
|
||||
|
||||
def packageCommand =
|
||||
{
|
||||
platform, project->
|
||||
|
||||
|
||||
commonGroovy.runPackageCommand(platform, project, jobName)
|
||||
}
|
||||
|
||||
buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
|
||||
}
|
||||
|
||||
ci: {
|
||||
ci: {
|
||||
String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
|
||||
|
||||
def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]]
|
||||
@@ -58,17 +58,17 @@ ci: {
|
||||
propertyList = auxiliary.appendPropertyList(propertyList)
|
||||
|
||||
def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([sles15sp1:['4gfx906'],centos8:['8gfx908'],centos7:['8gfx906'],ubuntu18:['4gfx906', '4gfx908']])]
|
||||
|
||||
|
||||
jobNameList = auxiliary.appendJobNameList(jobNameList)
|
||||
|
||||
propertyList.each
|
||||
|
||||
propertyList.each
|
||||
{
|
||||
jobName, property->
|
||||
if (urlJobName == jobName)
|
||||
properties(auxiliary.addCommonProperties(property))
|
||||
}
|
||||
|
||||
jobNameList.each
|
||||
jobNameList.each
|
||||
{
|
||||
jobName, nodeDetails->
|
||||
if (urlJobName == jobName)
|
||||
|
||||
+1
-1
@@ -229,7 +229,7 @@ if ($run_tests); then
|
||||
if ($run_tests_all); then
|
||||
./test/UnitTests
|
||||
else
|
||||
./test/UnitTests --gtest_filter="BroadcastCorrectnessSweep*:*float32*"
|
||||
./test/UnitTests --gtest_filter="AllReduce.*"
|
||||
fi
|
||||
else
|
||||
echo "Unit tests have not been built yet; please re-run script with -t to build unit tests."
|
||||
|
||||
@@ -96,6 +96,7 @@ void CliqueManager::CleanUp()
|
||||
if (m_cliqueMode == CLIQUE_SINGLE_NODE)
|
||||
{
|
||||
// Release caches
|
||||
INFO(NCCL_COLL, "Rank %d deleting IPC caches", m_rank);
|
||||
if (m_ipcHandleSendCache) delete m_ipcHandleSendCache;
|
||||
if (m_ipcHandleRecvCache) delete m_ipcHandleRecvCache;
|
||||
|
||||
@@ -494,18 +495,24 @@ ncclResult_t CliqueManager::CheckCacheForPtr(void* devPtr,
|
||||
uint64_t realAddr = (uint64_t)devPtr;
|
||||
handlePair->second = realAddr - baseAddr;
|
||||
|
||||
CUDACHECK(hipIpcGetMemHandle(&handlePair->first, (void*)baseAddr));
|
||||
|
||||
/* Disabling cache until proper deallocation methods are available
|
||||
// IPC handles are only supported for base address pointers
|
||||
NcclIpcHandleSendCache::iterator it = cache->find(baseAddr);
|
||||
|
||||
if (it == cache->end())
|
||||
{
|
||||
INFO(NCCL_COLL, "Rank %d searching IPC handle cache for %p (not found)", rank, devPtr);
|
||||
CUDACHECK(hipIpcGetMemHandle(&handlePair->first, (void*)baseAddr));
|
||||
cache->insert(baseAddr, handlePair->first);
|
||||
}
|
||||
else
|
||||
{
|
||||
INFO(NCCL_COLL, "Rank %d searching IPC handle cache for %p (found!)", rank, devPtr);
|
||||
handlePair->first = (it->second).first;
|
||||
}
|
||||
*/
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -513,10 +520,16 @@ ncclResult_t CliqueManager::CheckCacheForHandle(std::pair<hipIpcMemHandle_t, siz
|
||||
NcclIpcHandleRecvCache* cache,
|
||||
void** ptr)
|
||||
{
|
||||
// Until proper deallocation hooks are implemented, receive cache can not be used
|
||||
// Handles will need to be extract each time
|
||||
void* baseAddr;
|
||||
CUDACHECK(hipIpcOpenMemHandle(&baseAddr, handlePair.first, hipIpcMemLazyEnablePeerAccess));
|
||||
|
||||
/*
|
||||
NcclIpcHandleRecvCache::iterator it = cache->find(handlePair.first);
|
||||
|
||||
// Get base address pointer from cache if it exists
|
||||
void* baseAddr;
|
||||
|
||||
if (it == cache->end())
|
||||
{
|
||||
CUDACHECK(hipIpcOpenMemHandle(&baseAddr, handlePair.first, hipIpcMemLazyEnablePeerAccess));
|
||||
@@ -526,6 +539,7 @@ ncclResult_t CliqueManager::CheckCacheForHandle(std::pair<hipIpcMemHandle_t, siz
|
||||
{
|
||||
baseAddr = (it->second).first;
|
||||
}
|
||||
*/
|
||||
|
||||
// Modify base address pointer with offset
|
||||
uint64_t realAddr = (uint64_t)baseAddr + handlePair.second;
|
||||
|
||||
@@ -79,14 +79,14 @@ public:
|
||||
static ncclResult_t BootstrapRootInit(int pid, unsigned long hash);
|
||||
|
||||
protected:
|
||||
static ncclResult_t CheckCacheForPtr(void* devPtr,
|
||||
NcclIpcHandleSendCache* cache,
|
||||
int rank,
|
||||
std::pair<hipIpcMemHandle_t, size_t>* handlePair);
|
||||
ncclResult_t CheckCacheForPtr(void* devPtr,
|
||||
NcclIpcHandleSendCache* cache,
|
||||
int rank,
|
||||
std::pair<hipIpcMemHandle_t, size_t>* handlePair);
|
||||
|
||||
static ncclResult_t CheckCacheForHandle(std::pair<hipIpcMemHandle_t, size_t> const& handlePair,
|
||||
NcclIpcHandleRecvCache* cache,
|
||||
void** ptr);
|
||||
ncclResult_t CheckCacheForHandle(std::pair<hipIpcMemHandle_t, size_t> const& handlePair,
|
||||
NcclIpcHandleRecvCache* cache,
|
||||
void** ptr);
|
||||
|
||||
int m_rank; // Associated rank
|
||||
int m_numRanks; // Total number of ranks
|
||||
|
||||
@@ -0,0 +1,26 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "TestBed.hpp"
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
TEST(AllGather, InPlace)
|
||||
{
|
||||
TestBed testBed;
|
||||
|
||||
// Configuration
|
||||
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllGather};
|
||||
std::vector<ncclDataType_t> const dataTypes = {ncclInt8, ncclInt32, ncclInt64};
|
||||
std::vector<ncclRedOp_t> const redOps = {ncclSum};
|
||||
std::vector<int> const roots = {0};
|
||||
std::vector<int> const numElements = {1048576, 53327, 1024};
|
||||
std::vector<bool> const inPlaceList = {true};
|
||||
std::vector<bool> const managedMemList = {false};
|
||||
|
||||
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
|
||||
testBed.Finalize();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,26 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "TestBed.hpp"
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
TEST(AllGather, ManagedMem)
|
||||
{
|
||||
TestBed testBed;
|
||||
|
||||
// Configuration
|
||||
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllGather};
|
||||
std::vector<ncclDataType_t> const dataTypes = {ncclUint8, ncclUint32, ncclUint64};
|
||||
std::vector<ncclRedOp_t> const redOps = {ncclSum};
|
||||
std::vector<int> const roots = {0};
|
||||
std::vector<int> const numElements = {1048576, 53327, 1024};
|
||||
std::vector<bool> const inPlaceList = {false};
|
||||
std::vector<bool> const managedMemList = {true};
|
||||
|
||||
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
|
||||
testBed.Finalize();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,26 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "TestBed.hpp"
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
TEST(AllGather, OutOfPlace)
|
||||
{
|
||||
TestBed testBed;
|
||||
|
||||
// Configuration
|
||||
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllGather};
|
||||
std::vector<ncclDataType_t> const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16};
|
||||
std::vector<ncclRedOp_t> const redOps = {ncclSum};
|
||||
std::vector<int> const roots = {0};
|
||||
std::vector<int> const numElements = {1048576, 53327, 1024};
|
||||
std::vector<bool> const inPlaceList = {false};
|
||||
std::vector<bool> const managedMemList = {false};
|
||||
|
||||
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
|
||||
testBed.Finalize();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,31 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "TestBed.hpp"
|
||||
#include <cstdlib>
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
TEST(AllReduce, Clique)
|
||||
{
|
||||
// Set clique env var prior to TestBed
|
||||
setenv("RCCL_ENABLE_CLIQUE", "1", 1);
|
||||
|
||||
TestBed testBed;
|
||||
|
||||
// Configuration
|
||||
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllReduce};
|
||||
std::vector<ncclDataType_t> const dataTypes = testBed.GetAllSupportedDataTypes();
|
||||
std::vector<ncclRedOp_t> const redOps = testBed.GetAllSupportedRedOps();
|
||||
std::vector<int> const roots = {0};
|
||||
std::vector<int> const numElements = {1048576, 53327, 1024};
|
||||
std::vector<bool> const inPlaceList = {false, true};
|
||||
std::vector<bool> const managedMemList = {false};
|
||||
|
||||
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
|
||||
testBed.Finalize();
|
||||
|
||||
unsetenv("RCCL_ENABLE_CLIQUE");
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,63 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "TestBed.hpp"
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
TEST(AllReduce, GroupCall)
|
||||
{
|
||||
TestBed testBed;
|
||||
|
||||
// Configuration
|
||||
ncclFunc_t const funcType = ncclCollAllReduce;
|
||||
std::vector<ncclDataType_t> const& dataTypes = {ncclFloat};
|
||||
std::vector<ncclRedOp_t> const& redOps = {ncclSum};
|
||||
std::vector<int> const numElements = {1048576, 53327, 1024};
|
||||
int const root = 0;
|
||||
bool const inPlace = false;
|
||||
bool const useManagedMem = false;
|
||||
int const numCollPerGroup = numElements.size();
|
||||
|
||||
// This tests runs 3 collectives in the same group call
|
||||
bool isCorrect = true;
|
||||
for (int totalRanks = testBed.ev.minGpus; totalRanks <= testBed.ev.maxGpus && isCorrect; ++totalRanks)
|
||||
for (int isMultiProcess = 0; isMultiProcess <= 1 && isCorrect; ++isMultiProcess)
|
||||
{
|
||||
// Test either single process all GPUs, or 1 process per GPU
|
||||
int const numProcesses = isMultiProcess ? totalRanks : 1;
|
||||
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup);
|
||||
|
||||
for (int redOpIdx = 0; redOpIdx < redOps.size() && isCorrect; ++redOpIdx)
|
||||
for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
|
||||
{
|
||||
if (testBed.ev.showNames)
|
||||
INFO("%s process %2d-ranks AllReduce %d Grouped Calls (%s-%s)\n",
|
||||
isMultiProcess ? "Multi " : "Single",
|
||||
totalRanks, numCollPerGroup,
|
||||
ncclRedOpNames[redOps[redOpIdx]], ncclDataTypeNames[dataTypes[dataIdx]]);
|
||||
|
||||
// Run all element sizes in parallel as single group
|
||||
for (int collIdx = 0; collIdx < numCollPerGroup; ++collIdx)
|
||||
{
|
||||
testBed.SetCollectiveArgs(funcType,
|
||||
dataTypes[dataIdx],
|
||||
redOps[redOpIdx],
|
||||
root,
|
||||
numElements[collIdx],
|
||||
numElements[collIdx],
|
||||
collIdx);
|
||||
}
|
||||
testBed.AllocateMem(inPlace, useManagedMem);
|
||||
testBed.PrepareData();
|
||||
testBed.ExecuteCollectives();
|
||||
testBed.ValidateResults(isCorrect);
|
||||
testBed.DeallocateMem();
|
||||
}
|
||||
testBed.DestroyComms();
|
||||
}
|
||||
testBed.Finalize();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,26 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "TestBed.hpp"
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
TEST(AllReduce, InPlace)
|
||||
{
|
||||
TestBed testBed;
|
||||
|
||||
// Configuration
|
||||
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllReduce};
|
||||
std::vector<ncclDataType_t> const dataTypes = {ncclInt8, ncclInt32, ncclFloat32};
|
||||
std::vector<ncclRedOp_t> const redOps = {ncclSum, ncclProd};
|
||||
std::vector<int> const roots = {0};
|
||||
std::vector<int> const numElements = {1048576, 53327, 1024};
|
||||
std::vector<bool> const inPlaceList = {true};
|
||||
std::vector<bool> const managedMemList = {false};
|
||||
|
||||
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
|
||||
testBed.Finalize();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,26 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "TestBed.hpp"
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
TEST(AllReduce, ManagedMem)
|
||||
{
|
||||
TestBed testBed;
|
||||
|
||||
// Configuration
|
||||
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllReduce};
|
||||
std::vector<ncclDataType_t> const dataTypes = {ncclFloat32, ncclUint8, ncclUint64};
|
||||
std::vector<ncclRedOp_t> const redOps = {ncclSum, ncclMax};
|
||||
std::vector<int> const roots = {0};
|
||||
std::vector<int> const numElements = {1048576, 53327, 1024};
|
||||
std::vector<bool> const inPlaceList = {false};
|
||||
std::vector<bool> const managedMemList = {true};
|
||||
|
||||
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
|
||||
testBed.Finalize();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,26 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "TestBed.hpp"
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
TEST(AllReduce, OutOfPlace)
|
||||
{
|
||||
TestBed testBed;
|
||||
|
||||
// Configuration
|
||||
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllReduce};
|
||||
std::vector<ncclDataType_t> const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16};
|
||||
std::vector<ncclRedOp_t> const redOps = {ncclSum, ncclMin};
|
||||
std::vector<int> const roots = {0};
|
||||
std::vector<int> const numElements = {1048576, 53327, 1024};
|
||||
std::vector<bool> const inPlaceList = {false};
|
||||
std::vector<bool> const managedMemList = {false};
|
||||
|
||||
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
|
||||
testBed.Finalize();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,74 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "TestBed.hpp"
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
// This tests using custom pre-mult scalars reductions
|
||||
TEST(AllReduce, PreMultScalar)
|
||||
{
|
||||
TestBed testBed;
|
||||
|
||||
// Configuration
|
||||
ncclFunc_t const funcType = ncclCollAllReduce;
|
||||
std::vector<ncclDataType_t> const& dataTypes = {ncclInt32, ncclFloat32, ncclFloat64};
|
||||
ncclRedOp_t const redOp = ncclSum;
|
||||
std::vector<int> const numElements = {1048576, 1024};
|
||||
int const root = 0;
|
||||
bool const inPlace = false;
|
||||
bool const useManagedMem = false;
|
||||
|
||||
// Terminate the test as soon as first failure occurs
|
||||
bool isCorrect = true;
|
||||
for (int totalRanks = testBed.ev.minGpus; totalRanks <= testBed.ev.maxGpus && isCorrect; ++totalRanks)
|
||||
for (int isMultiProcess = 0; isMultiProcess <= 1; ++isMultiProcess)
|
||||
{
|
||||
int const numProcesses = isMultiProcess ? totalRanks : 1;
|
||||
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks));
|
||||
|
||||
for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
|
||||
{
|
||||
ncclDataType_t const dataType = dataTypes[dataIdx];
|
||||
|
||||
// Set scalars per rank
|
||||
PtrUnion scalarsPerRank;
|
||||
scalarsPerRank.AllocateCpuMem(totalRanks * DataTypeToBytes(dataType));
|
||||
for (int i = 0; i < totalRanks; i++)
|
||||
{
|
||||
double F = i;
|
||||
scalarsPerRank.Set(dataType, i, i, F);
|
||||
}
|
||||
|
||||
// Test various scalar residence modes
|
||||
for (int scalarMode = 0; scalarMode <= 1 && isCorrect; ++scalarMode)
|
||||
{
|
||||
if (testBed.ev.showNames)
|
||||
INFO("%s process %2d-ranks AllReduce (custom-scalar Mode %d %s)\n",
|
||||
isMultiProcess ? "Multi " : "Single",
|
||||
totalRanks, scalarMode, ncclDataTypeNames[dataType]);
|
||||
|
||||
for (int i = 0; i < numElements.size() && isCorrect; ++i)
|
||||
{
|
||||
testBed.SetCollectiveArgs(funcType, dataType, redOp, root,
|
||||
numElements[i], numElements[i],
|
||||
-1, -1, scalarsPerRank, scalarMode);
|
||||
// For performance, only allocate and prepare data on largest size
|
||||
if (i == 0)
|
||||
{
|
||||
testBed.AllocateMem(inPlace, useManagedMem);
|
||||
testBed.PrepareData();
|
||||
}
|
||||
testBed.ExecuteCollectives();
|
||||
testBed.ValidateResults(isCorrect);
|
||||
}
|
||||
testBed.DeallocateMem();
|
||||
}
|
||||
}
|
||||
testBed.DestroyComms();
|
||||
}
|
||||
testBed.Finalize();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,26 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "TestBed.hpp"
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
TEST(AllToAll, ManagedMem)
|
||||
{
|
||||
TestBed testBed;
|
||||
|
||||
// Configuration
|
||||
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllToAll};
|
||||
std::vector<ncclDataType_t> const dataTypes = {ncclUint8, ncclUint32, ncclUint64};
|
||||
std::vector<ncclRedOp_t> const redOps = {ncclSum};
|
||||
std::vector<int> const roots = {0};
|
||||
std::vector<int> const numElements = {1048576, 53327, 1024};
|
||||
std::vector<bool> const inPlaceList = {false};
|
||||
std::vector<bool> const managedMemList = {true};
|
||||
|
||||
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
|
||||
testBed.Finalize();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,26 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "TestBed.hpp"
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
TEST(AllToAll, OutOfPlace)
|
||||
{
|
||||
TestBed testBed;
|
||||
|
||||
// Configuration
|
||||
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllToAll};
|
||||
std::vector<ncclDataType_t> const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16};
|
||||
std::vector<ncclRedOp_t> const redOps = {ncclSum};
|
||||
std::vector<int> const roots = {0};
|
||||
std::vector<int> const numElements = {1048576, 53327, 1024};
|
||||
std::vector<bool> const inPlaceList = {false};
|
||||
std::vector<bool> const managedMemList = {false};
|
||||
|
||||
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
|
||||
testBed.Finalize();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,26 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "TestBed.hpp"
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
TEST(Broadcast, InPlace)
|
||||
{
|
||||
TestBed testBed;
|
||||
|
||||
// Configuration
|
||||
std::vector<ncclFunc_t> const funcTypes = {ncclCollBroadcast};
|
||||
std::vector<ncclDataType_t> const dataTypes = {ncclInt8, ncclInt32, ncclInt64};
|
||||
std::vector<ncclRedOp_t> const redOps = {ncclSum};
|
||||
std::vector<int> const roots = {0};
|
||||
std::vector<int> const numElements = {1048576, 53327, 1024};
|
||||
std::vector<bool> const inPlaceList = {true};
|
||||
std::vector<bool> const managedMemList = {false};
|
||||
|
||||
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
|
||||
testBed.Finalize();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,26 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "TestBed.hpp"
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
TEST(Broadcast, ManagedMem)
|
||||
{
|
||||
TestBed testBed;
|
||||
|
||||
// Configuration
|
||||
std::vector<ncclFunc_t> const funcTypes = {ncclCollBroadcast};
|
||||
std::vector<ncclDataType_t> const dataTypes = {ncclUint8, ncclUint32, ncclUint64};
|
||||
std::vector<ncclRedOp_t> const redOps = {ncclSum};
|
||||
std::vector<int> const roots = {0};
|
||||
std::vector<int> const numElements = {1048576, 53327, 1024};
|
||||
std::vector<bool> const inPlaceList = {false};
|
||||
std::vector<bool> const managedMemList = {true};
|
||||
|
||||
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
|
||||
testBed.Finalize();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,26 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "TestBed.hpp"
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
TEST(Broadcast, OutOfPlace)
|
||||
{
|
||||
TestBed testBed;
|
||||
|
||||
// Configuration
|
||||
std::vector<ncclFunc_t> const funcTypes = {ncclCollBroadcast};
|
||||
std::vector<ncclDataType_t> const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16};
|
||||
std::vector<ncclRedOp_t> const redOps = {ncclSum};
|
||||
std::vector<int> const roots = {1};
|
||||
std::vector<int> const numElements = {1048576, 53327, 1024};
|
||||
std::vector<bool> const inPlaceList = {false};
|
||||
std::vector<bool> const managedMemList = {false};
|
||||
|
||||
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
|
||||
testBed.Finalize();
|
||||
}
|
||||
}
|
||||
+76
-62
@@ -3,87 +3,103 @@ cmake_minimum_required(VERSION 2.8.12)
|
||||
|
||||
if(BUILD_TESTS)
|
||||
|
||||
message("Going to build unit tests (Installed in /test/UnitTests)")
|
||||
message("Building unit tests (Installed in /test/UnitTests)")
|
||||
|
||||
find_program(CHRPATH chrpath)
|
||||
if(NOT CHRPATH)
|
||||
message(FATAL_ERROR "chrpath is required for UnitTests. Please install (e.g. sudo apt-get install chrpath)")
|
||||
endif()
|
||||
|
||||
include_directories(${GTEST_INCLUDE_DIRS})
|
||||
find_package(hsa-runtime64 PATHS /opt/rocm )
|
||||
if(${hsa-runtime64_FOUND})
|
||||
message("hsa-runtime64 found @ ${hsa-runtime64_DIR} ")
|
||||
else()
|
||||
message("find_package did NOT find hsa-runtime64, finding it the OLD Way")
|
||||
message("Looking for header files in ${ROCR_INC_DIR}")
|
||||
message("Looking for library files in ${ROCR_LIB_DIR}")
|
||||
|
||||
# Search for ROCr header file in user defined locations
|
||||
find_path(ROCR_HDR hsa.h PATHS ${ROCR_INC_DIR} "/opt/rocm" PATH_SUFFIXES include/hsa REQUIRED)
|
||||
INCLUDE_DIRECTORIES(${ROCR_HDR})
|
||||
|
||||
# Search for ROCr library file in user defined locations
|
||||
find_library(ROCR_LIB ${CORE_RUNTIME_TARGET} PATHS ${ROCR_LIB_DIR} "/opt/rocm" PATH_SUFFIXES lib lib64 REQUIRED)
|
||||
endif()
|
||||
|
||||
include_directories(${GTEST_INCLUDE_DIRS} ./common)
|
||||
|
||||
# Collect testing framework source files
|
||||
set (COMMON_SOURCE_FILES
|
||||
common/main.cpp
|
||||
common/CollectiveArgs.cpp
|
||||
common/EnvVars.cpp
|
||||
common/PrepDataFuncs.cpp
|
||||
common/PtrUnion.cpp
|
||||
common/TestBed.cpp
|
||||
common/TestBedChild.cpp
|
||||
)
|
||||
|
||||
# Collect source files for tests
|
||||
if(BUILD_ALLREDUCE_ONLY)
|
||||
set(TEST_SOURCES_SINGLE_PROCESS
|
||||
test_AllReduce.cpp
|
||||
test_AllReduceAbort.cpp
|
||||
test_AllReduceGroup.cpp
|
||||
set(TEST_SOURCE_FILES
|
||||
AllReduce_Clique.cpp
|
||||
AllReduce_GroupCall.cpp
|
||||
AllReduce_InPlace.cpp
|
||||
AllReduce_ManagedMem.cpp
|
||||
AllReduce_OutOfPlace.cpp
|
||||
AllReduce_PreMultScalar.cpp
|
||||
)
|
||||
else()
|
||||
# Collect source files for tests
|
||||
set(TEST_SOURCES_SINGLE_PROCESS
|
||||
test_AllGather.cpp
|
||||
test_AllReduce.cpp
|
||||
test_AllReduceGroup.cpp
|
||||
test_Broadcast.cpp
|
||||
test_Reduce.cpp
|
||||
test_ReduceScatter.cpp
|
||||
test_GroupCalls.cpp
|
||||
test_CombinedCalls.cpp
|
||||
test_AllReduceAbort.cpp
|
||||
test_BroadcastAbort.cpp
|
||||
test_Scatter.cpp
|
||||
test_Gather.cpp
|
||||
test_AllToAll.cpp
|
||||
test_AllToAllv.cpp
|
||||
)
|
||||
set(TEST_SOURCE_FILES
|
||||
#AllReduce
|
||||
AllReduce_Clique.cpp
|
||||
AllReduce_GroupCall.cpp
|
||||
AllReduce_InPlace.cpp
|
||||
AllReduce_ManagedMem.cpp
|
||||
AllReduce_OutOfPlace.cpp
|
||||
AllReduce_PreMultScalar.cpp
|
||||
#AllGather
|
||||
AllGather_InPlace.cpp
|
||||
AllGather_ManagedMem.cpp
|
||||
AllGather_OutOfPlace.cpp
|
||||
#AllToAll
|
||||
AllToAll_OutOfPlace.cpp
|
||||
AllToAll_ManagedMem.cpp
|
||||
#Broadcast
|
||||
Broadcast_InPlace.cpp
|
||||
Broadcast_ManagedMem.cpp
|
||||
Broadcast_OutOfPlace.cpp
|
||||
#Reduce
|
||||
Reduce_InPlace.cpp
|
||||
Reduce_ManagedMem.cpp
|
||||
Reduce_OutOfPlace.cpp
|
||||
#ReduceScatter
|
||||
ReduceScatter_InPlace.cpp
|
||||
ReduceScatter_ManagedMem.cpp
|
||||
ReduceScatter_OutOfPlace.cpp
|
||||
#Scatter
|
||||
Scatter_InPlace.cpp
|
||||
Scatter_ManagedMem.cpp
|
||||
Scatter_OutOfPlace.cpp
|
||||
#Gather
|
||||
Gather_InPlace.cpp
|
||||
Gather_ManagedMem.cpp
|
||||
Gather_OutOfPlace.cpp
|
||||
)
|
||||
endif()
|
||||
if(BUILD_ALLREDUCE_ONLY)
|
||||
set(TEST_SOURCES_MULTI_PROCESS
|
||||
test_AllReduceMultiProcess.cpp
|
||||
test_AllReduceGroupMultiProcess.cpp
|
||||
)
|
||||
else()
|
||||
set(TEST_SOURCES_MULTI_PROCESS
|
||||
test_AllGatherMultiProcess.cpp
|
||||
test_AllReduceMultiProcess.cpp
|
||||
test_AllReduceGroupMultiProcess.cpp
|
||||
test_AllToAllMultiProcess.cpp
|
||||
test_BroadcastMultiProcess.cpp
|
||||
test_CombinedCallsMultiProcess.cpp
|
||||
test_GatherMultiProcess.cpp
|
||||
test_GroupCallsMultiProcess.cpp
|
||||
test_ReduceMultiProcess.cpp
|
||||
test_ReduceScatterMultiProcess.cpp
|
||||
test_ScatterMultiProcess.cpp
|
||||
)
|
||||
endif()
|
||||
add_executable(UnitTests ${TEST_SOURCES_SINGLE_PROCESS})
|
||||
|
||||
add_executable(UnitTests ${COMMON_SOURCE_FILES} ${TEST_SOURCE_FILES})
|
||||
target_include_directories(UnitTests PRIVATE ${ROCM_PATH} ${GTEST_INCLUDE_DIRS})
|
||||
target_link_libraries(UnitTests PRIVATE ${GTEST_BOTH_LIBRARIES})
|
||||
target_link_libraries(UnitTests PRIVATE hip::host hip::device)
|
||||
|
||||
add_executable(UnitTestsMultiProcess ${TEST_SOURCES_MULTI_PROCESS})
|
||||
target_include_directories(UnitTestsMultiProcess PRIVATE ${ROCM_PATH} ${GTEST_INCLUDE_DIRS})
|
||||
target_link_libraries(UnitTestsMultiProcess PRIVATE ${GTEST_BOTH_LIBRARIES})
|
||||
target_link_libraries(UnitTestsMultiProcess PRIVATE hip::host hip::device)
|
||||
|
||||
find_program( rocminfo_executable rocminfo )
|
||||
execute_process(COMMAND bash "-c" "${rocminfo_executable} | grep 'Device Type' | grep GPU | wc -l | tr -d '\n'" OUTPUT_VARIABLE gtest_num_gpus)
|
||||
if(${gtest_num_gpus} EQUAL "0" OR ${gtest_num_gpus} EQUAL "1")
|
||||
set(gtest_num_gpus "2")
|
||||
endif()
|
||||
target_compile_options(UnitTests PRIVATE -DGTESTS_NUM_GPUS=${gtest_num_gpus})
|
||||
target_link_libraries(UnitTests PRIVATE hip::host hip::device hsa-runtime64::hsa-runtime64)
|
||||
|
||||
# UnitTests using static library of rccl requires passing rccl
|
||||
# through -l and -L instead of command line input.
|
||||
if(BUILD_STATIC)
|
||||
add_dependencies(UnitTests rccl)
|
||||
target_link_libraries(UnitTests PRIVATE dl rt numa -lrccl -L${CMAKE_BINARY_DIR} -lrocm_smi64 -L${ROCM_PATH}/rocm_smi/lib)
|
||||
add_dependencies(UnitTestsMultiProcess rccl)
|
||||
target_link_libraries(UnitTestsMultiProcess PRIVATE dl rt numa -lrccl -L${CMAKE_BINARY_DIR} -lrocm_smi64 -L${ROCM_PATH}/rocm_smi/lib)
|
||||
else()
|
||||
target_link_libraries(UnitTests PRIVATE rccl)
|
||||
target_link_libraries(UnitTestsMultiProcess PRIVATE rt rccl)
|
||||
endif()
|
||||
# HIPCC adds /opt/rocm/lib as RPATH, even though the install process is supposed to
|
||||
# remove RPATH. It also occurs before any user-specified rpath, which effectively overrides the user rpath.
|
||||
@@ -91,10 +107,8 @@ if(BUILD_TESTS)
|
||||
if (CMAKE_INSTALL_PREFIX MATCHES "${ROCM_PATH}")
|
||||
# install_prefix/CMAKE_INSTALL_PREFIX was not explicitly specified, so look in build/release
|
||||
add_custom_command( TARGET UnitTests POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_BINARY_DIR}:${ROCM_PATH}/lib ${CMAKE_BINARY_DIR}/test/UnitTests)
|
||||
add_custom_command( TARGET UnitTestsMultiProcess POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_BINARY_DIR}:${ROCM_PATH}/lib ${CMAKE_BINARY_DIR}/test/UnitTestsMultiProcess)
|
||||
else()
|
||||
add_custom_command( TARGET UnitTests POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_INSTALL_PREFIX}/lib:${ROCM_PATH}/lib ${CMAKE_INSTALL_PREFIX}/test/UnitTests)
|
||||
add_custom_command( TARGET UnitTestsMultiProcess POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_INSTALL_PREFIX}/lib:${ROCM_PATH}/lib ${CMAKE_INSTALL_PREFIX}/test/UnitTestsMultiProcess)
|
||||
endif()
|
||||
else()
|
||||
message("Not building unit tests")
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,26 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "TestBed.hpp"
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
TEST(Gather, InPlace)
|
||||
{
|
||||
TestBed testBed;
|
||||
|
||||
// Configuration
|
||||
std::vector<ncclFunc_t> const funcTypes = {ncclCollGather};
|
||||
std::vector<ncclDataType_t> const dataTypes = {ncclInt8, ncclInt32, ncclInt64};
|
||||
std::vector<ncclRedOp_t> const redOps = {ncclSum};
|
||||
std::vector<int> const roots = {0};
|
||||
std::vector<int> const numElements = {1048576, 53327, 1024};
|
||||
std::vector<bool> const inPlaceList = {true};
|
||||
std::vector<bool> const managedMemList = {false};
|
||||
|
||||
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
|
||||
testBed.Finalize();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,26 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "TestBed.hpp"
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
TEST(Gather, ManagedMem)
|
||||
{
|
||||
TestBed testBed;
|
||||
|
||||
// Configuration
|
||||
std::vector<ncclFunc_t> const funcTypes = {ncclCollGather};
|
||||
std::vector<ncclDataType_t> const dataTypes = {ncclUint8, ncclUint32, ncclUint64};
|
||||
std::vector<ncclRedOp_t> const redOps = {ncclSum};
|
||||
std::vector<int> const roots = {1};
|
||||
std::vector<int> const numElements = {1048576, 53327, 1024};
|
||||
std::vector<bool> const inPlaceList = {false};
|
||||
std::vector<bool> const managedMemList = {true};
|
||||
|
||||
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
|
||||
testBed.Finalize();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,26 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "TestBed.hpp"
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
TEST(Gather, OutOfPlace)
|
||||
{
|
||||
TestBed testBed;
|
||||
|
||||
// Configuration
|
||||
std::vector<ncclFunc_t> const funcTypes = {ncclCollGather};
|
||||
std::vector<ncclDataType_t> const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16};
|
||||
std::vector<ncclRedOp_t> const redOps = {ncclSum};
|
||||
std::vector<int> const roots = {1};
|
||||
std::vector<int> const numElements = {1048576, 53327, 1024};
|
||||
std::vector<bool> const inPlaceList = {false};
|
||||
std::vector<bool> const managedMemList = {false};
|
||||
|
||||
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
|
||||
testBed.Finalize();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,26 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "TestBed.hpp"
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
TEST(ReduceScatter, InPlace)
|
||||
{
|
||||
TestBed testBed;
|
||||
|
||||
// Configuration
|
||||
std::vector<ncclFunc_t> const funcTypes = {ncclCollReduceScatter};
|
||||
std::vector<ncclDataType_t> const dataTypes = {ncclInt8, ncclInt32, ncclInt64};
|
||||
std::vector<ncclRedOp_t> const redOps = {ncclSum, ncclProd};
|
||||
std::vector<int> const roots = {0};
|
||||
std::vector<int> const numElements = {1048576, 53327, 1024};
|
||||
std::vector<bool> const inPlaceList = {true};
|
||||
std::vector<bool> const managedMemList = {false};
|
||||
|
||||
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
|
||||
testBed.Finalize();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,26 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "TestBed.hpp"
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
TEST(ReduceScatter, ManagedMem)
|
||||
{
|
||||
TestBed testBed;
|
||||
|
||||
// Configuration
|
||||
std::vector<ncclFunc_t> const funcTypes = {ncclCollReduceScatter};
|
||||
std::vector<ncclDataType_t> const dataTypes = {ncclUint8, ncclUint32, ncclUint64};
|
||||
std::vector<ncclRedOp_t> const redOps = {ncclSum};
|
||||
std::vector<int> const roots = {0};
|
||||
std::vector<int> const numElements = {1048576, 53327, 1024};
|
||||
std::vector<bool> const inPlaceList = {false};
|
||||
std::vector<bool> const managedMemList = {true};
|
||||
|
||||
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
|
||||
testBed.Finalize();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,26 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "TestBed.hpp"
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
TEST(ReduceScatter, OutOfPlace)
|
||||
{
|
||||
TestBed testBed;
|
||||
|
||||
// Configuration
|
||||
std::vector<ncclFunc_t> const funcTypes = {ncclCollReduceScatter};
|
||||
std::vector<ncclDataType_t> const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16};
|
||||
std::vector<ncclRedOp_t> const redOps = {ncclMin, ncclMax, ncclAvg};
|
||||
std::vector<int> const roots = {0};
|
||||
std::vector<int> const numElements = {1048576, 53327, 1024};
|
||||
std::vector<bool> const inPlaceList = {false};
|
||||
std::vector<bool> const managedMemList = {false};
|
||||
|
||||
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
|
||||
testBed.Finalize();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,26 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "TestBed.hpp"
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
TEST(Reduce, InPlace)
|
||||
{
|
||||
TestBed testBed;
|
||||
|
||||
// Configuration
|
||||
std::vector<ncclFunc_t> const funcTypes = {ncclCollReduce};
|
||||
std::vector<ncclDataType_t> const dataTypes = {ncclInt8, ncclInt32, ncclInt64};
|
||||
std::vector<ncclRedOp_t> const redOps = {ncclSum, ncclProd};
|
||||
std::vector<int> const roots = {0};
|
||||
std::vector<int> const numElements = {1048576, 53327, 1024};
|
||||
std::vector<bool> const inPlaceList = {true};
|
||||
std::vector<bool> const managedMemList = {false};
|
||||
|
||||
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
|
||||
testBed.Finalize();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,26 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "TestBed.hpp"
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
TEST(Reduce, ManagedMem)
|
||||
{
|
||||
TestBed testBed;
|
||||
|
||||
// Configuration
|
||||
std::vector<ncclFunc_t> const funcTypes = {ncclCollReduce};
|
||||
std::vector<ncclDataType_t> const dataTypes = {ncclUint8, ncclUint32, ncclUint64};
|
||||
std::vector<ncclRedOp_t> const redOps = {ncclSum};
|
||||
std::vector<int> const roots = {0};
|
||||
std::vector<int> const numElements = {1048576, 53327, 1024};
|
||||
std::vector<bool> const inPlaceList = {false};
|
||||
std::vector<bool> const managedMemList = {true};
|
||||
|
||||
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
|
||||
testBed.Finalize();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,26 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "TestBed.hpp"
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
TEST(Reduce, OutOfPlace)
|
||||
{
|
||||
TestBed testBed;
|
||||
|
||||
// Configuration
|
||||
std::vector<ncclFunc_t> const funcTypes = {ncclCollReduce};
|
||||
std::vector<ncclDataType_t> const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16};
|
||||
std::vector<ncclRedOp_t> const redOps = {ncclMin, ncclMax, ncclAvg};
|
||||
std::vector<int> const roots = {1};
|
||||
std::vector<int> const numElements = {1048576, 53327, 1024};
|
||||
std::vector<bool> const inPlaceList = {false};
|
||||
std::vector<bool> const managedMemList = {false};
|
||||
|
||||
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
|
||||
testBed.Finalize();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,26 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "TestBed.hpp"
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
TEST(Scatter, InPlace)
|
||||
{
|
||||
TestBed testBed;
|
||||
|
||||
// Configuration
|
||||
std::vector<ncclFunc_t> const funcTypes = {ncclCollScatter};
|
||||
std::vector<ncclDataType_t> const dataTypes = {ncclInt8, ncclInt32, ncclInt64};
|
||||
std::vector<ncclRedOp_t> const redOps = {ncclSum};
|
||||
std::vector<int> const roots = {0};
|
||||
std::vector<int> const numElements = {1048576, 53327, 1024};
|
||||
std::vector<bool> const inPlaceList = {true};
|
||||
std::vector<bool> const managedMemList = {false};
|
||||
|
||||
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
|
||||
testBed.Finalize();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,26 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "TestBed.hpp"
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
TEST(Scatter, ManagedMem)
|
||||
{
|
||||
TestBed testBed;
|
||||
|
||||
// Configuration
|
||||
std::vector<ncclFunc_t> const funcTypes = {ncclCollScatter};
|
||||
std::vector<ncclDataType_t> const dataTypes = {ncclUint8, ncclUint32, ncclUint64};
|
||||
std::vector<ncclRedOp_t> const redOps = {ncclSum};
|
||||
std::vector<int> const roots = {0};
|
||||
std::vector<int> const numElements = {1048576, 53327, 1024};
|
||||
std::vector<bool> const inPlaceList = {false};
|
||||
std::vector<bool> const managedMemList = {true};
|
||||
|
||||
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
|
||||
testBed.Finalize();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,26 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "TestBed.hpp"
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
TEST(Scatter, OutOfPlace)
|
||||
{
|
||||
TestBed testBed;
|
||||
|
||||
// Configuration
|
||||
std::vector<ncclFunc_t> const funcTypes = {ncclCollScatter};
|
||||
std::vector<ncclDataType_t> const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16};
|
||||
std::vector<ncclRedOp_t> const redOps = {ncclSum};
|
||||
std::vector<int> const roots = {1};
|
||||
std::vector<int> const numElements = {1048576, 53327, 1024};
|
||||
std::vector<bool> const inPlaceList = {false};
|
||||
std::vector<bool> const managedMemList = {false};
|
||||
|
||||
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
|
||||
testBed.Finalize();
|
||||
}
|
||||
}
|
||||
@@ -1,63 +0,0 @@
|
||||
#ifndef TESTCHECKS_HPP
|
||||
#define TESTCHECKS_HPP
|
||||
|
||||
#define HIP_CALL(x) ASSERT_EQ(x, hipSuccess)
|
||||
#define NCCL_CALL(x) ASSERT_EQ(x, ncclSuccess)
|
||||
|
||||
#define SYSCHECK_TEST(call, name) do { \
|
||||
int retval; \
|
||||
SYSCHECKVAL_TEST(call, name, retval); \
|
||||
} while (false)
|
||||
|
||||
#define SYSCHECKVAL_TEST(call, name, retval) do { \
|
||||
SYSCHECKSYNC_TEST(call, name, retval); \
|
||||
if (retval == -1) { \
|
||||
printf("Call to %s failed : %s\n", name, strerror(errno)); \
|
||||
fflush(stdout); \
|
||||
return ncclSystemError; \
|
||||
} \
|
||||
} while (false)
|
||||
|
||||
#define SYSCHECK_GOTO_TEST(call, name, label) do { \
|
||||
int retval; \
|
||||
SYSCHECKVAL_GOTO_TEST(call, name, retval, label); \
|
||||
} while (false)
|
||||
|
||||
#define SYSCHECKVAL_GOTO_TEST(call, name, retval, label) do { \
|
||||
SYSCHECKSYNC_TEST(call, name, retval); \
|
||||
if (retval == -1) { \
|
||||
printf("Call to %s failed : %s\n", name, strerror(errno)); \
|
||||
fflush(stdout); \
|
||||
goto label; \
|
||||
} \
|
||||
} while (false)
|
||||
|
||||
#define SYSCHECKSYNC_TEST(call, name, retval) do { \
|
||||
retval = call; \
|
||||
if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
|
||||
} else { \
|
||||
break; \
|
||||
} \
|
||||
} while(true)
|
||||
|
||||
#define NCCLCHECK_BARRIER_TEST(call, name, rank) do { \
|
||||
ncclResult_t retval; \
|
||||
retval = call; \
|
||||
if (retval != ncclSuccess) { \
|
||||
printf("Rank %d call to %s failed : %s\n", rank, name, strerror(errno)); \
|
||||
fflush(stdout); \
|
||||
return; \
|
||||
} \
|
||||
} while (false)
|
||||
|
||||
#define NCCLCHECK_TEST(call, name) do { \
|
||||
ncclResult_t retval; \
|
||||
retval = call; \
|
||||
if (retval != ncclSuccess) { \
|
||||
printf("Call to %s failed : %s\n", name, strerror(errno)); \
|
||||
fflush(stdout); \
|
||||
return retval; \
|
||||
} \
|
||||
} while (false)
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,282 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "CollectiveArgs.hpp"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
ErrCode CollectiveArgs::SetArgs(int const globalRank,
|
||||
int const totalRanks,
|
||||
int const deviceId,
|
||||
ncclFunc_t const funcType,
|
||||
ncclDataType_t const dataType,
|
||||
ncclRedOp_t const redOp,
|
||||
int const root,
|
||||
size_t const numInputElements,
|
||||
size_t const numOutputElements,
|
||||
ScalarTransport const scalarTransport,
|
||||
int const scalarMode)
|
||||
{
|
||||
// Free scalar based on previous scalarMode
|
||||
if (scalarMode != -1)
|
||||
{
|
||||
if (this->localScalar.ptr != nullptr)
|
||||
{
|
||||
if (this->scalarMode == 0) this->localScalar.FreeGpuMem();
|
||||
if (this->scalarMode == 1) hipHostFree(this->localScalar.ptr);
|
||||
}
|
||||
}
|
||||
|
||||
this->globalRank = globalRank;
|
||||
this->totalRanks = totalRanks;
|
||||
this->deviceId = deviceId;
|
||||
this->funcType = funcType;
|
||||
this->dataType = dataType;
|
||||
this->redOp = redOp;
|
||||
this->root = root;
|
||||
this->numInputElements = numInputElements;
|
||||
this->numOutputElements = numOutputElements;
|
||||
this->scalarTransport = scalarTransport;
|
||||
this->scalarMode = scalarMode;
|
||||
|
||||
if (scalarMode != -1)
|
||||
{
|
||||
size_t const numBytes = DataTypeToBytes(dataType);
|
||||
if (scalarMode == ncclScalarDevice)
|
||||
{
|
||||
CHECK_CALL(this->localScalar.AllocateGpuMem(numBytes));
|
||||
CHECK_HIP(hipMemcpy(this->localScalar.ptr, scalarTransport.ptr + (globalRank * numBytes),
|
||||
numBytes, hipMemcpyHostToDevice));
|
||||
}
|
||||
else if (scalarMode == ncclScalarHostImmediate)
|
||||
{
|
||||
CHECK_HIP(hipHostMalloc(&this->localScalar.ptr, numBytes, 0));
|
||||
memcpy(this->localScalar.ptr, scalarTransport.ptr + (globalRank * numBytes), numBytes);
|
||||
}
|
||||
}
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
ErrCode CollectiveArgs::AllocateMem(bool const inPlace,
|
||||
bool const useManagedMem)
|
||||
{
|
||||
this->numInputBytesAllocated = this->numInputElements * DataTypeToBytes(this->dataType);
|
||||
this->numOutputBytesAllocated = this->numOutputElements * DataTypeToBytes(this->dataType);
|
||||
this->numInputElementsAllocated = this->numInputElements;
|
||||
this->numOutputElementsAllocated = this->numOutputElements;
|
||||
this->inPlace = inPlace;
|
||||
this->useManagedMem = useManagedMem;
|
||||
|
||||
if (hipSetDevice(this->deviceId) != hipSuccess)
|
||||
{
|
||||
ERROR("Unable to call hipSetDevice to set to GPU %d\n", this->deviceId);
|
||||
return TEST_FAIL;
|
||||
}
|
||||
|
||||
if (inPlace)
|
||||
{
|
||||
if (this->funcType == ncclCollScatter)
|
||||
{
|
||||
CHECK_CALL(this->inputGpu.AllocateGpuMem(this->numInputBytesAllocated, useManagedMem));
|
||||
this->outputGpu.Attach(this->inputGpu.U1 + (this->globalRank * this->numOutputBytesAllocated));
|
||||
}
|
||||
else if (this->funcType == ncclCollGather)
|
||||
{
|
||||
CHECK_CALL(this->outputGpu.AllocateGpuMem(this->numOutputBytesAllocated, useManagedMem));
|
||||
this->inputGpu.Attach(this->outputGpu.U1 + (this->globalRank * this->numInputBytesAllocated));
|
||||
}
|
||||
else
|
||||
{
|
||||
size_t const numBytes = std::max(this->numInputBytesAllocated, this->numOutputBytesAllocated);
|
||||
CHECK_CALL(this->inputGpu.AllocateGpuMem(numBytes, useManagedMem));
|
||||
this->outputGpu.Attach(this->inputGpu.ptr);
|
||||
}
|
||||
CHECK_CALL(this->expected.AllocateCpuMem(this->numOutputBytesAllocated));
|
||||
}
|
||||
else
|
||||
{
|
||||
CHECK_CALL(this->inputGpu.AllocateGpuMem(this->numInputBytesAllocated, useManagedMem));
|
||||
CHECK_CALL(this->outputGpu.AllocateGpuMem(this->numOutputBytesAllocated, useManagedMem));
|
||||
CHECK_CALL(this->expected.AllocateCpuMem(this->numOutputBytesAllocated));
|
||||
}
|
||||
CHECK_CALL(this->outputCpu.AllocateCpuMem(this->numOutputBytesAllocated));
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
ErrCode CollectiveArgs::PrepareData(CollFuncPtr const prepareDataFunc)
|
||||
{
|
||||
CollFuncPtr prepFunc = (prepareDataFunc == nullptr ? DefaultPrepareDataFunc : prepareDataFunc);
|
||||
return prepFunc(*this);
|
||||
}
|
||||
|
||||
ErrCode CollectiveArgs::ValidateResults()
|
||||
{
|
||||
// Ignore non-root outputs for collectives with a root
|
||||
if (CollectiveArgs::UsesRoot(this->funcType) && this->root != this->globalRank) return TEST_SUCCESS;
|
||||
|
||||
size_t const numOutputBytes = (this->numOutputElements * DataTypeToBytes(this->dataType));
|
||||
|
||||
CHECK_HIP(hipMemcpy(this->outputCpu.ptr, this->outputGpu.ptr, numOutputBytes, hipMemcpyDeviceToHost));
|
||||
|
||||
bool isMatch = true;
|
||||
CHECK_CALL(this->outputCpu.IsEqual(this->dataType,
|
||||
this->numOutputElements,
|
||||
this->expected,
|
||||
true,
|
||||
isMatch));
|
||||
if (!isMatch) ERROR("Mismatch for %s\n", this->GetDescription().c_str());
|
||||
return isMatch ? TEST_SUCCESS : TEST_FAIL;
|
||||
}
|
||||
|
||||
ErrCode CollectiveArgs::DeallocateMem()
|
||||
{
|
||||
// If in-place, either only inputGpu or outputGpu was allocated
|
||||
if (this->inPlace)
|
||||
{
|
||||
if (this->funcType == ncclCollGather)
|
||||
this->outputGpu.FreeGpuMem();
|
||||
else
|
||||
this->inputGpu.FreeGpuMem();
|
||||
}
|
||||
else
|
||||
{
|
||||
this->inputGpu.FreeGpuMem();
|
||||
this->outputGpu.FreeGpuMem();
|
||||
}
|
||||
|
||||
this->outputCpu.FreeCpuMem();
|
||||
this->expected.FreeCpuMem();
|
||||
|
||||
if (this->localScalar.ptr != nullptr)
|
||||
{
|
||||
if (this->scalarMode == 0) this->localScalar.FreeGpuMem();
|
||||
if (this->scalarMode == 1) CHECK_HIP(hipHostFree(this->localScalar.ptr));
|
||||
}
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
std::string CollectiveArgs::GetDescription() const
|
||||
{
|
||||
std::stringstream ss;
|
||||
|
||||
ss << "(Rank " << this->globalRank << ") ";
|
||||
switch (this->funcType)
|
||||
{
|
||||
case ncclCollBroadcast: ss << "ncclBroadcast"; break;
|
||||
case ncclCollReduce: ss << "ncclReduce"; break;
|
||||
case ncclCollAllGather: ss << "ncclAllGather"; break;
|
||||
case ncclCollReduceScatter: ss << "ncclReduceScatter"; break;
|
||||
case ncclCollAllReduce: ss << "ncclAllReduce"; break;
|
||||
case ncclCollGather: ss << "ncclGather"; break;
|
||||
case ncclCollScatter: ss << "ncclScatter"; break;
|
||||
case ncclCollAllToAll: ss << "ncclAllToAll"; break;
|
||||
case ncclCollSend: ss << "ncclSend"; break;
|
||||
case ncclCollRecv: ss << "ncclRevv"; break;
|
||||
default: ss << "[Unknown]"; break;
|
||||
}
|
||||
|
||||
ss << " " << ncclDataTypeNames[this->dataType] << " ";
|
||||
if (this->funcType == ncclCollReduce ||
|
||||
this->funcType == ncclCollReduceScatter ||
|
||||
this->funcType == ncclCollAllReduce)
|
||||
{
|
||||
if (this->redOp < ncclNumOps)
|
||||
{
|
||||
ss << ncclRedOpNames[this->redOp] << " ";
|
||||
}
|
||||
else
|
||||
{
|
||||
ss << "CustomScalar ";
|
||||
PtrUnion scalarsPerRank;
|
||||
scalarsPerRank.Attach(scalarsPerRank.ptr);
|
||||
switch (this->dataType)
|
||||
{
|
||||
case ncclInt8: ss << scalarsPerRank.I1[this->globalRank]; break;
|
||||
case ncclUint8: ss << scalarsPerRank.U1[this->globalRank]; break;
|
||||
case ncclInt32: ss << scalarsPerRank.I4[this->globalRank]; break;
|
||||
case ncclUint32: ss << scalarsPerRank.U4[this->globalRank]; break;
|
||||
case ncclInt64: ss << scalarsPerRank.I8[this->globalRank]; break;
|
||||
case ncclUint64: ss << scalarsPerRank.U8[this->globalRank]; break;
|
||||
case ncclFloat32: ss << scalarsPerRank.F4[this->globalRank]; break;
|
||||
case ncclFloat64: ss << scalarsPerRank.F8[this->globalRank]; break;
|
||||
case ncclBfloat16: ss << scalarsPerRank.B2[this->globalRank]; break;
|
||||
default: ss << "(UNKNOWN)";
|
||||
}
|
||||
ss << " ";
|
||||
}
|
||||
}
|
||||
|
||||
if (this->funcType == ncclCollBroadcast ||
|
||||
this->funcType == ncclCollReduce ||
|
||||
this->funcType == ncclCollGather ||
|
||||
this->funcType == ncclCollScatter)
|
||||
{
|
||||
ss << "Root " << this->root << " ";
|
||||
}
|
||||
|
||||
if (this->funcType == ncclCollSend ||
|
||||
this->funcType == ncclCollRecv)
|
||||
{
|
||||
ss << "Peer " << this->root << " ";
|
||||
}
|
||||
|
||||
ss << "#In: " << this->numInputElements;
|
||||
ss << " #Out: " << this->numOutputElements;
|
||||
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
void CollectiveArgs::GetNumElementsForFuncType(ncclFunc_t const funcType,
|
||||
int const N,
|
||||
int const totalRanks,
|
||||
int* numInputElements,
|
||||
int* numOutputElements)
|
||||
{
|
||||
switch (funcType)
|
||||
{
|
||||
case ncclCollBroadcast:
|
||||
case ncclCollReduce:
|
||||
case ncclCollAllReduce:
|
||||
*numInputElements = N;
|
||||
*numOutputElements = N;
|
||||
break;
|
||||
case ncclCollGather:
|
||||
case ncclCollAllGather:
|
||||
*numInputElements = N;
|
||||
*numOutputElements = totalRanks * N;
|
||||
break;
|
||||
case ncclCollScatter:
|
||||
case ncclCollReduceScatter:
|
||||
*numInputElements = totalRanks * N;
|
||||
*numOutputElements = N;
|
||||
break;
|
||||
case ncclCollAllToAll:
|
||||
*numInputElements = totalRanks * N;
|
||||
*numOutputElements = totalRanks * N;
|
||||
break;
|
||||
default:
|
||||
*numInputElements = N;
|
||||
*numOutputElements = N;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
bool CollectiveArgs::UsesReduce(ncclFunc_t const funcType)
|
||||
{
|
||||
return (funcType == ncclCollReduce ||
|
||||
funcType == ncclCollAllReduce ||
|
||||
funcType == ncclCollReduceScatter);
|
||||
}
|
||||
|
||||
bool CollectiveArgs::UsesRoot(ncclFunc_t const funcType)
|
||||
{
|
||||
return (funcType == ncclCollBroadcast ||
|
||||
funcType == ncclCollReduce ||
|
||||
funcType == ncclCollGather ||
|
||||
funcType == ncclCollScatter);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,151 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
b
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#pragma once
|
||||
#include "PtrUnion.hpp"
|
||||
#include "PrepDataFuncs.hpp"
|
||||
#include "rccl.h"
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
// Enumeration of all collective functions currently supported
|
||||
typedef enum
|
||||
{
|
||||
ncclCollBroadcast = 0,
|
||||
ncclCollReduce,
|
||||
ncclCollAllGather,
|
||||
ncclCollReduceScatter,
|
||||
ncclCollAllReduce,
|
||||
ncclCollGather,
|
||||
ncclCollScatter,
|
||||
ncclCollAllToAll,
|
||||
ncclCollSend,
|
||||
ncclCollRecv,
|
||||
ncclNumFuncs
|
||||
} ncclFunc_t;
|
||||
|
||||
char const ncclFuncNames[ncclNumFuncs][32] =
|
||||
{
|
||||
"Broadcast",
|
||||
"Reduce",
|
||||
"AllGather",
|
||||
"ReduceScatter",
|
||||
"AllReduce",
|
||||
"Gather",
|
||||
"Scatter",
|
||||
"AllToAll",
|
||||
"Send",
|
||||
"Recv"
|
||||
};
|
||||
|
||||
char const ncclDataTypeNames[ncclNumTypes][32] =
|
||||
{
|
||||
"ncclInt8",
|
||||
"ncclUint8",
|
||||
"ncclInt32",
|
||||
"ncclUint32",
|
||||
"ncclInt64",
|
||||
"ncclUint64",
|
||||
"ncclFloat16",
|
||||
"ncclFloat32",
|
||||
"ncclFloat64",
|
||||
"ncclBfloat16"
|
||||
};
|
||||
|
||||
char const ncclRedOpNames[ncclNumOps][32] =
|
||||
{
|
||||
"sum",
|
||||
"prod",
|
||||
"max",
|
||||
"min",
|
||||
"avg"
|
||||
};
|
||||
|
||||
class CollectiveArgs;
|
||||
|
||||
#define MAX_RANKS 32
|
||||
struct ScalarTransport
|
||||
{
|
||||
char ptr[MAX_RANKS * sizeof(double)];
|
||||
};
|
||||
|
||||
// Function pointer for functions that operate on CollectiveArgs
|
||||
// e.g. For filling input / computing expected results
|
||||
typedef ErrCode (*CollFuncPtr)(CollectiveArgs &);
|
||||
|
||||
class CollectiveArgs
|
||||
{
|
||||
public:
|
||||
// Arguments to execute
|
||||
int globalRank;
|
||||
int totalRanks;
|
||||
int deviceId;
|
||||
ncclFunc_t funcType;
|
||||
ncclDataType_t dataType;
|
||||
ncclRedOp_t redOp;
|
||||
int root; // Used as "peer" for Send/Recv
|
||||
size_t numInputElements;
|
||||
size_t numOutputElements;
|
||||
ScalarTransport scalarTransport; // Used for custom reduction operators
|
||||
PtrUnion localScalar;
|
||||
int scalarMode; // -1 if scalar not used
|
||||
|
||||
// Data
|
||||
PtrUnion inputGpu;
|
||||
PtrUnion outputGpu;
|
||||
PtrUnion outputCpu;
|
||||
PtrUnion expected;
|
||||
bool inPlace;
|
||||
bool useManagedMem;
|
||||
size_t numInputBytesAllocated;
|
||||
size_t numOutputBytesAllocated;
|
||||
size_t numInputElementsAllocated;
|
||||
size_t numOutputElementsAllocated;
|
||||
|
||||
// Set collective arguments
|
||||
ErrCode SetArgs(int const globalRank,
|
||||
int const totalRanks,
|
||||
int const deviceId,
|
||||
ncclFunc_t const funcType,
|
||||
ncclDataType_t const dataType,
|
||||
ncclRedOp_t const redOp,
|
||||
int const root,
|
||||
size_t const numInputElements,
|
||||
size_t const numOutputElements,
|
||||
ScalarTransport const scalarsPerRank,
|
||||
int const scalarMode = -1);
|
||||
|
||||
// Allocates GPU memory for input/output and CPU memory for expected
|
||||
// When inPlace is true, input and output share the same memory
|
||||
ErrCode AllocateMem(bool const inPlace,
|
||||
bool const useManagedMem);
|
||||
|
||||
// Execute the provided data preparation function to fill input and compute expected results
|
||||
ErrCode PrepareData(CollFuncPtr const prepareDataFunc);
|
||||
|
||||
// Compare outputs to expected values
|
||||
ErrCode ValidateResults();
|
||||
|
||||
// Deallocate memory
|
||||
ErrCode DeallocateMem();
|
||||
|
||||
// Provide a description for the current collective arguments
|
||||
std::string GetDescription() const;
|
||||
|
||||
// Returns the number of inputs/outputs based on collective function type
|
||||
static void GetNumElementsForFuncType(ncclFunc_t const funcType,
|
||||
int const N,
|
||||
int const totalRanks,
|
||||
int* numInputElements,
|
||||
int* numOutputElements);
|
||||
|
||||
// Returns true if collective function performs reduction
|
||||
static bool UsesReduce(ncclFunc_t const funcType);
|
||||
|
||||
// Returns true if collective function utilizes a root rank
|
||||
static bool UsesRoot(ncclFunc_t const funcType);
|
||||
};
|
||||
}
|
||||
@@ -0,0 +1,161 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "EnvVars.hpp"
|
||||
#include "CollectiveArgs.hpp"
|
||||
#include <cstdlib>
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
int const UT_SINGLE_PROCESS = (1<<0);
|
||||
int const UT_MULTI_PROCESS = (1<<1);
|
||||
|
||||
hsa_status_t CountGpus(hsa_agent_t agent, void* data)
|
||||
{
|
||||
int* currCount = (int*)data;
|
||||
hsa_device_type_t device;
|
||||
hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device);
|
||||
if (device == HSA_DEVICE_TYPE_GPU)
|
||||
*currCount = *currCount + 1;
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
EnvVars::EnvVars()
|
||||
{
|
||||
// Collect number of GPUs available
|
||||
// NOTE: Cannot use HIP call prior to launching child processes via fork so use HSA
|
||||
int numDevicesAvailable = 0;
|
||||
hsa_init();
|
||||
hsa_iterate_agents(CountGpus, &numDevicesAvailable);
|
||||
hsa_shut_down();
|
||||
|
||||
showNames = GetEnvVar("UT_SHOW_NAMES" , 1);
|
||||
minGpus = GetEnvVar("UT_MIN_GPUS" , 2);
|
||||
maxGpus = GetEnvVar("UT_MAX_GPUS" , numDevicesAvailable);
|
||||
processMask = GetEnvVar("UT_PROCESS_MASK", UT_SINGLE_PROCESS | UT_MULTI_PROCESS);
|
||||
verbose = GetEnvVar("UT_VERBOSE" , 0);
|
||||
printValues = GetEnvVar("UT_PRINT_VALUES", 0);
|
||||
|
||||
// Limit number of supported reduction operators to just ncclSum if only allReduce is built
|
||||
#ifdef BUILD_ALLREDUCE_ONLY
|
||||
int numOps = 1;
|
||||
#else
|
||||
int numOps = ncclNumOps;
|
||||
#endif
|
||||
std::vector<std::string> redOpStrings = GetEnvVarsList("UT_REDOPS");
|
||||
for (auto s : redOpStrings)
|
||||
{
|
||||
for (int i = 0; i < numOps; ++i)
|
||||
{
|
||||
if (!strcmp(s.c_str(), ncclRedOpNames[i]))
|
||||
{
|
||||
redOps.push_back((ncclRedOp_t)i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Default back to all ops if no strings are found
|
||||
if (redOps.empty())
|
||||
{
|
||||
for (int i = 0; i < numOps; i++)
|
||||
redOps.push_back((ncclRedOp_t)i);
|
||||
}
|
||||
|
||||
// Limit number of supported datatypes if only allReduce is built
|
||||
std::vector<std::string> dtStrings = GetEnvVarsList("UT_DATATYPES");
|
||||
for (auto s : dtStrings)
|
||||
{
|
||||
for (int i = 0; i < ncclNumTypes; ++i)
|
||||
{
|
||||
if (!strcmp(s.c_str(), ncclDataTypeNames[i]))
|
||||
{
|
||||
#ifdef BUILD_ALLREDUCE_ONLY
|
||||
if (i == ncclFloat32)
|
||||
#endif
|
||||
{
|
||||
dataTypes.push_back((ncclDataType_t)i);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Default option if no valid datatypes are found in env var
|
||||
if (dataTypes.empty())
|
||||
{
|
||||
dataTypes.push_back(ncclFloat32);
|
||||
// Skip all but 32-bit floats if only AllReduce is being built
|
||||
#ifndef BUILD_ALLREDUCE_ONLY
|
||||
dataTypes.push_back(ncclInt8);
|
||||
dataTypes.push_back(ncclUint8);
|
||||
dataTypes.push_back(ncclInt32);
|
||||
dataTypes.push_back(ncclUint32);
|
||||
dataTypes.push_back(ncclInt64);
|
||||
dataTypes.push_back(ncclUint64);
|
||||
// Half-precision floats disabled due to lack of host-side support
|
||||
// dataTypes.push_back(ncclFloat16);
|
||||
dataTypes.push_back(ncclFloat32);
|
||||
dataTypes.push_back(ncclFloat64);
|
||||
dataTypes.push_back(ncclBfloat16);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<ncclRedOp_t> const& EnvVars::GetAllSupportedRedOps()
|
||||
{
|
||||
return redOps;
|
||||
}
|
||||
|
||||
std::vector<ncclDataType_t> const& EnvVars::GetAllSupportedDataTypes()
|
||||
{
|
||||
return dataTypes;
|
||||
}
|
||||
|
||||
int EnvVars::GetEnvVar(std::string const varname, int defaultValue)
|
||||
{
|
||||
if (getenv(varname.c_str()))
|
||||
return atoi(getenv(varname.c_str()));
|
||||
return defaultValue;
|
||||
};
|
||||
|
||||
std::vector<std::string> EnvVars::GetEnvVarsList(std::string const varname)
|
||||
{
|
||||
std::vector<std::string> result;
|
||||
if (getenv(varname.c_str()))
|
||||
{
|
||||
char* token = strtok(getenv(varname.c_str()), ",;");
|
||||
while (token != NULL)
|
||||
{
|
||||
result.push_back(token);
|
||||
token = strtok(NULL, ",;");
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
void EnvVars::ShowConfig()
|
||||
{
|
||||
std::vector<std::pair<std::string, std::string>> supported =
|
||||
{
|
||||
std::make_pair("UT_SHOW_NAMES" , "Show test case names"),
|
||||
std::make_pair("UT_MIN_GPUS" , "Minimum number of GPUs to use"),
|
||||
std::make_pair("UT_MAX_GPUS" , "Maximum number of GPUs to use"),
|
||||
std::make_pair("UT_PROCESS_MASK", "Whether to run single/multi process"),
|
||||
std::make_pair("UT_VERBOSE" , "Show verbose unit test output"),
|
||||
std::make_pair("UT_REDOPS" , "List of reduction ops to test"),
|
||||
std::make_pair("UT_DATATYPES" , "List of datatypes to test"),
|
||||
std::make_pair("UT_PRINT_VALUES", "Print array values (# of values to print, < 0 for all)")
|
||||
};
|
||||
|
||||
printf("================================================================================\n");
|
||||
printf(" Environment variables:\n");
|
||||
for (auto p : supported)
|
||||
{
|
||||
printf(" - %-20s %-40s %s\n", p.first.c_str(), p.second.c_str(),
|
||||
getenv(p.first.c_str()) ? getenv(p.first.c_str()) : "<unset>");
|
||||
}
|
||||
printf("================================================================================\n");
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,44 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#pragma once
|
||||
#include <hsa/hsa.h>
|
||||
#include <vector>
|
||||
#include "rccl.h"
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
// Helper function to count the number of GPUs on system
|
||||
static hsa_status_t CountGpus(hsa_agent_t agent, void* data);
|
||||
|
||||
// Helper class to track environment variables that affect the unit tests
|
||||
class EnvVars
|
||||
{
|
||||
public:
|
||||
bool showNames; // List test case names during run [UT_SHOW_NAMES]
|
||||
int minGpus; // Set the minimum number of GPUs to use [UT_MIN_GPUS]
|
||||
int maxGpus; // Set the maximum number of GPUs to use [UT_MAX_GPUS]
|
||||
int processMask; // Filter single/multi process [UT_PROCESS_MASK]
|
||||
bool verbose; // Show verbose TestBed output for debug [UT_VERBOSE]
|
||||
int printValues; // Print out input/output/expected arrays [UT_PRINT_VALUES]
|
||||
|
||||
// Constructor that parses and collects environment variables
|
||||
EnvVars();
|
||||
|
||||
std::vector<ncclRedOp_t> const& GetAllSupportedRedOps();
|
||||
std::vector<ncclDataType_t> const& GetAllSupportedDataTypes();
|
||||
|
||||
static void ShowConfig();
|
||||
|
||||
protected:
|
||||
std::vector<ncclRedOp_t> redOps; // Supported reduction ops [UT_REDOPS]
|
||||
std::vector<ncclDataType_t> dataTypes; // Support datatypes [UT_DATATYPES]
|
||||
|
||||
// Helper functions to parse environment variables
|
||||
int GetEnvVar(std::string const varname, int defaultValue);
|
||||
std::vector<std::string> GetEnvVarsList(std::string const varname);
|
||||
};
|
||||
}
|
||||
@@ -0,0 +1,38 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#pragma once
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
typedef enum
|
||||
{
|
||||
TEST_SUCCESS = 0,
|
||||
TEST_FAIL = 1
|
||||
} ErrCode;
|
||||
|
||||
#define ERROR(...) printf("\033[0;31m" "[ ERROR ] " "\033[0m" __VA_ARGS__)
|
||||
#define INFO(...) printf("[ INFO ] " __VA_ARGS__)
|
||||
|
||||
#define CHECK_CALL(func) \
|
||||
{ \
|
||||
ErrCode status = func; \
|
||||
if (status != TEST_SUCCESS) \
|
||||
{ \
|
||||
ERROR("Error in call %s\n", #func); \
|
||||
return status; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define CHECK_HIP(func) \
|
||||
{ \
|
||||
hipError_t error = (func); \
|
||||
if (error != hipSuccess) \
|
||||
{ \
|
||||
fprintf(stderr, "\033[0;33" "[ ERROR ] HIP error: %s\n" "\033[m", hipGetErrorString(error)); \
|
||||
return TEST_FAIL; \
|
||||
} \
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,342 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "CollectiveArgs.hpp"
|
||||
#include "PrepDataFuncs.hpp"
|
||||
#include <cstdio>
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
ErrCode DefaultPrepareDataFunc(CollectiveArgs &collArgs)
|
||||
{
|
||||
switch (collArgs.funcType)
|
||||
{
|
||||
case ncclCollBroadcast: return DefaultPrepData_Broadcast(collArgs);
|
||||
case ncclCollReduce: return DefaultPrepData_Reduce(collArgs, false);
|
||||
case ncclCollAllGather: return DefaultPrepData_Gather(collArgs, true);
|
||||
case ncclCollReduceScatter: return DefaultPrepData_ReduceScatter(collArgs);
|
||||
case ncclCollAllReduce: return DefaultPrepData_Reduce(collArgs, true);
|
||||
case ncclCollGather: return DefaultPrepData_Gather(collArgs, false);
|
||||
case ncclCollScatter: return DefaultPrepData_Scatter(collArgs);
|
||||
case ncclCollAllToAll: return DefaultPrepData_AllToAll(collArgs);
|
||||
//case ncclCollSendRecv: return DefaultPrepData_SendRecv(collArgs);
|
||||
default:
|
||||
ERROR("Unknown func type %d\n", collArgs.funcType);
|
||||
return TEST_FAIL;
|
||||
}
|
||||
}
|
||||
|
||||
ErrCode CheckAllocation(CollectiveArgs const& collArgs)
|
||||
{
|
||||
if (collArgs.numInputElements > collArgs.numInputElementsAllocated)
|
||||
{
|
||||
ERROR("Number of input elements (%lu) exceeds the number of allocated input elements (%lu)\n",
|
||||
collArgs.numInputElements, collArgs.numInputElementsAllocated);
|
||||
return TEST_FAIL;
|
||||
}
|
||||
|
||||
if (collArgs.numOutputElements > collArgs.numOutputElementsAllocated)
|
||||
{
|
||||
ERROR("Number of output elements (%lu) exceeds the number of allocated output elements (%lu)\n",
|
||||
collArgs.numOutputElements, collArgs.numOutputElementsAllocated);
|
||||
return TEST_FAIL;
|
||||
}
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
ErrCode DefaultPrepData_Broadcast(CollectiveArgs &collArgs)
|
||||
{
|
||||
CHECK_CALL(CheckAllocation(collArgs));
|
||||
if (collArgs.numInputElements != collArgs.numOutputElements)
|
||||
{
|
||||
ERROR("Number of input elements must match number of output elements for Broadcast\n");
|
||||
return TEST_FAIL;
|
||||
}
|
||||
|
||||
size_t const numBytes = collArgs.numInputElements * DataTypeToBytes(collArgs.dataType);
|
||||
|
||||
// Clear output for all ranks (done before filling input in case of in-place)
|
||||
CHECK_CALL(collArgs.outputGpu.ClearGpuMem(numBytes));
|
||||
|
||||
// Only root needs input pattern
|
||||
if (collArgs.globalRank == collArgs.root)
|
||||
CHECK_CALL(collArgs.inputGpu.FillPattern(collArgs.dataType,
|
||||
collArgs.numInputElements,
|
||||
collArgs.root, true));
|
||||
|
||||
// Otherwise all other ranks expected output is the same as input of root
|
||||
return collArgs.expected.FillPattern(collArgs.dataType,
|
||||
collArgs.numInputElements,
|
||||
collArgs.root,
|
||||
false);
|
||||
}
|
||||
|
||||
ErrCode DefaultPrepData_Reduce(CollectiveArgs &collArgs, bool const isAllReduce)
|
||||
{
|
||||
CHECK_CALL(CheckAllocation(collArgs));
|
||||
if (collArgs.numInputElements != collArgs.numOutputElements)
|
||||
{
|
||||
ERROR("Number of input elements must match number of output elements for Reduce\n");
|
||||
return TEST_FAIL;
|
||||
}
|
||||
|
||||
size_t const numBytes = collArgs.numInputElements * DataTypeToBytes(collArgs.dataType);
|
||||
|
||||
// Clear output for all ranks (done before filling input in case of in-place)
|
||||
CHECK_CALL(collArgs.outputGpu.ClearGpuMem(numBytes));
|
||||
|
||||
// Clear expected buffer for holding reduction
|
||||
PtrUnion result;
|
||||
CHECK_CALL(result.Attach(collArgs.expected));
|
||||
CHECK_CALL(result.ClearCpuMem(numBytes));
|
||||
|
||||
// If average or custom reduction operator is used, perform a summation instead
|
||||
ncclRedOp_t const tempOp = (collArgs.redOp >= ncclAvg ? ncclSum : collArgs.redOp);
|
||||
|
||||
// Loop over each rank and generate their input into a temp buffer, then reduce
|
||||
PtrUnion scalarsPerRank;
|
||||
scalarsPerRank.Attach(collArgs.scalarTransport.ptr);
|
||||
|
||||
PtrUnion tempInputCpu;
|
||||
CHECK_CALL(tempInputCpu.Attach(collArgs.outputCpu));
|
||||
for (int rank = 0; rank < collArgs.totalRanks; ++rank)
|
||||
{
|
||||
// Generate temporary input for this rank
|
||||
CHECK_CALL(tempInputCpu.FillPattern(collArgs.dataType, collArgs.numInputElements, rank, false));
|
||||
|
||||
// Copy the pre-scaled input into GPU memory for the correct rank
|
||||
if (rank == collArgs.globalRank)
|
||||
{
|
||||
CHECK_HIP(hipMemcpy(collArgs.inputGpu.ptr, tempInputCpu.ptr, numBytes, hipMemcpyHostToDevice));
|
||||
}
|
||||
|
||||
// Scale the temporary input by local scalar for this rank
|
||||
// (Used by custom reduction ops)
|
||||
if (collArgs.scalarMode >= 0)
|
||||
{
|
||||
CHECK_CALL(tempInputCpu.Scale(collArgs.dataType, collArgs.numInputElements,
|
||||
scalarsPerRank, rank));
|
||||
}
|
||||
|
||||
// Any rank that requires output reduces the scaled-inputs
|
||||
if (isAllReduce || collArgs.root == collArgs.globalRank)
|
||||
{
|
||||
if (rank == 0)
|
||||
{
|
||||
memcpy(result.ptr, tempInputCpu.ptr, numBytes);
|
||||
}
|
||||
else
|
||||
{
|
||||
CHECK_CALL(result.Reduce(collArgs.dataType, collArgs.numInputElements,
|
||||
tempInputCpu, tempOp));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Perform averaging if necessary
|
||||
if (collArgs.redOp == ncclAvg && (isAllReduce || collArgs.root == collArgs.globalRank))
|
||||
{
|
||||
CHECK_CALL(result.DivideByInt(collArgs.dataType, collArgs.numInputElements, collArgs.totalRanks));
|
||||
}
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
ErrCode DefaultPrepData_Gather(CollectiveArgs &collArgs, bool const isAllGather)
|
||||
{
|
||||
CHECK_CALL(CheckAllocation(collArgs));
|
||||
if (collArgs.totalRanks * collArgs.numInputElements != collArgs.numOutputElements)
|
||||
{
|
||||
ERROR("# of output elements must be total ranks * # input elements for AllGather\n");
|
||||
return TEST_FAIL;
|
||||
}
|
||||
|
||||
// Clear output for all ranks (done before filling input in case of in-place)
|
||||
size_t const numInputBytes = collArgs.numInputElements * DataTypeToBytes(collArgs.dataType);
|
||||
size_t const numOutputBytes = collArgs.numOutputElements * DataTypeToBytes(collArgs.dataType);
|
||||
CHECK_CALL(collArgs.inputGpu.ClearGpuMem(numInputBytes));
|
||||
CHECK_CALL(collArgs.outputGpu.ClearGpuMem(numOutputBytes));
|
||||
|
||||
PtrUnion result;
|
||||
CHECK_CALL(result.Attach(collArgs.expected.ptr));
|
||||
CHECK_CALL(result.ClearCpuMem(numOutputBytes));
|
||||
|
||||
// Use outputCpu buffer to store temporary input
|
||||
PtrUnion tempInputCpu;
|
||||
CHECK_CALL(tempInputCpu.Attach(collArgs.outputCpu.ptr));
|
||||
|
||||
for (int rank = 0; rank < collArgs.totalRanks; ++rank)
|
||||
{
|
||||
CHECK_CALL(tempInputCpu.FillPattern(collArgs.dataType, collArgs.numInputElements, rank, false));
|
||||
if (rank == collArgs.globalRank)
|
||||
{
|
||||
CHECK_HIP(hipMemcpy(collArgs.inputGpu.ptr, tempInputCpu.ptr, numInputBytes, hipMemcpyHostToDevice));
|
||||
}
|
||||
if (isAllGather || collArgs.root == collArgs.globalRank)
|
||||
{
|
||||
memcpy(result.I1 + (rank * numInputBytes), tempInputCpu.ptr, numInputBytes);
|
||||
}
|
||||
}
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
ErrCode DefaultPrepData_ReduceScatter(CollectiveArgs &collArgs)
|
||||
{
|
||||
CHECK_CALL(CheckAllocation(collArgs));
|
||||
if (collArgs.numInputElements != collArgs.numOutputElements * collArgs.totalRanks)
|
||||
{
|
||||
ERROR("# of input elements must be total ranks * # output elements for ReduceScatter\n");
|
||||
return TEST_FAIL;
|
||||
}
|
||||
|
||||
size_t const numInputBytes = collArgs.numInputElements * DataTypeToBytes(collArgs.dataType);
|
||||
size_t const numOutputBytes = collArgs.numOutputElements * DataTypeToBytes(collArgs.dataType);
|
||||
|
||||
// Clear output for all ranks (done before filling input in case of in-place)
|
||||
CHECK_CALL(collArgs.outputGpu.ClearGpuMem(numOutputBytes));
|
||||
|
||||
PtrUnion tempInputCpu;
|
||||
PtrUnion tempResultCpu;
|
||||
|
||||
CHECK_CALL(tempInputCpu.AllocateCpuMem(numInputBytes));
|
||||
CHECK_CALL(tempResultCpu.AllocateCpuMem(numInputBytes));
|
||||
CHECK_CALL(tempResultCpu.ClearCpuMem(numInputBytes));
|
||||
|
||||
// If average or custom reduction operator is used, perform a summation instead
|
||||
ncclRedOp_t const tempOp = (collArgs.redOp >= ncclAvg ? ncclSum : collArgs.redOp);
|
||||
|
||||
// Loop over each rank and generate the input / scale / reduce
|
||||
PtrUnion scalarsPerRank;
|
||||
scalarsPerRank.Attach(collArgs.scalarTransport.ptr);
|
||||
for (int rank = 0; rank < collArgs.totalRanks; ++rank)
|
||||
{
|
||||
CHECK_CALL(tempInputCpu.FillPattern(collArgs.dataType, collArgs.numInputElements, rank, false));
|
||||
|
||||
if (rank == collArgs.globalRank)
|
||||
{
|
||||
if (hipMemcpy(collArgs.inputGpu.ptr, tempInputCpu.ptr, numInputBytes, hipMemcpyHostToDevice) != hipSuccess)
|
||||
{
|
||||
ERROR("hipMemcpy to input failed\n");
|
||||
CHECK_CALL(tempInputCpu.FreeCpuMem());
|
||||
CHECK_CALL(tempResultCpu.FreeCpuMem());
|
||||
return TEST_FAIL;
|
||||
}
|
||||
}
|
||||
|
||||
// Scale the temporary input by local scalar for this rank
|
||||
// (Used by custom reduction ops)
|
||||
if (collArgs.scalarMode >= 0)
|
||||
{
|
||||
CHECK_CALL(tempInputCpu.Scale(collArgs.dataType, collArgs.numInputElements,
|
||||
scalarsPerRank, rank));
|
||||
}
|
||||
|
||||
if (rank == 0)
|
||||
{
|
||||
memcpy(tempResultCpu.ptr, tempInputCpu.ptr, numInputBytes);
|
||||
}
|
||||
else
|
||||
{
|
||||
CHECK_CALL(tempResultCpu.Reduce(collArgs.dataType, collArgs.numInputElements,
|
||||
tempInputCpu, tempOp));
|
||||
}
|
||||
}
|
||||
|
||||
// Perform averaging if necessary
|
||||
if (collArgs.redOp == ncclAvg)
|
||||
{
|
||||
CHECK_CALL(tempResultCpu.DivideByInt(collArgs.dataType, collArgs.numInputElements, collArgs.totalRanks));
|
||||
}
|
||||
|
||||
// Copy over portion of result
|
||||
memcpy(collArgs.expected.I1,
|
||||
tempResultCpu.I1 + collArgs.globalRank * numOutputBytes,
|
||||
numOutputBytes);
|
||||
CHECK_CALL(tempInputCpu.FreeCpuMem());
|
||||
CHECK_CALL(tempResultCpu.FreeCpuMem());
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
ErrCode DefaultPrepData_Scatter(CollectiveArgs &collArgs)
|
||||
{
|
||||
CHECK_CALL(CheckAllocation(collArgs));
|
||||
if (collArgs.numInputElements != collArgs.numOutputElements * collArgs.totalRanks)
|
||||
{
|
||||
ERROR("# of input elements must be total ranks * # output elements for Scatter\n");
|
||||
return TEST_FAIL;
|
||||
}
|
||||
|
||||
size_t const numInputBytes = collArgs.numInputElements * DataTypeToBytes(collArgs.dataType);
|
||||
size_t const numOutputBytes = collArgs.numOutputElements * DataTypeToBytes(collArgs.dataType);
|
||||
|
||||
// Clear outputs on all ranks (prior to input in case of in-place)
|
||||
collArgs.outputGpu.ClearGpuMem(numOutputBytes);
|
||||
|
||||
// Generate input as if on root rank - each rank will receive a portion
|
||||
PtrUnion tempInput;
|
||||
tempInput.AllocateCpuMem(numInputBytes);
|
||||
tempInput.FillPattern(collArgs.dataType, collArgs.numInputElements, collArgs.root, false);
|
||||
|
||||
// Copy input to root rank
|
||||
if (collArgs.globalRank == collArgs.root)
|
||||
{
|
||||
if (hipMemcpy(collArgs.inputGpu.ptr, tempInput.ptr, numInputBytes, hipMemcpyHostToDevice) != hipSuccess)
|
||||
{
|
||||
ERROR("hipMemcpy to input failed\n");
|
||||
tempInput.FreeCpuMem();
|
||||
return TEST_FAIL;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
collArgs.inputGpu.ClearGpuMem(numInputBytes);
|
||||
}
|
||||
|
||||
// Each rank receive a portion of the input
|
||||
memcpy(collArgs.expected.U1, tempInput.U1 + (collArgs.globalRank * numOutputBytes), numOutputBytes);
|
||||
|
||||
tempInput.FreeCpuMem();
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
ErrCode DefaultPrepData_AllToAll(CollectiveArgs &collArgs)
|
||||
{
|
||||
CHECK_CALL(CheckAllocation(collArgs));
|
||||
if (collArgs.numInputElements != collArgs.numOutputElements)
|
||||
{
|
||||
ERROR("Number of input elements must match number of output elements for AllToAll\n");
|
||||
return TEST_FAIL;
|
||||
}
|
||||
if (collArgs.numInputElements % collArgs.totalRanks)
|
||||
{
|
||||
ERROR("Input / Output size for AllToAll must be a multiple of %d\n", collArgs.totalRanks);
|
||||
return TEST_FAIL;
|
||||
}
|
||||
size_t const numInputBytes = collArgs.numInputElements * DataTypeToBytes(collArgs.dataType);
|
||||
size_t const numOutputBytes = collArgs.numOutputElements * DataTypeToBytes(collArgs.dataType);
|
||||
size_t const numBytes = numInputBytes / collArgs.totalRanks;
|
||||
|
||||
// Clear outputs on all ranks (prior to input in case of in-place)
|
||||
collArgs.outputGpu.ClearGpuMem(numOutputBytes);
|
||||
|
||||
// Generate input on root rank - each rank will receive a portion
|
||||
PtrUnion tempInput;
|
||||
tempInput.Attach(collArgs.outputCpu);
|
||||
|
||||
for (int rank = 0; rank < collArgs.totalRanks; ++rank)
|
||||
{
|
||||
tempInput.FillPattern(collArgs.dataType, collArgs.numInputElements, rank, false);
|
||||
|
||||
// Copy input
|
||||
if (rank == collArgs.globalRank)
|
||||
{
|
||||
CHECK_HIP(hipMemcpy(collArgs.inputGpu.ptr, tempInput.ptr, numInputBytes, hipMemcpyHostToDevice));
|
||||
}
|
||||
memcpy(collArgs.expected.U1 + (numBytes * rank), tempInput.U1 + (numBytes * collArgs.globalRank), numBytes);
|
||||
}
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,26 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#pragma once
|
||||
#include "ErrCode.hpp"
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
class CollectiveArgs;
|
||||
|
||||
// Checks that enough memory has been allocated
|
||||
ErrCode CheckAllocation(CollectiveArgs const& collArgs);
|
||||
|
||||
// Default PrepareData functions
|
||||
// PrepareData functions are responsible for setting up input / expected for the given collArgs
|
||||
ErrCode DefaultPrepareDataFunc(CollectiveArgs &collArgs);
|
||||
ErrCode DefaultPrepData_Broadcast(CollectiveArgs &collArgs);
|
||||
ErrCode DefaultPrepData_Reduce(CollectiveArgs &collArgs, bool const isAllReduce);
|
||||
ErrCode DefaultPrepData_Gather(CollectiveArgs &collArgs, bool const isAllGather);
|
||||
ErrCode DefaultPrepData_ReduceScatter(CollectiveArgs &collArgs);
|
||||
ErrCode DefaultPrepData_Scatter(CollectiveArgs &collArgs);
|
||||
ErrCode DefaultPrepData_AllToAll(CollectiveArgs &collArgs);
|
||||
ErrCode DefaultPrepData_SendRecv(CollectiveArgs &collArgs);
|
||||
}
|
||||
@@ -0,0 +1,354 @@
|
||||
#include "PtrUnion.hpp"
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
size_t DataTypeToBytes(ncclDataType_t const dataType)
|
||||
{
|
||||
switch (dataType)
|
||||
{
|
||||
case ncclInt8: return 1;
|
||||
case ncclUint8: return 1;
|
||||
case ncclInt32: return 4;
|
||||
case ncclUint32: return 4;
|
||||
case ncclInt64: return 8;
|
||||
case ncclUint64: return 8;
|
||||
case ncclFloat16: return 2;
|
||||
case ncclFloat32: return 4;
|
||||
case ncclFloat64: return 8;
|
||||
case ncclBfloat16: return 2;
|
||||
default:
|
||||
ERROR("Unsupported datatype (%d)\n", dataType);
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
ErrCode PtrUnion::Attach(void *ptr)
|
||||
{
|
||||
this->ptr = ptr;
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
ErrCode PtrUnion::Attach(PtrUnion ptrUnion)
|
||||
{
|
||||
this->ptr = ptrUnion.ptr;
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
ErrCode PtrUnion::AllocateGpuMem(size_t const numBytes, bool const useManagedMem)
|
||||
{
|
||||
if (numBytes)
|
||||
{
|
||||
if (useManagedMem)
|
||||
{
|
||||
if (hipMallocManaged(&I1, numBytes) != hipSuccess)
|
||||
{
|
||||
ERROR("Unable to allocate managed memory of GPU memory (%lu bytes)\n", numBytes);
|
||||
return TEST_FAIL;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (hipMalloc(&I1, numBytes) != hipSuccess)
|
||||
{
|
||||
ERROR("Unable to allocate memory of GPU memory (%lu bytes)\n", numBytes);
|
||||
return TEST_FAIL;
|
||||
}
|
||||
}
|
||||
}
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
ErrCode PtrUnion::AllocateCpuMem(size_t const numBytes)
|
||||
{
|
||||
if (numBytes)
|
||||
{
|
||||
this->ptr = calloc(numBytes, 1);
|
||||
if (!ptr)
|
||||
{
|
||||
ERROR("Unable to allocate memory (%lu bytes)\n", numBytes);
|
||||
return TEST_FAIL;
|
||||
}
|
||||
}
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
ErrCode PtrUnion::FreeGpuMem()
|
||||
{
|
||||
if (this->ptr != nullptr)
|
||||
{
|
||||
hipFree(this->ptr);
|
||||
this->ptr = nullptr;
|
||||
}
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
ErrCode PtrUnion::FreeCpuMem()
|
||||
{
|
||||
if (this->ptr != nullptr)
|
||||
{
|
||||
free(this->ptr);
|
||||
this->ptr = nullptr;
|
||||
}
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
ErrCode PtrUnion::ClearGpuMem(size_t const numBytes)
|
||||
{
|
||||
if (hipMemset(this->ptr, 0, numBytes) != hipSuccess)
|
||||
{
|
||||
ERROR("Unable to call hipMemset\n");
|
||||
return TEST_FAIL;
|
||||
}
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
ErrCode PtrUnion::ClearCpuMem(size_t const numBytes)
|
||||
{
|
||||
memset(this->ptr, 0, numBytes);
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
ErrCode PtrUnion::FillPattern(ncclDataType_t const dataType,
|
||||
size_t const numElements,
|
||||
int const globalRank,
|
||||
bool const isGpuMem)
|
||||
{
|
||||
PtrUnion temp;
|
||||
size_t const numBytes = numElements * DataTypeToBytes(dataType);
|
||||
|
||||
// If this is GPU memory, create a CPU temp buffer otherwise fill CPU memory directly
|
||||
if (isGpuMem)
|
||||
temp.AllocateCpuMem(numBytes);
|
||||
else
|
||||
temp.Attach(this->ptr);
|
||||
|
||||
for (int i = 0; i < numElements; i++)
|
||||
{
|
||||
int valueI = (globalRank + i) % 256;
|
||||
double valueF = 1.0L/((double)valueI+1.0L);
|
||||
temp.Set(dataType, i, valueI, valueF);
|
||||
}
|
||||
|
||||
// If this is GPU memory, copy from CPU temp buffer
|
||||
if (isGpuMem)
|
||||
{
|
||||
if (hipMemcpy(this->ptr, temp.ptr, numBytes, hipMemcpyHostToDevice) != hipSuccess)
|
||||
{
|
||||
ERROR("Unable to fill input with pattern for rank %d\n", globalRank);
|
||||
return TEST_FAIL;
|
||||
}
|
||||
temp.FreeCpuMem();
|
||||
}
|
||||
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
ErrCode PtrUnion::Set(ncclDataType_t const dataType, int const idx, int valueI, double valueF)
|
||||
{
|
||||
switch (dataType)
|
||||
{
|
||||
case ncclInt8: I1[idx] = valueI; break;
|
||||
case ncclUint8: U1[idx] = valueI; break;
|
||||
case ncclInt32: I4[idx] = valueI; break;
|
||||
case ncclUint32: U4[idx] = valueI; break;
|
||||
case ncclInt64: I8[idx] = valueI; break;
|
||||
case ncclUint64: U8[idx] = valueI; break;
|
||||
case ncclFloat32: F4[idx] = valueF; break;
|
||||
case ncclFloat64: F8[idx] = valueF; break;
|
||||
case ncclBfloat16: B2[idx] = rccl_bfloat16(static_cast<float>(valueF)); break;
|
||||
default:
|
||||
ERROR("Unsupported datatype\n");
|
||||
return TEST_FAIL;
|
||||
}
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
ErrCode PtrUnion::Get(ncclDataType_t const dataType, int const idx, int& valueI, double& valueF) const
|
||||
{
|
||||
switch (dataType)
|
||||
{
|
||||
case ncclInt8: valueI = I1[idx]; break;
|
||||
case ncclUint8: valueI = I1[idx]; break;
|
||||
case ncclInt32: valueI = I4[idx]; break;
|
||||
case ncclUint32: valueI = U4[idx]; break;
|
||||
case ncclInt64: valueI = I8[idx]; break;
|
||||
case ncclUint64: valueI = U8[idx]; break;
|
||||
case ncclFloat32: valueF = F4[idx]; break;
|
||||
case ncclFloat64: valueF = F8[idx]; break;
|
||||
case ncclBfloat16: valueF = B2[idx]; break;
|
||||
default:
|
||||
ERROR("Unsupported datatype\n");
|
||||
return TEST_FAIL;
|
||||
}
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
// Multiplies in-place each element by scalarsPerRank[rank]
|
||||
ErrCode PtrUnion::Scale(ncclDataType_t const dataType,
|
||||
size_t const numElements,
|
||||
PtrUnion const& scalarsPerRank,
|
||||
int const rank)
|
||||
{
|
||||
// If no scalars are provided do nothing
|
||||
if (scalarsPerRank.ptr == nullptr) return TEST_SUCCESS;
|
||||
|
||||
for (size_t idx = 0; idx < numElements; ++idx)
|
||||
{
|
||||
switch (dataType)
|
||||
{
|
||||
case ncclInt8: I1[idx] *= scalarsPerRank.I1[rank]; break;
|
||||
case ncclUint8: U1[idx] *= scalarsPerRank.U1[rank]; break;
|
||||
case ncclInt32: I4[idx] *= scalarsPerRank.I4[rank]; break;
|
||||
case ncclUint32: U4[idx] *= scalarsPerRank.U4[rank]; break;
|
||||
case ncclInt64: I8[idx] *= scalarsPerRank.I8[rank]; break;
|
||||
case ncclUint64: U8[idx] *= scalarsPerRank.U8[rank]; break;
|
||||
case ncclFloat32: F4[idx] *= scalarsPerRank.F4[rank]; break;
|
||||
case ncclFloat64: F8[idx] *= scalarsPerRank.F8[rank]; break;
|
||||
case ncclBfloat16: B2[idx] *= scalarsPerRank.B2[rank]; break;
|
||||
default:
|
||||
ERROR("Unsupported datatype\n");
|
||||
return TEST_FAIL;
|
||||
}
|
||||
}
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
ErrCode PtrUnion::Reduce(ncclDataType_t const dataType,
|
||||
size_t const numElements,
|
||||
PtrUnion const& inputCpu,
|
||||
ncclRedOp_t const op)
|
||||
{
|
||||
if (inputCpu.ptr == nullptr)
|
||||
{
|
||||
ERROR("Input pointer to Reduce should not be nullptr\n");
|
||||
return TEST_FAIL;
|
||||
}
|
||||
|
||||
for (size_t idx = 0; idx < numElements; ++idx)
|
||||
{
|
||||
switch (dataType)
|
||||
{
|
||||
case ncclInt8: I1[idx] = ReduceOp(op, I1[idx], inputCpu.I1[idx]); break;
|
||||
case ncclUint8: U1[idx] = ReduceOp(op, U1[idx], inputCpu.U1[idx]); break;
|
||||
case ncclInt32: I4[idx] = ReduceOp(op, I4[idx], inputCpu.I4[idx]); break;
|
||||
case ncclUint32: U4[idx] = ReduceOp(op, U4[idx], inputCpu.U4[idx]); break;
|
||||
case ncclInt64: I8[idx] = ReduceOp(op, I8[idx], inputCpu.I8[idx]); break;
|
||||
case ncclUint64: U8[idx] = ReduceOp(op, U8[idx], inputCpu.U8[idx]); break;
|
||||
case ncclFloat32: F4[idx] = ReduceOp(op, F4[idx], inputCpu.F4[idx]); break;
|
||||
case ncclFloat64: F8[idx] = ReduceOp(op, F8[idx], inputCpu.F8[idx]); break;
|
||||
case ncclBfloat16: B2[idx] = ReduceOp(op, B2[idx], inputCpu.B2[idx]); break;
|
||||
default:
|
||||
ERROR("Unsupported datatype\n");
|
||||
return TEST_FAIL;
|
||||
}
|
||||
}
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
ErrCode PtrUnion::DivideByInt(ncclDataType_t const dataType,
|
||||
size_t const numElements,
|
||||
int const divisor)
|
||||
{
|
||||
for (size_t idx = 0; idx < numElements; ++idx)
|
||||
{
|
||||
switch (dataType)
|
||||
{
|
||||
case ncclInt8: I1[idx] /= divisor; break;
|
||||
case ncclUint8: U1[idx] /= divisor; break;
|
||||
case ncclInt32: I4[idx] /= divisor; break;
|
||||
case ncclUint32: U4[idx] /= divisor; break;
|
||||
case ncclInt64: I8[idx] /= divisor; break;
|
||||
case ncclUint64: U8[idx] /= divisor; break;
|
||||
case ncclFloat32: F4[idx] /= divisor; break;
|
||||
case ncclFloat64: F8[idx] /= divisor; break;
|
||||
case ncclBfloat16: B2[idx] = (rccl_bfloat16((float)(B2[idx]) / divisor)); break;
|
||||
default:
|
||||
ERROR("Unsupported datatype\n");
|
||||
return TEST_FAIL;
|
||||
}
|
||||
}
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
ErrCode PtrUnion::IsEqual(ncclDataType_t const dataType,
|
||||
size_t const numElements,
|
||||
PtrUnion const& expected,
|
||||
bool const verbose,
|
||||
bool& isMatch)
|
||||
{
|
||||
isMatch = true;
|
||||
size_t idx = 0;
|
||||
for (idx = 0; idx < numElements; ++idx)
|
||||
{
|
||||
switch (dataType)
|
||||
{
|
||||
case ncclInt8: isMatch = (I1[idx] == expected.I1[idx]); break;
|
||||
case ncclUint8: isMatch = (U1[idx] == expected.U1[idx]); break;
|
||||
case ncclInt32: isMatch = (I4[idx] == expected.I4[idx]); break;
|
||||
case ncclUint32: isMatch = (U4[idx] == expected.U4[idx]); break;
|
||||
case ncclInt64: isMatch = (I8[idx] == expected.I8[idx]); break;
|
||||
case ncclUint64: isMatch = (U8[idx] == expected.U8[idx]); break;
|
||||
case ncclFloat32: isMatch = (fabs(F4[idx] - expected.F4[idx]) < 1e-5); break;
|
||||
case ncclFloat64: isMatch = (fabs(F8[idx] - expected.F8[idx]) < 1e-12); break;
|
||||
case ncclBfloat16: isMatch = (fabs((float)B2[idx] - (float)expected.B2[idx]) < 9e-2); break;
|
||||
default:
|
||||
ERROR("Unsupported datatype\n");
|
||||
return TEST_FAIL;
|
||||
}
|
||||
if (!isMatch) break;
|
||||
}
|
||||
|
||||
if (verbose && !isMatch)
|
||||
{
|
||||
switch (dataType)
|
||||
{
|
||||
case ncclInt8:
|
||||
ERROR("Expected output: %d. Actual output: %d at index %lu\n", expected.I1[idx], I1[idx], idx); break;
|
||||
case ncclUint8:
|
||||
ERROR("Expected output: %u. Actual output: %u at index %lu\n", expected.U1[idx], U1[idx], idx); break;
|
||||
case ncclInt32:
|
||||
ERROR("Expected output: %d. Actual output: %d at index %lu\n", expected.I4[idx], I4[idx], idx); break;
|
||||
case ncclUint32:
|
||||
ERROR("Expected output: %u. Actual output: %u at index %lu\n", expected.U4[idx], U4[idx], idx); break;
|
||||
case ncclInt64:
|
||||
ERROR("Expected output: %ld. Actual output: %ld at index %lu\n", expected.I8[idx], I8[idx], idx); break;
|
||||
case ncclUint64:
|
||||
ERROR("Expected output: %lu. Actual output: %lu at index %lu\n", expected.U8[idx], U8[idx], idx); break;
|
||||
case ncclFloat32:
|
||||
ERROR("Expected output: %f. Actual output: %f at index %lu\n", expected.F4[idx], F4[idx], idx); break;
|
||||
case ncclFloat64:
|
||||
ERROR("Expected output: %lf. Actual output: %lf at index %lu\n", expected.F8[idx], F8[idx], idx); break;
|
||||
case ncclBfloat16:
|
||||
ERROR("Expected output: %f. Actual output: %f at index %lu\n", (float)expected.B2[idx], (float)B2[idx], idx); break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
std::string PtrUnion::ToString(ncclDataType_t const dataType,
|
||||
size_t const numElements) const
|
||||
{
|
||||
std::stringstream ss;
|
||||
for (int i = 0; i < numElements; i++)
|
||||
{
|
||||
if (i) ss << " ";
|
||||
switch (dataType)
|
||||
{
|
||||
case ncclInt8: ss << I1[i]; break;
|
||||
case ncclUint8: ss << U1[i]; break;
|
||||
case ncclInt32: ss << I4[i]; break;
|
||||
case ncclUint32: ss << U4[i]; break;
|
||||
case ncclInt64: ss << I8[i]; break;
|
||||
case ncclUint64: ss << U8[i]; break;
|
||||
case ncclFloat32: ss << F4[i]; break;
|
||||
case ncclFloat64: ss << F8[i]; break;
|
||||
case ncclBfloat16: ss << (float)B2[i]; break;
|
||||
default: break;
|
||||
}
|
||||
}
|
||||
return ss.str();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,90 @@
|
||||
#pragma once
|
||||
#include "ErrCode.hpp"
|
||||
#include "rccl.h"
|
||||
#include "rccl_bfloat16.h"
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
// Performs the various basic reduction operations
|
||||
template <typename T>
|
||||
T ReduceOp(ncclRedOp_t const op, T const A, T const B)
|
||||
{
|
||||
switch (op)
|
||||
{
|
||||
case ncclSum: return A + B;
|
||||
case ncclProd: return A * B;
|
||||
case ncclMax: return std::max(A, B);
|
||||
case ncclMin: return std::min(A, B);
|
||||
default:
|
||||
ERROR("Unsupported reduction operator (%d)\n", op);
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
size_t DataTypeToBytes(ncclDataType_t const dataType);
|
||||
|
||||
// PtrUnion encapsulates a pointer of all the different supported datatypes
|
||||
// NOTE: Currently half-precision float tests are unsupported due to half
|
||||
// being supported on GPU only and not host
|
||||
union PtrUnion
|
||||
{
|
||||
void* ptr;
|
||||
int8_t* I1; // ncclInt8
|
||||
uint8_t* U1; // ncclUint8
|
||||
int32_t* I4; // ncclInt32
|
||||
uint32_t* U4; // ncclUint32
|
||||
int64_t* I8; // ncclInt64
|
||||
uint64_t* U8; // ncclUint64
|
||||
float* F4; // ncclFloat32
|
||||
double* F8; // ncclFloat64
|
||||
rccl_bfloat16* B2; // ncclBfloat16
|
||||
|
||||
ErrCode Attach(void *ptr);
|
||||
ErrCode Attach(PtrUnion ptrUnion);
|
||||
|
||||
ErrCode AllocateGpuMem(size_t const numBytes, bool const useManagedMem = false);
|
||||
ErrCode AllocateCpuMem(size_t const numBytes);
|
||||
|
||||
ErrCode FreeGpuMem();
|
||||
ErrCode FreeCpuMem();
|
||||
|
||||
ErrCode ClearGpuMem(size_t const numBytes);
|
||||
ErrCode ClearCpuMem(size_t const numBytes);
|
||||
|
||||
ErrCode FillPattern(ncclDataType_t const dataType,
|
||||
size_t const numElements,
|
||||
int const globalRank,
|
||||
bool const isGpuMem);
|
||||
|
||||
ErrCode Set(ncclDataType_t const dataType, int const idx, int valueI, double valueF);
|
||||
ErrCode Get(ncclDataType_t const dataType, int const idx, int& valueI, double& valueF) const;
|
||||
|
||||
// Multiplies in-place each element by scalarsPerRank[rank]
|
||||
ErrCode Scale(ncclDataType_t const dataType,
|
||||
size_t const numElements,
|
||||
PtrUnion const& scalarsPerRank,
|
||||
int const rank);
|
||||
|
||||
// Reduces input into this PtrUnion
|
||||
ErrCode Reduce(ncclDataType_t const dataType,
|
||||
size_t const numElements,
|
||||
PtrUnion const& inputCpu,
|
||||
ncclRedOp_t const op);
|
||||
|
||||
// Divide each element by a integer value
|
||||
ErrCode DivideByInt(ncclDataType_t const dataType,
|
||||
size_t const numElements,
|
||||
int const divisor);
|
||||
|
||||
// Compares for equality (fuzzy comparision for floating point types)
|
||||
ErrCode IsEqual(ncclDataType_t const dataType,
|
||||
size_t const numElements,
|
||||
PtrUnion const& expected,
|
||||
bool const verbose,
|
||||
bool& isMatch);
|
||||
|
||||
// Output to string (for debug)
|
||||
std::string ToString(ncclDataType_t const dataType,
|
||||
size_t const numElements) const;
|
||||
};
|
||||
}
|
||||
@@ -0,0 +1,485 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include <unistd.h>
|
||||
#include "TestBed.hpp"
|
||||
#include <rccl.h>
|
||||
|
||||
#define PIPE_WRITE(childId, val) \
|
||||
ASSERT_EQ(write(childList[childId]->parentWriteFd, &val, sizeof(val)), sizeof(val))
|
||||
|
||||
#define PIPE_CHECK(childId) \
|
||||
{ \
|
||||
int response = 0; \
|
||||
ASSERT_EQ(read(childList[childId]->parentReadFd, &response, sizeof(int)), sizeof(int)); \
|
||||
ASSERT_EQ(response, TEST_SUCCESS); \
|
||||
}
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
TestBed::TestBed() :
|
||||
numDevicesAvailable(0),
|
||||
numActiveChildren(0),
|
||||
numActiveRanks(0)
|
||||
{
|
||||
// Set NCCL_COMM_ID to use a local port to avoid passing ncclCommId
|
||||
// Calling ncclGetUniqueId would initialize HIP, which should not be done prior to fork
|
||||
std::string localPort = "55513";
|
||||
if (!getenv("NCCL_COMM_ID"))
|
||||
{
|
||||
char hostname[HOST_NAME_MAX+1];
|
||||
gethostname(hostname, HOST_NAME_MAX+1);
|
||||
std::string hostnameString(hostname);
|
||||
hostnameString.append(":55513");
|
||||
setenv("NCCL_COMM_ID", hostnameString.c_str(), 0);
|
||||
if (ev.verbose) INFO("NCCL_COMM_ID set to %s\n", hostnameString.c_str());
|
||||
}
|
||||
|
||||
// Collect the number of GPUs
|
||||
this->numDevicesAvailable = ev.maxGpus;
|
||||
if (ev.verbose) INFO("Detected %d GPUs\n", this->numDevicesAvailable);
|
||||
|
||||
// Create the maximum number of possible child processes (1 per GPU)
|
||||
// Parent and child communicate via pipes
|
||||
childList.resize(this->numDevicesAvailable);
|
||||
for (int childId = 0; childId < this->numDevicesAvailable; ++childId)
|
||||
{
|
||||
childList[childId] = new TestBedChild(childId, ev.verbose, ev.printValues);
|
||||
if (childList[childId]->InitPipes() != TEST_SUCCESS)
|
||||
{
|
||||
ERROR("Unable to create pipes to child process\n");
|
||||
return;
|
||||
}
|
||||
|
||||
pid_t pid = fork();
|
||||
if (pid == 0)
|
||||
{
|
||||
// Child process enters execution loop
|
||||
childList[childId]->StartExecutionLoop();
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Parent records child process ID and closes unused ends of pipe
|
||||
childList[childId]->pid = pid;
|
||||
close(childList[childId]->childWriteFd);
|
||||
close(childList[childId]->childReadFd);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void TestBed::InitComms(std::vector<std::vector<int>> const& deviceIdsPerProcess,
|
||||
int const numCollectivesInGroup)
|
||||
{
|
||||
// Count up the total number of GPUs to use and track child/deviceId per rank
|
||||
this->numActiveChildren = deviceIdsPerProcess.size();
|
||||
this->numActiveRanks = 0;
|
||||
this->numCollectivesInGroup = numCollectivesInGroup;
|
||||
this->rankToChildMap.clear();
|
||||
this->rankToDeviceMap.clear();
|
||||
if (ev.verbose) INFO("Setting up %d active child processes\n", this->numActiveChildren);
|
||||
for (int childId = 0; childId < this->numActiveChildren; ++childId)
|
||||
{
|
||||
for (auto i = 0; i < deviceIdsPerProcess[childId].size(); ++i)
|
||||
{
|
||||
this->rankToChildMap.push_back(childId);
|
||||
this->rankToDeviceMap.push_back(deviceIdsPerProcess[childId][i]);
|
||||
++this->numActiveRanks;
|
||||
}
|
||||
}
|
||||
|
||||
// Send InitComms command to each active child process
|
||||
int const cmd = TestBedChild::CHILD_INIT_COMMS;
|
||||
int rankOffset = 0;
|
||||
for (int childId = 0; childId < this->numActiveChildren; ++childId)
|
||||
{
|
||||
PIPE_WRITE(childId, cmd);
|
||||
|
||||
// Send total number of ranks to child process
|
||||
PIPE_WRITE(childId, this->numActiveRanks);
|
||||
|
||||
// Send the rank offset for this child process
|
||||
PIPE_WRITE(childId, rankOffset);
|
||||
|
||||
// Send the number of collectives to be run per group call
|
||||
PIPE_WRITE(childId, numCollectivesInGroup);
|
||||
|
||||
// Send the GPUs this child uses
|
||||
int const numGpus = deviceIdsPerProcess[childId].size();
|
||||
PIPE_WRITE(childId, numGpus);
|
||||
for (int i = 0; i < numGpus; i++)
|
||||
PIPE_WRITE(childId, deviceIdsPerProcess[childId][i]);
|
||||
|
||||
rankOffset += numGpus;
|
||||
}
|
||||
|
||||
// Wait for child acknowledgement
|
||||
// This is done after previous loop to avoid deadlock as every rank needs to enter ncclInitCommRank
|
||||
for (int childId = 0; childId < this->numActiveChildren; ++childId)
|
||||
{
|
||||
PIPE_CHECK(childId);
|
||||
}
|
||||
}
|
||||
|
||||
void TestBed::InitComms(int const numGpus, int const numCollectivesInGroup)
|
||||
{
|
||||
InitComms(TestBed::GetDeviceIdsList(1, numGpus), numCollectivesInGroup);
|
||||
}
|
||||
|
||||
void TestBed::SetCollectiveArgs(ncclFunc_t const funcType,
|
||||
ncclDataType_t const dataType,
|
||||
ncclRedOp_t const redOp,
|
||||
int const root,
|
||||
size_t const numInputElements,
|
||||
size_t const numOutputElements,
|
||||
int const collId,
|
||||
int const rank,
|
||||
PtrUnion const scalarsPerRank,
|
||||
int const scalarMode)
|
||||
{
|
||||
// Build list of ranks this applies to (-1 for rank means to set for all)
|
||||
std::vector<int> rankList;
|
||||
for (int i = 0; i < this->numActiveRanks; ++i)
|
||||
if (rank == -1 || rank == i) rankList.push_back(i);
|
||||
|
||||
ScalarTransport scalarTransport;
|
||||
if (scalarMode >= 0)
|
||||
{
|
||||
ASSERT_TRUE(scalarsPerRank.ptr != NULL);
|
||||
|
||||
// Capture scalars per rank in format to share with child processes
|
||||
int const numBytes = this->numActiveRanks * DataTypeToBytes(dataType);
|
||||
memcpy(scalarTransport.ptr, scalarsPerRank.ptr, numBytes);
|
||||
}
|
||||
|
||||
// Loop over all ranks and send CollectiveArgs to appropriate child process
|
||||
int const cmd = TestBedChild::CHILD_SET_COLL_ARGS;
|
||||
for (auto currRank : rankList)
|
||||
{
|
||||
int const childId = rankToChildMap[currRank];
|
||||
PIPE_WRITE(childId, cmd);
|
||||
PIPE_WRITE(childId, currRank);
|
||||
PIPE_WRITE(childId, collId);
|
||||
PIPE_WRITE(childId, funcType);
|
||||
PIPE_WRITE(childId, dataType);
|
||||
PIPE_WRITE(childId, redOp);
|
||||
PIPE_WRITE(childId, root);
|
||||
PIPE_WRITE(childId, numInputElements);
|
||||
PIPE_WRITE(childId, numOutputElements);
|
||||
PIPE_WRITE(childId, scalarMode);
|
||||
PIPE_WRITE(childId, scalarTransport);
|
||||
PIPE_CHECK(childId);
|
||||
}
|
||||
}
|
||||
|
||||
void TestBed::AllocateMem(bool const inPlace,
|
||||
bool const useManagedMem,
|
||||
int const collId,
|
||||
int const rank)
|
||||
{
|
||||
// Build list of ranks this applies to (-1 for rank means to set for all)
|
||||
std::vector<int> rankList;
|
||||
for (int i = 0; i < this->numActiveRanks; ++i)
|
||||
if (rank == -1 || rank == i) rankList.push_back(i);
|
||||
|
||||
// Loop over all ranks and send allocation command to appropriate child process
|
||||
int const cmd = TestBedChild::CHILD_ALLOCATE_MEM;
|
||||
for (auto currRank : rankList)
|
||||
{
|
||||
int const childId = rankToChildMap[currRank];
|
||||
PIPE_WRITE(childId, cmd);
|
||||
PIPE_WRITE(childId, currRank);
|
||||
PIPE_WRITE(childId, collId);
|
||||
PIPE_WRITE(childId, inPlace);
|
||||
PIPE_WRITE(childId, useManagedMem);
|
||||
PIPE_CHECK(childId);
|
||||
}
|
||||
}
|
||||
|
||||
void TestBed::PrepareData(int const collId,
|
||||
int const rank,
|
||||
CollFuncPtr const prepDataFunc)
|
||||
{
|
||||
// Build list of ranks this applies to (-1 for rank means to set for all)
|
||||
std::vector<int> rankList;
|
||||
for (int i = 0; i < this->numActiveRanks; ++i)
|
||||
if (rank == -1 || rank == i) rankList.push_back(i);
|
||||
|
||||
// Loop over all ranks and send prepare data command to appropriate child process
|
||||
int const cmd = TestBedChild::CHILD_PREPARE_DATA;
|
||||
for (auto currRank : rankList)
|
||||
{
|
||||
int const childId = rankToChildMap[currRank];
|
||||
PIPE_WRITE(childId, cmd);
|
||||
PIPE_WRITE(childId, currRank);
|
||||
PIPE_WRITE(childId, collId);
|
||||
PIPE_WRITE(childId, prepDataFunc);
|
||||
PIPE_CHECK(childId);
|
||||
}
|
||||
}
|
||||
|
||||
void TestBed::ExecuteCollectives()
|
||||
{
|
||||
int const cmd = TestBedChild::CHILD_EXECUTE_COLL;
|
||||
++TestBed::NumTestsRun();
|
||||
|
||||
// Send ExecuteColl command to each active child process
|
||||
for (int childId = 0; childId < this->numActiveChildren; ++childId)
|
||||
{
|
||||
PIPE_WRITE(childId, cmd);
|
||||
}
|
||||
|
||||
// Wait for child acknowledgement
|
||||
for (int childId = 0; childId < this->numActiveChildren; ++childId)
|
||||
{
|
||||
PIPE_CHECK(childId);
|
||||
}
|
||||
}
|
||||
|
||||
void TestBed::ValidateResults(bool& isCorrect, int const collId, int const rank)
|
||||
{
|
||||
// Build list of ranks this applies to (-1 for rank means to set for all)
|
||||
std::vector<int> rankList;
|
||||
for (int i = 0; i < this->numActiveRanks; ++i)
|
||||
if (rank == -1 || rank == i) rankList.push_back(i);
|
||||
|
||||
int const cmd = TestBedChild::CHILD_VALIDATE_RESULTS;
|
||||
|
||||
isCorrect = true;
|
||||
// Send ValidateResults command to each active child process
|
||||
for (auto currRank : rankList)
|
||||
{
|
||||
int const childId = rankToChildMap[currRank];
|
||||
PIPE_WRITE(childId, cmd);
|
||||
PIPE_WRITE(childId, currRank);
|
||||
PIPE_WRITE(childId, collId);
|
||||
|
||||
int response = 0;
|
||||
ASSERT_EQ(read(childList[childId]->parentReadFd, &response, sizeof(int)), sizeof(int));
|
||||
isCorrect &= (response == TEST_SUCCESS);
|
||||
}
|
||||
|
||||
ASSERT_EQ(isCorrect, true) << "Output does not match expected";
|
||||
}
|
||||
|
||||
void TestBed::DeallocateMem(int const collId, int const rank)
|
||||
{
|
||||
// Build list of ranks this applies to (-1 for rank means to set for all)
|
||||
std::vector<int> rankList;
|
||||
for (int i = 0; i < this->numActiveRanks; ++i)
|
||||
if (rank == -1 || rank == i) rankList.push_back(i);
|
||||
|
||||
int const cmd = TestBedChild::CHILD_DEALLOCATE_MEM;
|
||||
|
||||
for (auto currRank : rankList)
|
||||
{
|
||||
int const childId = rankToChildMap[currRank];
|
||||
PIPE_WRITE(childId, cmd);
|
||||
PIPE_WRITE(childId, currRank);
|
||||
PIPE_WRITE(childId, collId);
|
||||
PIPE_CHECK(childId);
|
||||
}
|
||||
}
|
||||
|
||||
void TestBed::DestroyComms()
|
||||
{
|
||||
int const cmd = TestBedChild::CHILD_DESTROY_COMMS;
|
||||
for (int childId = 0; childId < this->numActiveChildren; ++childId)
|
||||
{
|
||||
// Send DestroyComms command to each active child process
|
||||
PIPE_WRITE(childId, cmd);
|
||||
|
||||
// Wait for child acknowledgement
|
||||
PIPE_CHECK(childId);
|
||||
}
|
||||
|
||||
// Reset bookkeeping
|
||||
this->numActiveChildren = 0;
|
||||
this->numActiveRanks = 0;
|
||||
this->numCollectivesInGroup = 0;
|
||||
}
|
||||
|
||||
void TestBed::Finalize()
|
||||
{
|
||||
// Send Stop to all child processes
|
||||
int const cmd = TestBedChild::CHILD_STOP;
|
||||
for (int childId = 0; childId < this->numDevicesAvailable; ++childId)
|
||||
{
|
||||
PIPE_WRITE(childId, cmd);
|
||||
|
||||
// Close pipes to child process
|
||||
close(childList[childId]->parentWriteFd);
|
||||
close(childList[childId]->parentReadFd);
|
||||
}
|
||||
this->numDevicesAvailable = 0;
|
||||
}
|
||||
|
||||
TestBed::~TestBed()
|
||||
{
|
||||
Finalize();
|
||||
}
|
||||
|
||||
std::vector<ncclRedOp_t> const& TestBed::GetAllSupportedRedOps()
|
||||
{
|
||||
return ev.GetAllSupportedRedOps();
|
||||
}
|
||||
|
||||
std::vector<ncclDataType_t> const& TestBed::GetAllSupportedDataTypes()
|
||||
{
|
||||
return ev.GetAllSupportedDataTypes();
|
||||
}
|
||||
|
||||
std::vector<std::vector<int>> TestBed::GetDeviceIdsList(int const numProcesses,
|
||||
int const numGpus)
|
||||
{
|
||||
std::vector<std::vector<int>> result(numProcesses);
|
||||
for (int i = 0; i < numGpus; i++)
|
||||
result[i % numProcesses].push_back(i);
|
||||
return result;
|
||||
}
|
||||
|
||||
std::string TestBed::GetTestCaseName(int const totalRanks,
|
||||
bool const isMultiProcess,
|
||||
ncclFunc_t const funcType,
|
||||
ncclDataType_t const dataType,
|
||||
ncclRedOp_t const redOp,
|
||||
int const root,
|
||||
bool const inPlace,
|
||||
bool const managedMem)
|
||||
{
|
||||
std::stringstream ss;
|
||||
ss << (isMultiProcess ? "MP" : "SP") << " ";
|
||||
ss << totalRanks << " ranks ";
|
||||
ss << ncclFuncNames[funcType] << " ";
|
||||
ss << "(" << (inPlace ? "IP" : "OP") << "," << (managedMem ? "MM" : "GM") << ") ";
|
||||
ss << ncclDataTypeNames[dataType] << " ";
|
||||
if (CollectiveArgs::UsesReduce(funcType)) ss << ncclRedOpNames[redOp] << " ";
|
||||
if (CollectiveArgs::UsesRoot(funcType)) ss << "Root " << root << " ";
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
void TestBed::RunSimpleSweep(std::vector<ncclFunc_t> const& funcTypes,
|
||||
std::vector<ncclDataType_t> const& tmpDataTypes,
|
||||
std::vector<ncclRedOp_t> const& tmpRedOps,
|
||||
std::vector<int> const& roots,
|
||||
std::vector<int> const& numElements,
|
||||
std::vector<bool> const& inPlaceList,
|
||||
std::vector<bool> const& managedMemList)
|
||||
{
|
||||
// Sort numElements in descending order to cut down on # of allocations
|
||||
std::vector<int> sortedN = numElements;
|
||||
std::sort(sortedN.rbegin(), sortedN.rend());
|
||||
|
||||
// Filter out any unsupported datatypes, in case only subset has been compiled for
|
||||
std::vector<ncclDataType_t> const& supportedDataTypes = this->GetAllSupportedDataTypes();
|
||||
std::vector<ncclDataType_t> dataTypes;
|
||||
for (auto dt : tmpDataTypes)
|
||||
{
|
||||
for (int i = 0; i < supportedDataTypes.size(); ++i)
|
||||
{
|
||||
if (supportedDataTypes[i] == dt)
|
||||
{
|
||||
dataTypes.push_back(dt);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Filter out any unsupported reduction ops, in case only subset has been compiled for
|
||||
std::vector<ncclRedOp_t> const& supportedOps = this->GetAllSupportedRedOps();
|
||||
std::vector<ncclRedOp_t> redOps;
|
||||
for (auto redop : tmpRedOps)
|
||||
{
|
||||
for (int i = 0; i < supportedOps.size(); ++i)
|
||||
{
|
||||
if (supportedOps[i] == redop)
|
||||
{
|
||||
redOps.push_back(redop);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool isCorrect = true;
|
||||
|
||||
// Sweep over the number of ranks
|
||||
for (int totalRanks = ev.minGpus; totalRanks <= ev.maxGpus && isCorrect; ++totalRanks)
|
||||
for (int isMultiProcess = 0; isMultiProcess <= 1 && isCorrect; ++isMultiProcess)
|
||||
{
|
||||
if (!(ev.processMask & (1 << isMultiProcess))) continue;
|
||||
|
||||
// Test either single process all GPUs, or 1 process per GPU
|
||||
int const numProcesses = isMultiProcess ? totalRanks : 1;
|
||||
this->InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks));
|
||||
|
||||
for (int ftIdx = 0; ftIdx < funcTypes.size() && isCorrect; ++ftIdx)
|
||||
for (int dtIdx = 0; dtIdx < dataTypes.size() && isCorrect; ++dtIdx)
|
||||
for (int rdIdx = 0; rdIdx < redOps.size() && isCorrect; ++rdIdx)
|
||||
for (int rtIdx = 0; rtIdx < roots.size() && isCorrect; ++rtIdx)
|
||||
for (int ipIdx = 0; ipIdx < inPlaceList.size() && isCorrect; ++ipIdx)
|
||||
for (int mmIdx = 0; mmIdx < managedMemList.size() && isCorrect; ++mmIdx)
|
||||
{
|
||||
if (ev.showNames)
|
||||
{
|
||||
std::string name = this->GetTestCaseName(totalRanks, isMultiProcess,
|
||||
funcTypes[ftIdx], dataTypes[dtIdx],
|
||||
redOps[rdIdx], roots[rtIdx],
|
||||
inPlaceList[ipIdx], managedMemList[mmIdx]);
|
||||
INFO("%s\n", name.c_str());
|
||||
}
|
||||
|
||||
for (int neIdx = 0; neIdx < numElements.size() && isCorrect; ++neIdx)
|
||||
{
|
||||
int numInputElements, numOutputElements;
|
||||
CollectiveArgs::GetNumElementsForFuncType(funcTypes[ftIdx],
|
||||
sortedN[neIdx],
|
||||
totalRanks,
|
||||
&numInputElements,
|
||||
&numOutputElements);
|
||||
|
||||
this->SetCollectiveArgs(funcTypes[ftIdx],
|
||||
dataTypes[dtIdx],
|
||||
redOps[rdIdx],
|
||||
roots[rtIdx],
|
||||
numInputElements,
|
||||
numOutputElements);
|
||||
|
||||
// Only allocate once for largest size
|
||||
if (neIdx == 0) this->AllocateMem(inPlaceList[ipIdx], managedMemList[mmIdx]);
|
||||
|
||||
// There are some cases when data does not need to be re-prepared
|
||||
// e.g. AllReduce subarray expected results are still valid
|
||||
bool canSkip = (neIdx != 0 && !inPlaceList[ipIdx] &&
|
||||
(funcTypes[ftIdx] == ncclCollBroadcast ||
|
||||
funcTypes[ftIdx] == ncclCollReduce ||
|
||||
funcTypes[ftIdx] == ncclCollAllReduce));
|
||||
if (!canSkip) this->PrepareData();
|
||||
|
||||
this->ExecuteCollectives();
|
||||
this->ValidateResults(isCorrect);
|
||||
if (!isCorrect)
|
||||
{
|
||||
std::string name = this->GetTestCaseName(totalRanks, isMultiProcess,
|
||||
funcTypes[ftIdx], dataTypes[dtIdx],
|
||||
redOps[rdIdx], roots[rtIdx],
|
||||
inPlaceList[ipIdx], managedMemList[mmIdx]);
|
||||
ERROR("Incorrect output for %s\n", name.c_str());
|
||||
}
|
||||
}
|
||||
this->DeallocateMem();
|
||||
}
|
||||
this->DestroyComms();
|
||||
}
|
||||
}
|
||||
|
||||
int& TestBed::NumTestsRun()
|
||||
{
|
||||
static int numTestsRun = 0;
|
||||
return numTestsRun;
|
||||
}
|
||||
}
|
||||
|
||||
#undef PIPE_WRITE
|
||||
#undef PIPE_CHECK
|
||||
@@ -0,0 +1,129 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#pragma once
|
||||
#include <map>
|
||||
#include "CollectiveArgs.hpp"
|
||||
#include "TestBedChild.hpp"
|
||||
#include "EnvVars.hpp"
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
// This class facilitates testing RCCL collectives across various process / device configurations
|
||||
//
|
||||
class TestBed
|
||||
{
|
||||
public:
|
||||
int numDevicesAvailable; // # of devices detected on node
|
||||
std::vector<TestBedChild*> childList; // List of child processes
|
||||
std::vector<int> rankToChildMap; // Tracks which child process each rank is assigned to
|
||||
std::vector<int> rankToDeviceMap; // Tracks which device each rank is assigned to
|
||||
int numActiveChildren; // List of active children (with usable RCCL comms)
|
||||
int numActiveRanks; // Current # of ranks in use
|
||||
int numCollectivesInGroup; // # of collectives to execute per group call
|
||||
|
||||
EnvVars ev; // Environment variables
|
||||
|
||||
// Constructor - Creates one child process per detected GPU device that waits for further commands
|
||||
TestBed();
|
||||
|
||||
// Prepare TestBed for use with GPUs across multiple child processes
|
||||
void InitComms(std::vector<std::vector<int>> const& deviceIdsPerChild, int const numCollectivesInGroup = 1);
|
||||
// Prepare TestBed for use with GPUs on a single child process
|
||||
void InitComms(int const numGpus, int const numCollectivesInGroup = 1);
|
||||
|
||||
// Set collectives arguments for specified collective / rank
|
||||
// Setting scalarsPerRank to non-null will create custom reduction operator
|
||||
// Using collId = -1 (default) applies settings to all collectives in group
|
||||
// Using rank = -1 (default) applies settings to all ranks
|
||||
|
||||
void SetCollectiveArgs(ncclFunc_t const funcType,
|
||||
ncclDataType_t const dataType,
|
||||
ncclRedOp_t const redOp,
|
||||
int const root,
|
||||
size_t const numInputElements,
|
||||
size_t const numOutputElements,
|
||||
int const collId = -1,
|
||||
int const rank = -1,
|
||||
PtrUnion const scalarsPerRank = {nullptr},
|
||||
int const scalarMode = -1);
|
||||
|
||||
// Allocate memory for specified collective / rank
|
||||
// - Requires SetCollectiveArgs to have been called already
|
||||
// Using collId = -1 (default) applies settings to all collectives in group
|
||||
// Using rank = -1 (default) applies settings to all ranks
|
||||
void AllocateMem(bool const inPlace = false,
|
||||
bool const useManagedMem = false,
|
||||
int const collId = -1,
|
||||
int const rank = -1);
|
||||
|
||||
// Initialize input and compute expected results
|
||||
// - requires that SetCollectiveArgs and AllocateMemory have already been called
|
||||
// Setting collId to -1 applies settings to all collectives in group
|
||||
// Setting rank to -1 applies settings to all ranks
|
||||
// Setting prepDataFunc to nullptr uses the default fill pattern routine
|
||||
void PrepareData(int const collId = -1,
|
||||
int const rank = -1,
|
||||
CollFuncPtr const prepDataFunc = nullptr);
|
||||
|
||||
// Execute all collectives on all test children
|
||||
// Blocks until collective is completed
|
||||
void ExecuteCollectives();
|
||||
|
||||
// Perform results validation - compare output to expected
|
||||
void ValidateResults(bool& isCorrect, int collId = -1, int const rank = -1);
|
||||
|
||||
// Release allocated memory
|
||||
void DeallocateMem(int collId = -1, int const rank = -1);
|
||||
|
||||
// Release the RCCL comms
|
||||
void DestroyComms();
|
||||
|
||||
// Explicit TestBed destructor that releases all child processes
|
||||
// No further calls to TestBed should be performed after this call
|
||||
void Finalize();
|
||||
|
||||
// Destructor - Calls Finalize() to release all child processes
|
||||
~TestBed();
|
||||
|
||||
// Returns all the supported reduction operations based on build settings
|
||||
std::vector<ncclRedOp_t> const& GetAllSupportedRedOps();
|
||||
|
||||
// Return all the supported data types based on build settings
|
||||
std::vector<ncclDataType_t> const& GetAllSupportedDataTypes();
|
||||
|
||||
// Helper function that splits up GPUs to the given number of processes
|
||||
static std::vector<std::vector<int>> GetDeviceIdsList(int const numProcesses,
|
||||
int const numGpus);
|
||||
|
||||
// Generate a test case name
|
||||
static std::string GetTestCaseName(int const totalRanks,
|
||||
bool const isMultiProcess,
|
||||
ncclFunc_t const funcType,
|
||||
ncclDataType_t const dataType,
|
||||
ncclRedOp_t const redOp,
|
||||
int const root,
|
||||
bool const inPlace,
|
||||
bool const managedMem);
|
||||
|
||||
// Run a simple sweep
|
||||
void RunSimpleSweep(std::vector<ncclFunc_t> const& funcTypes,
|
||||
std::vector<ncclDataType_t> const& dataTypes,
|
||||
std::vector<ncclRedOp_t> const& redOps,
|
||||
std::vector<int> const& roots,
|
||||
std::vector<int> const& numElements,
|
||||
std::vector<bool> const& inPlaceList,
|
||||
std::vector<bool> const& managedMemList);
|
||||
|
||||
// Used to track total number of calls to ExecuteCollectives()
|
||||
static int& NumTestsRun();
|
||||
|
||||
protected:
|
||||
// Ends the specified child process
|
||||
void StopChild(int const childId);
|
||||
};
|
||||
}
|
||||
@@ -0,0 +1,589 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "TestBedChild.hpp"
|
||||
#include <thread>
|
||||
|
||||
#define CHILD_NCCL_CALL(cmd, msg) \
|
||||
{ \
|
||||
if (this->verbose) printf("[ NCCL CALL] " #cmd "\n"); \
|
||||
ncclResult_t status = cmd; \
|
||||
if (status != ncclSuccess) \
|
||||
{ \
|
||||
ERROR("Child process %d fails NCCL call %s with code %d\n", this->childId, msg, status); \
|
||||
return TEST_FAIL; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define PIPE_READ(val) \
|
||||
if (read(childReadFd, &val, sizeof(val)) != sizeof(val)) return TEST_FAIL;
|
||||
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
TestBedChild::TestBedChild(int const childId, bool const verbose, int const printValues)
|
||||
{
|
||||
this->childId = childId;
|
||||
this->verbose = verbose;
|
||||
this->printValues = printValues;
|
||||
}
|
||||
|
||||
int TestBedChild::InitPipes()
|
||||
{
|
||||
// Prepare parent->child pipe
|
||||
int pipefd[2];
|
||||
if (pipe(pipefd) == -1)
|
||||
{
|
||||
ERROR("Unable to create parent->child pipe for child %d\n", this->childId);
|
||||
return TEST_FAIL;
|
||||
}
|
||||
this->childReadFd = pipefd[0];
|
||||
this->parentWriteFd = pipefd[1];
|
||||
|
||||
// Prepare child->parent pipe
|
||||
this->parentReadFd = -1;
|
||||
if (pipe(pipefd) == -1)
|
||||
{
|
||||
ERROR("Unable to create parent->child pipe for child %d\n", this->childId);
|
||||
return TEST_FAIL;
|
||||
}
|
||||
this->parentReadFd = pipefd[0];
|
||||
this->childWriteFd = pipefd[1];
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
void TestBedChild::StartExecutionLoop()
|
||||
{
|
||||
// Close unused ends of pipes
|
||||
close(this->parentWriteFd);
|
||||
close(this->parentReadFd);
|
||||
|
||||
// Wait for commands from parent process
|
||||
if (verbose) INFO("Child %d enters execution loop\n", this->childId);
|
||||
int command;
|
||||
while (read(childReadFd, &command, sizeof(command)) > 0)
|
||||
{
|
||||
if (verbose) INFO("Child %d received command [%s]:\n", this->childId, ChildCommandNames[command]);;
|
||||
ErrCode status = TEST_SUCCESS;
|
||||
switch(command)
|
||||
{
|
||||
case CHILD_INIT_COMMS : status = InitComms(); break;
|
||||
case CHILD_SET_COLL_ARGS : status = SetCollectiveArgs(); break;
|
||||
case CHILD_ALLOCATE_MEM : status = AllocateMem(); break;
|
||||
case CHILD_PREPARE_DATA : status = PrepareData(); break;
|
||||
case CHILD_EXECUTE_COLL : status = ExecuteCollectives(); break;
|
||||
case CHILD_VALIDATE_RESULTS: status = ValidateResults(); break;
|
||||
case CHILD_DEALLOCATE_MEM : status = DeallocateMem(); break;
|
||||
case CHILD_DESTROY_COMMS : status = DestroyComms(); break;
|
||||
case CHILD_STOP : status = Stop(); break;
|
||||
default: exit(0);
|
||||
}
|
||||
|
||||
// Send back acknowledgement to parent
|
||||
if (status == TEST_FAIL)
|
||||
ERROR("Child %d failed on command [%s]:\n", this->childId, ChildCommandNames[command]);
|
||||
write(childWriteFd, &status, sizeof(status));
|
||||
}
|
||||
|
||||
// Close child ends of pipe
|
||||
close(this->childReadFd);
|
||||
close(this->childWriteFd);
|
||||
|
||||
exit(0);
|
||||
}
|
||||
|
||||
ErrCode TestBedChild::InitComms()
|
||||
{
|
||||
if (this->verbose) INFO("Child %d begins InitComms()\n", this->childId);
|
||||
|
||||
// Read values sent by parent [see TestBed::InitComms()]
|
||||
PIPE_READ(this->totalRanks);
|
||||
PIPE_READ(this->rankOffset);
|
||||
PIPE_READ(this->numCollectivesInGroup);
|
||||
|
||||
// Read the GPUs this child uses and prepare storage for collective args / datasets
|
||||
int numGpus;
|
||||
PIPE_READ(numGpus);
|
||||
this->deviceIds.resize(numGpus);
|
||||
this->streams.resize(numGpus);
|
||||
this->collArgs.resize(numGpus);
|
||||
for (int i = 0; i < numGpus; i++)
|
||||
{
|
||||
PIPE_READ(this->deviceIds[i]);
|
||||
this->collArgs[i].clear();
|
||||
this->collArgs[i].resize(numCollectivesInGroup);
|
||||
}
|
||||
|
||||
// Collect uniqueId (specified by NCCL_COMM_ID env var)
|
||||
ncclUniqueId id;
|
||||
CHILD_NCCL_CALL(ncclGetUniqueId(&id), "ncclGetUniqueId");
|
||||
|
||||
// Initialize communicators
|
||||
comms.clear();
|
||||
comms.resize(numGpus);
|
||||
|
||||
// Initialize within a group call to avoid deadlock when using multiple ranks per child
|
||||
ErrCode status = TEST_SUCCESS;
|
||||
CHILD_NCCL_CALL(ncclGroupStart(), "ncclGroupStart");
|
||||
for (int localRank = 0; localRank < numGpus; ++localRank)
|
||||
{
|
||||
int const globalRank = this->rankOffset + localRank;
|
||||
int const currGpu = this->deviceIds[localRank];
|
||||
|
||||
if (hipSetDevice(currGpu) != hipSuccess)
|
||||
{
|
||||
ERROR("Rank %d on child %d unable to switch to GPU %d\n", globalRank, this->childId, currGpu);
|
||||
status = TEST_FAIL;
|
||||
break;
|
||||
}
|
||||
|
||||
if (hipStreamCreate(&this->streams[localRank]) != hipSuccess)
|
||||
{
|
||||
ERROR("Rank %d on child %d unable to create stream for GPU %d\n", globalRank, this->childId, currGpu);
|
||||
status = TEST_FAIL;
|
||||
break;
|
||||
}
|
||||
|
||||
if (ncclCommInitRank(&this->comms[localRank], this->totalRanks, id, globalRank) != ncclSuccess)
|
||||
{
|
||||
ERROR("Rank %d on child %d unable to call ncclCommInitRank\n", globalRank, this->childId);
|
||||
status = TEST_FAIL;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (status == TEST_SUCCESS)
|
||||
{
|
||||
CHILD_NCCL_CALL(ncclGroupEnd(), "ncclGroupStart");
|
||||
}
|
||||
if (this->verbose) INFO("Child %d finishes InitComms() [%s]\n",
|
||||
this->childId, status == TEST_SUCCESS ? "SUCCESS" : "FAIL");
|
||||
return status;
|
||||
}
|
||||
|
||||
ErrCode TestBedChild::SetCollectiveArgs()
|
||||
{
|
||||
if (this->verbose) INFO("Child %d begins SetCollectiveArgs()\n", this->childId);
|
||||
|
||||
// Read values sent by parent [see TestBed::SetCollectiveArgs()]
|
||||
int globalRank;
|
||||
int collId;
|
||||
ncclFunc_t funcType;
|
||||
ncclDataType_t dataType;
|
||||
ncclRedOp_t redOp;
|
||||
int root;
|
||||
size_t numInputElements;
|
||||
size_t numOutputElements;
|
||||
ScalarTransport scalarTransport;
|
||||
int scalarMode;
|
||||
|
||||
PIPE_READ(globalRank);
|
||||
PIPE_READ(collId);
|
||||
PIPE_READ(funcType);
|
||||
PIPE_READ(dataType);
|
||||
PIPE_READ(redOp);
|
||||
PIPE_READ(root);
|
||||
PIPE_READ(numInputElements);
|
||||
PIPE_READ(numOutputElements);
|
||||
PIPE_READ(scalarMode);
|
||||
PIPE_READ(scalarTransport);
|
||||
|
||||
for (int i = 0; i < this->totalRanks; i++)
|
||||
{
|
||||
PtrUnion scalarsPerRank;
|
||||
scalarsPerRank.Attach(scalarTransport.ptr);
|
||||
}
|
||||
|
||||
if (globalRank < this->rankOffset || (this->rankOffset + comms.size() <= globalRank))
|
||||
{
|
||||
ERROR("Child %d does not contain rank %d\n", this->childId, globalRank);
|
||||
return TEST_FAIL;
|
||||
}
|
||||
int const localRank = globalRank - rankOffset;
|
||||
CHECK_HIP(hipSetDevice(this->deviceIds[localRank]));
|
||||
|
||||
for (int collIdx = 0; collIdx < collArgs[localRank].size(); ++collIdx)
|
||||
{
|
||||
if (collId == -1 || collId == collIdx)
|
||||
{
|
||||
CollectiveArgs& collArg = this->collArgs[localRank][collIdx];
|
||||
CHECK_CALL(collArg.SetArgs(globalRank, this->totalRanks,
|
||||
this->deviceIds[localRank],
|
||||
funcType, dataType, redOp, root,
|
||||
numInputElements, numOutputElements,
|
||||
scalarTransport, scalarMode));
|
||||
if (this->verbose) INFO("Rank %d on child %d sets collective %d [%s]\n",
|
||||
globalRank, this->childId, collIdx,
|
||||
collArg.GetDescription().c_str());
|
||||
|
||||
// If pre-mult scalars are provided, then create a custom reduction operator
|
||||
if (scalarMode >= 0)
|
||||
{
|
||||
CHILD_NCCL_CALL(ncclRedOpCreatePreMulSum(&collArg.redOp,
|
||||
collArg.localScalar.ptr,
|
||||
dataType,
|
||||
(ncclScalarResidence_t)scalarMode,
|
||||
this->comms[localRank]),
|
||||
"ncclRedOpCreatePreMulSum");
|
||||
if (verbose) INFO("Child %d created custom redop %d for collective %d\n",
|
||||
this->childId, collArg.redOp, collIdx);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (this->verbose) INFO("Child %d finishes SetCollectiveArgs()\n", this->childId);
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
ErrCode TestBedChild::AllocateMem()
|
||||
{
|
||||
if (this->verbose) INFO("Child %d begins AllocateMem()\n", this->childId);
|
||||
|
||||
// Read values sent by parent [see TestBed::AllocateMem()]
|
||||
int globalRank;
|
||||
int collId;
|
||||
bool inPlace;
|
||||
bool useManagedMem;
|
||||
|
||||
PIPE_READ(globalRank);
|
||||
PIPE_READ(collId);
|
||||
PIPE_READ(inPlace);
|
||||
PIPE_READ(useManagedMem);
|
||||
|
||||
if (globalRank < this->rankOffset || (this->rankOffset + comms.size() <= globalRank))
|
||||
{
|
||||
ERROR("Child %d does not contain rank %d\n", this->childId, globalRank);
|
||||
return TEST_FAIL;
|
||||
}
|
||||
int const localRank = globalRank - rankOffset;
|
||||
CHECK_HIP(hipSetDevice(this->deviceIds[localRank]));
|
||||
|
||||
for (int collIdx = 0; collIdx < collArgs[localRank].size(); ++collIdx)
|
||||
{
|
||||
if (collId == -1 || collId == collIdx)
|
||||
{
|
||||
CollectiveArgs& collArg = this->collArgs[localRank][collIdx];
|
||||
CHECK_CALL(collArg.AllocateMem(inPlace, useManagedMem));
|
||||
if (this->verbose) INFO("Rank %d on child %d allocates memory for collective %d on device %d (%s,%s) Input: %p Output %p\n",
|
||||
globalRank, this->childId, collIdx, this->deviceIds[localRank],
|
||||
inPlace ? "in-place" : "out-of-place",
|
||||
useManagedMem ? "managed" : "unmanaged",
|
||||
collArg.inputGpu.ptr,
|
||||
collArg.outputGpu.ptr);
|
||||
}
|
||||
}
|
||||
|
||||
if (this->verbose) INFO("Child %d finishes AllocateMem()\n", this->childId);
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
// Fill input memory with pre-known patterned based on rank
|
||||
ErrCode TestBedChild::PrepareData()
|
||||
{
|
||||
if (this->verbose) INFO("Child %d begins PrepareData()\n", this->childId);
|
||||
|
||||
// Read values sent by parent [see TestBed::PrepareData()]
|
||||
int globalRank;
|
||||
int collId;
|
||||
CollFuncPtr prepDataFunc;
|
||||
|
||||
PIPE_READ(globalRank);
|
||||
PIPE_READ(collId);
|
||||
PIPE_READ(prepDataFunc);
|
||||
|
||||
if (globalRank < this->rankOffset || (this->rankOffset + comms.size() <= globalRank))
|
||||
{
|
||||
ERROR("Child %d does not contain rank %d\n", this->childId, globalRank);
|
||||
return TEST_FAIL;
|
||||
}
|
||||
|
||||
int const localRank = globalRank - rankOffset;
|
||||
CHECK_HIP(hipSetDevice(this->deviceIds[localRank]));
|
||||
|
||||
for (int collIdx = 0; collIdx < collArgs[localRank].size(); ++collIdx)
|
||||
{
|
||||
if (collId == -1 || collId == collIdx)
|
||||
{
|
||||
if (this->verbose) INFO("Rank %d on child %d prepares data for collective %d\n",
|
||||
globalRank, this->childId, collIdx);
|
||||
CHECK_CALL(this->collArgs[localRank][collIdx].PrepareData(prepDataFunc));
|
||||
}
|
||||
}
|
||||
if (this->verbose) INFO("Child %d finishes PrepareData()\n", this->childId);
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
ErrCode TestBedChild::ExecuteCollectives()
|
||||
{
|
||||
if (this->verbose) INFO("Child %d begins ExecuteCollectives()\n", this->childId);
|
||||
|
||||
// Start group call
|
||||
CHILD_NCCL_CALL(ncclGroupStart(), "ncclGroupStart");
|
||||
|
||||
// Loop over all collectives to be executed in group call
|
||||
for (int collId = 0; collId < this->numCollectivesInGroup; ++collId)
|
||||
{
|
||||
// Loop over all local ranks
|
||||
for (int localRank = 0; localRank < this->deviceIds.size(); ++localRank)
|
||||
{
|
||||
CHECK_HIP(hipSetDevice(this->deviceIds[localRank]));
|
||||
|
||||
CollectiveArgs const& collArg = this->collArgs[localRank][collId];
|
||||
|
||||
if (this->printValues)
|
||||
{
|
||||
int const numInputElementsToPrint = (this->printValues < 0 ? collArg.numInputElements : this->printValues);
|
||||
PtrUnion inputCpu;
|
||||
size_t const numInputBytes = numInputElementsToPrint * DataTypeToBytes(collArg.dataType);
|
||||
inputCpu.AllocateCpuMem(numInputBytes);
|
||||
CHECK_HIP(hipMemcpy(inputCpu.ptr, collArg.inputGpu.ptr, numInputBytes, hipMemcpyDeviceToHost));
|
||||
printf("[ DEBUG ] Rank %02d Coll %d %-10s: %s\n", collArg.globalRank, collId, "Input",
|
||||
inputCpu.ToString(collArg.dataType, numInputElementsToPrint).c_str());
|
||||
inputCpu.FreeCpuMem();
|
||||
|
||||
int const numOutputElementsToPrint = (this->printValues < 0 ? collArg.numOutputElements : this->printValues);
|
||||
size_t const numOutputBytes = numOutputElementsToPrint * DataTypeToBytes(collArg.dataType);
|
||||
CHECK_HIP(hipMemcpy(collArg.outputCpu.ptr, collArg.outputGpu.ptr, numOutputBytes, hipMemcpyDeviceToHost));
|
||||
printf("[ DEBUG ] Rank %02d Coll %d %-10s: %s\n", collArg.globalRank, collId, "Pre-Output",
|
||||
collArg.outputCpu.ToString(collArg.dataType, numOutputElementsToPrint).c_str());
|
||||
}
|
||||
|
||||
switch (collArg.funcType)
|
||||
{
|
||||
case ncclCollBroadcast:
|
||||
CHILD_NCCL_CALL(ncclBroadcast(collArg.inputGpu.ptr,
|
||||
collArg.outputGpu.ptr,
|
||||
collArg.numInputElements,
|
||||
collArg.dataType,
|
||||
collArg.root,
|
||||
this->comms[localRank],
|
||||
this->streams[localRank]),
|
||||
"ncclBroadcast");
|
||||
break;
|
||||
case ncclCollReduce:
|
||||
CHILD_NCCL_CALL(ncclReduce(collArg.inputGpu.ptr,
|
||||
collArg.outputGpu.ptr,
|
||||
collArg.numInputElements,
|
||||
collArg.dataType,
|
||||
collArg.redOp,
|
||||
collArg.root,
|
||||
this->comms[localRank],
|
||||
this->streams[localRank]),
|
||||
"ncclReduce");
|
||||
break;
|
||||
case ncclCollAllGather:
|
||||
CHILD_NCCL_CALL(ncclAllGather(collArg.inputGpu.ptr,
|
||||
collArg.outputGpu.ptr,
|
||||
collArg.numInputElements,
|
||||
collArg.dataType,
|
||||
this->comms[localRank],
|
||||
this->streams[localRank]),
|
||||
"ncclAllGather");
|
||||
break;
|
||||
case ncclCollReduceScatter:
|
||||
CHILD_NCCL_CALL(ncclReduceScatter(collArg.inputGpu.ptr,
|
||||
collArg.outputGpu.ptr,
|
||||
collArg.numOutputElements,
|
||||
collArg.dataType,
|
||||
collArg.redOp,
|
||||
this->comms[localRank],
|
||||
this->streams[localRank]),
|
||||
"ncclReduceScatter");
|
||||
break;
|
||||
case ncclCollAllReduce:
|
||||
CHILD_NCCL_CALL(ncclAllReduce(collArg.inputGpu.ptr,
|
||||
collArg.outputGpu.ptr,
|
||||
collArg.numInputElements,
|
||||
collArg.dataType,
|
||||
collArg.redOp,
|
||||
this->comms[localRank],
|
||||
this->streams[localRank]),
|
||||
"ncclAllReduce");
|
||||
break;
|
||||
case ncclCollGather:
|
||||
CHILD_NCCL_CALL(ncclGather(collArg.inputGpu.ptr,
|
||||
collArg.outputGpu.ptr,
|
||||
collArg.numInputElements,
|
||||
collArg.dataType,
|
||||
collArg.root,
|
||||
this->comms[localRank],
|
||||
this->streams[localRank]),
|
||||
"ncclGather");
|
||||
break;
|
||||
case ncclCollScatter:
|
||||
CHILD_NCCL_CALL(ncclScatter(collArg.inputGpu.ptr,
|
||||
collArg.outputGpu.ptr,
|
||||
collArg.numOutputElements,
|
||||
collArg.dataType,
|
||||
collArg.root,
|
||||
this->comms[localRank],
|
||||
this->streams[localRank]),
|
||||
"ncclScatter");
|
||||
break;
|
||||
case ncclCollAllToAll:
|
||||
CHILD_NCCL_CALL(ncclAllToAll(collArg.inputGpu.ptr,
|
||||
collArg.outputGpu.ptr,
|
||||
collArg.numInputElements / collArg.totalRanks,
|
||||
collArg.dataType,
|
||||
this->comms[localRank],
|
||||
this->streams[localRank]),
|
||||
"ncclAllToAll");
|
||||
break;
|
||||
case ncclCollSend:
|
||||
CHILD_NCCL_CALL(ncclSend(collArg.inputGpu.ptr,
|
||||
collArg.numInputElements,
|
||||
collArg.dataType,
|
||||
collArg.root,
|
||||
this->comms[localRank],
|
||||
this->streams[localRank]),
|
||||
"ncclSend");
|
||||
break;
|
||||
case ncclCollRecv:
|
||||
CHILD_NCCL_CALL(ncclRecv(collArg.outputGpu.ptr,
|
||||
collArg.numOutputElements,
|
||||
collArg.dataType,
|
||||
collArg.root,
|
||||
this->comms[localRank],
|
||||
this->streams[localRank]),
|
||||
"ncclRecv");
|
||||
break;
|
||||
default:
|
||||
ERROR("Unknown func type %d\n", collArg.funcType);
|
||||
return TEST_FAIL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// End group call
|
||||
CHILD_NCCL_CALL(ncclGroupEnd(), "ncclGroupEnd");
|
||||
|
||||
// Synchronize
|
||||
if (this->verbose) INFO("Child %d submits group call. Waiting for completion\n", this->childId);
|
||||
for (int localRank = 0; localRank < this->streams.size(); ++localRank)
|
||||
{
|
||||
CHECK_HIP(hipStreamSynchronize(this->streams[localRank]));
|
||||
}
|
||||
|
||||
if (this->printValues)
|
||||
{
|
||||
for (int collId = 0; collId < this->numCollectivesInGroup; ++collId)
|
||||
for (int localRank = 0; localRank < this->deviceIds.size(); ++localRank)
|
||||
{
|
||||
CollectiveArgs const& collArg = this->collArgs[localRank][collId];
|
||||
|
||||
int numOutputElementsToPrint = (this->printValues < 0 ? collArg.numOutputElements : this->printValues);
|
||||
size_t const numOutputBytes = numOutputElementsToPrint * DataTypeToBytes(collArg.dataType);
|
||||
CHECK_HIP(hipMemcpy(collArg.outputCpu.ptr, collArg.outputGpu.ptr, numOutputBytes, hipMemcpyDeviceToHost));
|
||||
printf("[ DEBUG ] Rank %02d Coll %d %-10s: %s\n", collArg.globalRank, collId, "Output",
|
||||
collArg.outputCpu.ToString(collArg.dataType, numOutputElementsToPrint).c_str());
|
||||
|
||||
printf("[ DEBUG ] Rank %02d Coll %d %-10s: %s\n", collArg.globalRank, collId, "Expected",
|
||||
collArg.expected.ToString(collArg.dataType, numOutputElementsToPrint).c_str());
|
||||
}
|
||||
}
|
||||
if (this->verbose) INFO("Child %d finishes ExecuteCollectives()\n", this->childId);
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
ErrCode TestBedChild::ValidateResults()
|
||||
{
|
||||
// Read values sent by parent [see TestBed::ValidateResults()]
|
||||
int globalRank, collId;
|
||||
PIPE_READ(globalRank);
|
||||
PIPE_READ(collId);
|
||||
|
||||
if (this->verbose) INFO("Child %d begins ValidateResults()\n", this->childId);
|
||||
|
||||
if (globalRank < this->rankOffset || (this->rankOffset + comms.size() <= globalRank))
|
||||
{
|
||||
ERROR("Child %d does not contain rank %d\n", this->childId, globalRank);
|
||||
return TEST_FAIL;
|
||||
}
|
||||
int const localRank = globalRank - rankOffset;
|
||||
CHECK_HIP(hipSetDevice(this->deviceIds[localRank]));
|
||||
|
||||
ErrCode status = TEST_SUCCESS;
|
||||
for (int collIdx = 0; collIdx < collArgs[localRank].size(); ++collIdx)
|
||||
{
|
||||
if (collId == -1 || collId == collIdx)
|
||||
{
|
||||
if (this->verbose) INFO("Rank %d on child %d validating collective %d results\n",
|
||||
globalRank, this->childId, collIdx);
|
||||
if (this->collArgs[localRank][collIdx].ValidateResults() != TEST_SUCCESS)
|
||||
{
|
||||
ERROR("Rank %d Collective %d output does not match expected\n", globalRank, collIdx);
|
||||
status = TEST_FAIL;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (this->verbose) INFO("Child %d finishes ValidateResults() with status %s\n", this->childId,
|
||||
status == TEST_SUCCESS ? "SUCCESS" : "FAIL");
|
||||
return status;
|
||||
}
|
||||
|
||||
ErrCode TestBedChild::DeallocateMem()
|
||||
{
|
||||
if (this->verbose) INFO("Child %d begins DeallocateMem\n", this->childId);
|
||||
|
||||
// Read values sent by parent [see TestBed::DeallocateMem()]
|
||||
int globalRank, collId;
|
||||
PIPE_READ(globalRank);
|
||||
PIPE_READ(collId);
|
||||
|
||||
if (globalRank < this->rankOffset || (this->rankOffset + comms.size() <= globalRank))
|
||||
{
|
||||
ERROR("Child %d does not contain rank %d\n", this->childId, globalRank);
|
||||
return TEST_FAIL;
|
||||
}
|
||||
int const localRank = globalRank - rankOffset;
|
||||
CHECK_HIP(hipSetDevice(this->deviceIds[localRank]));
|
||||
|
||||
for (int collIdx = 0; collIdx < collArgs[localRank].size(); ++collIdx)
|
||||
{
|
||||
CollectiveArgs& collArg = this->collArgs[localRank][collIdx];
|
||||
if (collId == -1 || collId == collIdx)
|
||||
{
|
||||
if (this->verbose)
|
||||
{
|
||||
INFO("Child %d release memory for collective %d (Input: %p Output %p\n",
|
||||
this->childId, collIdx, collArg.inputGpu.ptr, collArg.outputGpu.ptr);
|
||||
}
|
||||
|
||||
CHECK_CALL(collArg.DeallocateMem());
|
||||
}
|
||||
if (collArg.scalarMode != -1)
|
||||
{
|
||||
CHILD_NCCL_CALL(ncclRedOpDestroy(collArg.redOp, this->comms[localRank]),
|
||||
"ncclRedOpDestroy");
|
||||
if (verbose) INFO("Child %d destroys custom redop %d for collective %d\n",
|
||||
this->childId, collArg.redOp, collIdx);
|
||||
}
|
||||
}
|
||||
if (this->verbose) INFO("Child %d finishes DeallocateMem\n", this->childId);
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
ErrCode TestBedChild::DestroyComms()
|
||||
{
|
||||
if (this->verbose) INFO("Child %d begins DestroyComms\n", this->childId);
|
||||
|
||||
// Release comms
|
||||
for (int i = 0; i < this->comms.size(); ++i)
|
||||
{
|
||||
CHILD_NCCL_CALL(ncclCommDestroy(this->comms[i]), "ncclCommDestroy");
|
||||
}
|
||||
for (int i = 0; i < this->streams.size(); ++i)
|
||||
{
|
||||
CHECK_HIP(hipStreamDestroy(this->streams[i]));
|
||||
}
|
||||
this->comms.clear();
|
||||
this->streams.clear();
|
||||
if (this->verbose) INFO("Child %d finishes DestroyComms\n", this->childId);
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
|
||||
ErrCode TestBedChild::Stop()
|
||||
{
|
||||
return TEST_SUCCESS;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,106 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
#include <unistd.h>
|
||||
#include "CollectiveArgs.hpp"
|
||||
#include "rccl.h"
|
||||
|
||||
#define MAX_RANKS 32
|
||||
namespace RcclUnitTesting
|
||||
{
|
||||
class TestBedChild
|
||||
{
|
||||
public:
|
||||
// These are commands that can be given to the child process
|
||||
enum
|
||||
{
|
||||
CHILD_INIT_COMMS = 0, // InitComms()
|
||||
CHILD_SET_COLL_ARGS = 1, // SetCollectiveArgs()
|
||||
CHILD_ALLOCATE_MEM = 2, // AllocateMem()
|
||||
CHILD_PREPARE_DATA = 3, // PrepareData()
|
||||
CHILD_EXECUTE_COLL = 4, // ExecuteCollectives()
|
||||
CHILD_VALIDATE_RESULTS = 5, // ValidateResults()
|
||||
CHILD_DEALLOCATE_MEM = 6, // DeallocateMem()
|
||||
CHILD_DESTROY_COMMS = 7, // DestroyComms()
|
||||
CHILD_STOP = 8, // Stop()
|
||||
NUM_CHILD_COMMANDS = 9
|
||||
};
|
||||
|
||||
char const ChildCommandNames[NUM_CHILD_COMMANDS][20] =
|
||||
{
|
||||
"INIT_COMMS",
|
||||
"SET_COLL_ARGS",
|
||||
"ALLOCATE_MEM",
|
||||
"PREPARE_DATA",
|
||||
"EXECUTE_COLL",
|
||||
"VALIDATE_RESULTS",
|
||||
"DEALLOCATE_MEM",
|
||||
"DESTROY_COMMS",
|
||||
"STOP"
|
||||
};
|
||||
|
||||
// These variables remain constant for life of TestBedChild
|
||||
int childId;
|
||||
pid_t pid;
|
||||
bool verbose;
|
||||
int printValues;
|
||||
|
||||
// Pipes used to communicate between parent process
|
||||
int parentWriteFd;
|
||||
int parentReadFd;
|
||||
int childWriteFd;
|
||||
int childReadFd;
|
||||
|
||||
// These varibles may change based on commands issued by parent
|
||||
int totalRanks; // Total ranks
|
||||
int rankOffset; // Global rank offset for this child
|
||||
int numCollectivesInGroup; // # of collectives to run per group call
|
||||
std::vector<ncclComm_t> comms; // RCCL communicators for each rank
|
||||
std::vector<int> deviceIds; // Device IDs for each rank
|
||||
std::vector<hipStream_t> streams; // Streams for executing collectives
|
||||
std::vector<std::vector<CollectiveArgs>> collArgs; // Info for each collective for each rank
|
||||
|
||||
// Constructor
|
||||
TestBedChild(int const childId, bool const verbose, int const printValues);
|
||||
|
||||
// Prepare parent/child communication pipes - to be executed by parent process
|
||||
int InitPipes();
|
||||
|
||||
// Execution
|
||||
void StartExecutionLoop();
|
||||
|
||||
protected:
|
||||
// Initialize RCCL communicators
|
||||
ErrCode InitComms();
|
||||
|
||||
// Set CollectiveArgs
|
||||
ErrCode SetCollectiveArgs();
|
||||
|
||||
// Allocate memory (input (GPU) / output (GPU) / expected (CPU))
|
||||
ErrCode AllocateMem();
|
||||
|
||||
// Prepare input and expected data
|
||||
ErrCode PrepareData();
|
||||
|
||||
// Execute a group of collectives
|
||||
ErrCode ExecuteCollectives();
|
||||
|
||||
// Validate that output matches expected
|
||||
ErrCode ValidateResults();
|
||||
|
||||
// Release allocated memory
|
||||
ErrCode DeallocateMem();
|
||||
|
||||
// Destroys RCCL communicators
|
||||
ErrCode DestroyComms();
|
||||
|
||||
// Stops this child process
|
||||
ErrCode Stop();
|
||||
};
|
||||
}
|
||||
@@ -0,0 +1,11 @@
|
||||
#include <gtest/gtest.h>
|
||||
#include "EnvVars.hpp"
|
||||
#include "TestBed.hpp"
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
::testing::InitGoogleTest(&argc, argv);
|
||||
RcclUnitTesting::EnvVars::ShowConfig();
|
||||
int retCode = RUN_ALL_TESTS();
|
||||
printf("[ INFO ] Total executed cases: %d\n", RcclUnitTesting::TestBed::NumTestsRun());
|
||||
return retCode;
|
||||
}
|
||||
@@ -1,117 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "test_AllGather.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
TEST_P(AllGatherCorrectnessTest, Correctness)
|
||||
{
|
||||
// Adjust numElements to be multiple of numDevices
|
||||
numElements = (numElements/numDevices)*numDevices;
|
||||
if (numDevices > numDevicesAvailable) return;
|
||||
if (numElements % numDevices != 0) return;
|
||||
|
||||
// Prepare input / output / expected results
|
||||
Dataset dataset;
|
||||
dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllGather);
|
||||
FillDatasetWithPattern(dataset);
|
||||
ComputeExpectedResults(dataset);
|
||||
|
||||
size_t const byteCount = dataset.NumBytes() / dataset.numDevices;
|
||||
size_t const sendCount = dataset.numElements / dataset.numDevices;
|
||||
|
||||
// Launch the reduction (1 thread per GPU)
|
||||
ncclGroupStart();
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
ncclAllGather((int8_t *)dataset.inputs[i] + (i * byteCount),
|
||||
dataset.outputs[i], sendCount,
|
||||
dataType, comms[i], streams[i]);
|
||||
}
|
||||
ncclGroupEnd();
|
||||
|
||||
// Wait for reduction to complete
|
||||
Synchronize();
|
||||
|
||||
// Check results
|
||||
ValidateResults(dataset);
|
||||
dataset.Release();
|
||||
}
|
||||
|
||||
TEST_P(AllGatherCorrectnessTest, Alignment)
|
||||
{
|
||||
if (numDevices > numDevicesAvailable) return;
|
||||
if (numElements % numDevices != 0) return;
|
||||
|
||||
// Allocate dataset
|
||||
Dataset dataset;
|
||||
dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllGather);
|
||||
|
||||
// Loop over several offsets (so that device pointers are not aligned)
|
||||
for (int firstElement = 1; firstElement <= 11; firstElement += 2)
|
||||
{
|
||||
if (firstElement < numElements)
|
||||
{
|
||||
// Select last element so that total number of elements is multiple of numDevices
|
||||
int const lastElement = firstElement + ((numElements - firstElement) / numDevices) * numDevices - 1;
|
||||
if (lastElement >= numElements) break;
|
||||
|
||||
Dataset subDataset;
|
||||
dataset.ExtractSubDataset(firstElement, lastElement, subDataset);
|
||||
|
||||
// Compute reference results for sub-dataset
|
||||
FillDatasetWithPattern(subDataset);
|
||||
ComputeExpectedResults(subDataset);
|
||||
|
||||
size_t const byteCount = subDataset.NumBytes() / subDataset.numDevices;
|
||||
size_t const sendCount = subDataset.numElements / subDataset.numDevices;
|
||||
|
||||
// Launch the reduction (1 thread per GPU)
|
||||
ncclGroupStart();
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
ncclAllGather((int8_t *)subDataset.inputs[i] + (i * byteCount),
|
||||
subDataset.outputs[i], sendCount,
|
||||
dataType, comms[i], streams[i]);
|
||||
}
|
||||
ncclGroupEnd();
|
||||
|
||||
// Wait for reduction to complete
|
||||
Synchronize();
|
||||
|
||||
// Check results
|
||||
ValidateResults(subDataset);
|
||||
}
|
||||
}
|
||||
dataset.Release();
|
||||
}
|
||||
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(AllGatherCorrectnessSweep,
|
||||
AllGatherCorrectnessTest,
|
||||
testing::Combine(
|
||||
// Reduction operator (not used)
|
||||
testing::Values(ncclSum),
|
||||
// Data types
|
||||
testing::Values(ncclInt8,
|
||||
ncclUint8,
|
||||
ncclInt32,
|
||||
ncclUint32,
|
||||
ncclInt64,
|
||||
ncclUint64,
|
||||
//ncclFloat16,
|
||||
ncclFloat32,
|
||||
ncclFloat64,
|
||||
ncclBfloat16),
|
||||
// Number of elements
|
||||
testing::Values(2520, 3026520),
|
||||
// Number of devices
|
||||
testing::Range(2,(GTESTS_NUM_GPUS+1)),
|
||||
// In-place or not
|
||||
testing::Values(false, true),
|
||||
testing::Values("")),
|
||||
CorrectnessTest::PrintToStringParamName());
|
||||
} // namespace
|
||||
@@ -1,34 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#ifndef TEST_ALLGATHER_HPP
|
||||
#define TEST_ALLGATHER_HPP
|
||||
|
||||
#include "CorrectnessTest.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
class AllGatherCorrectnessTest : public CorrectnessTest
|
||||
{
|
||||
public:
|
||||
static void ComputeExpectedResults(Dataset& dataset)
|
||||
{
|
||||
size_t const byteCount = dataset.NumBytes() / dataset.numDevices;
|
||||
|
||||
int8_t* result = (int8_t *)malloc(dataset.NumBytes());
|
||||
|
||||
for (int i = 0; i < dataset.numDevices; i++)
|
||||
HIP_CALL(hipMemcpy(result + i * byteCount, (int8_t *)dataset.inputs[i] + (i * byteCount),
|
||||
byteCount, hipMemcpyDeviceToHost));
|
||||
|
||||
for (int i = 0; i < dataset.numDevices; i++)
|
||||
memcpy(dataset.expected[i], result, dataset.NumBytes());
|
||||
|
||||
free(result);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,60 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "test_AllGatherMultiProcess.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
TEST_P(AllGatherMultiProcessCorrectnessTest, Correctness)
|
||||
{
|
||||
dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollAllGather);
|
||||
std::vector<int> pids(numDevices);
|
||||
|
||||
int gpu = -1;
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
gpu++;
|
||||
int pid = fork();
|
||||
if (pid == 0)
|
||||
{
|
||||
bool pass;
|
||||
TestAllGather(gpu, *dataset, pass);
|
||||
TerminateChildProcess(pass);
|
||||
}
|
||||
else
|
||||
{
|
||||
pids[gpu] = pid;
|
||||
}
|
||||
}
|
||||
|
||||
ValidateProcesses(pids);
|
||||
dataset->ReleaseRootProcess();
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(AllGatherMultiProcessCorrectnessSweep,
|
||||
AllGatherMultiProcessCorrectnessTest,
|
||||
testing::Combine(
|
||||
// Reduction operator (not used)
|
||||
testing::Values(ncclSum),
|
||||
// Data types
|
||||
testing::Values(ncclInt8,
|
||||
ncclUint8,
|
||||
ncclInt32,
|
||||
ncclUint32,
|
||||
ncclInt64,
|
||||
ncclUint64,
|
||||
//ncclFloat16,
|
||||
ncclFloat32,
|
||||
ncclFloat64,
|
||||
ncclBfloat16),
|
||||
// Number of elements
|
||||
testing::Values(3072, 3145728),
|
||||
// Number of devices
|
||||
testing::Values(2,3,4,8),
|
||||
// In-place or not
|
||||
testing::Values(false, true),
|
||||
testing::Values("")),
|
||||
CorrectnessTest::PrintToStringParamName());
|
||||
} // namespace
|
||||
@@ -1,81 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#ifndef TEST_ALLGATHER_MULTI_PROCESS_HPP
|
||||
#define TEST_ALLGATHER_MULTI_PROCESS_HPP
|
||||
|
||||
#include "CorrectnessTest.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
class AllGatherMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
|
||||
{
|
||||
public:
|
||||
static void ComputeExpectedResults(Dataset& dataset, Barrier& barrier, int const numDevices, std::vector<int> const& ranks)
|
||||
{
|
||||
size_t const byteCount = dataset.NumBytes() / dataset.numDevices;
|
||||
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
int rank = ranks[i];
|
||||
HIP_CALL(hipMemcpy(static_cast<char*>(dataset.expected[0]) + rank * byteCount, (int8_t *)dataset.inputs[rank] + (rank * byteCount),
|
||||
byteCount, hipMemcpyDeviceToHost));
|
||||
}
|
||||
barrier.Wait();
|
||||
|
||||
// Rank 0 sends answer to other ranks
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
int rank = ranks[i];
|
||||
if (rank == 0)
|
||||
{
|
||||
for (int i = 0; i < dataset.numDevices; i++)
|
||||
{
|
||||
if (i == rank) continue;
|
||||
memcpy(dataset.expected[i], dataset.expected[0], dataset.NumBytes());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void TestAllGather(int rank, Dataset& dataset, bool& pass)
|
||||
{
|
||||
// Prepare input / output / expected results
|
||||
SetUpPerProcess(rank, ncclCollAllGather, comms[rank], streams[rank], dataset);
|
||||
|
||||
if (numDevices > numDevicesAvailable || numElements % numDevices != 0)
|
||||
{
|
||||
pass = true;
|
||||
return;
|
||||
}
|
||||
|
||||
Barrier barrier(rank, numDevices, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID"))));
|
||||
|
||||
// Prepare input / output / expected results
|
||||
FillDatasetWithPattern(dataset, rank);
|
||||
|
||||
ComputeExpectedResults(dataset, barrier, numDevices, std::vector<int>(1, rank));
|
||||
|
||||
size_t const byteCount = dataset.NumBytes() / numDevices;
|
||||
size_t const sendCount = dataset.numElements / numDevices;
|
||||
|
||||
// Launch the reduction (1 process per GPU)
|
||||
ncclAllGather((int8_t *)dataset.inputs[rank] + (rank * byteCount),
|
||||
dataset.outputs[rank], sendCount,
|
||||
dataType, comms[rank], streams[rank]);
|
||||
|
||||
// Wait for reduction to complete
|
||||
HIP_CALL(hipStreamSynchronize(streams[rank]));
|
||||
|
||||
// Check results
|
||||
pass = ValidateResults(dataset, rank);
|
||||
|
||||
TearDownPerProcess(comms[rank], streams[rank]);
|
||||
dataset.Release(rank);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,80 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "test_AllReduce.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
TEST_P(AllReduceCorrectnessTest, Correctness)
|
||||
{
|
||||
if (numDevices > numDevicesAvailable) return;
|
||||
|
||||
// Prepare input / output / expected results
|
||||
Dataset dataset;
|
||||
dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllReduce);
|
||||
FillDatasetWithPattern(dataset);
|
||||
ComputeExpectedResults(dataset, op);
|
||||
|
||||
// Launch the reduction (1 thread per GPU)
|
||||
ncclGroupStart();
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
ncclAllReduce(dataset.inputs[i], dataset.outputs[i],
|
||||
numElements, dataType, op, comms[i], streams[i]);
|
||||
}
|
||||
ncclGroupEnd();
|
||||
|
||||
// Wait for reduction to complete
|
||||
Synchronize();
|
||||
|
||||
// Check results
|
||||
ValidateResults(dataset);
|
||||
|
||||
dataset.Release();
|
||||
}
|
||||
#if defined(BUILD_ALLREDUCE_ONLY)
|
||||
INSTANTIATE_TEST_SUITE_P(AllReduceCorrectnessSweep,
|
||||
AllReduceCorrectnessTest,
|
||||
testing::Combine(
|
||||
// Reduction operator
|
||||
testing::Values(ncclSum),
|
||||
// Data types
|
||||
testing::Values(ncclFloat32),
|
||||
// Number of elements
|
||||
testing::Values(1024, 1048576),
|
||||
// Number of devices
|
||||
testing::Range(2,(GTESTS_NUM_GPUS+1)),
|
||||
// In-place or not
|
||||
testing::Values(false, true),
|
||||
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
|
||||
CorrectnessTest::PrintToStringParamName());
|
||||
#else
|
||||
INSTANTIATE_TEST_SUITE_P(AllReduceCorrectnessSweep,
|
||||
AllReduceCorrectnessTest,
|
||||
testing::Combine(
|
||||
// Reduction operator
|
||||
testing::Values(ncclSum, ncclProd, ncclMax, ncclMin, ncclAvg),
|
||||
// Data types
|
||||
testing::Values(ncclInt8,
|
||||
ncclUint8,
|
||||
ncclInt32,
|
||||
ncclUint32,
|
||||
ncclInt64,
|
||||
ncclUint64,
|
||||
//ncclFloat16,
|
||||
ncclFloat32,
|
||||
ncclFloat64,
|
||||
ncclBfloat16),
|
||||
// Number of elements
|
||||
testing::Values(1024, 1048576),
|
||||
// Number of devices
|
||||
testing::Range(2,(GTESTS_NUM_GPUS+1)),
|
||||
// In-place or not
|
||||
testing::Values(false, true),
|
||||
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
|
||||
CorrectnessTest::PrintToStringParamName());
|
||||
#endif
|
||||
} // namespace
|
||||
@@ -1,83 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#ifndef TEST_ALLREDUCE_HPP
|
||||
#define TEST_ALLREDUCE_HPP
|
||||
|
||||
#include "CorrectnessTest.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
class AllReduceCorrectnessTest : public CorrectnessTest
|
||||
{
|
||||
public:
|
||||
static void ComputeExpectedResults(Dataset& dataset, ncclRedOp_t const op)
|
||||
{
|
||||
// Copy all inputs to expected arrays temporarily to perform reduction on host
|
||||
for (int i = 0; i < dataset.numDevices; i++)
|
||||
HIP_CALL(hipMemcpy(dataset.expected[i], dataset.inputs[i],
|
||||
dataset.NumBytes(), hipMemcpyDeviceToHost));
|
||||
|
||||
// Allocate temporary host array to accumulate results
|
||||
int8_t* resultI1 = (int8_t *)malloc(dataset.NumBytes());
|
||||
uint8_t* resultU1 = (uint8_t *)resultI1;
|
||||
int32_t* resultI4 = (int32_t *)resultI1;
|
||||
uint32_t* resultU4 = (uint32_t *)resultI1;
|
||||
int64_t* resultI8 = (int64_t *)resultI1;
|
||||
uint64_t* resultU8 = (uint64_t *)resultI1;
|
||||
float* resultF4 = (float *)resultI1;
|
||||
double* resultF8 = (double *)resultI1;
|
||||
rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1;
|
||||
|
||||
// Initialize the result with the first device's array
|
||||
memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
|
||||
ncclRedOp_t red_op = ((op == ncclAvg) ? ncclSum : op);
|
||||
|
||||
// Perform reduction on the other device arrays
|
||||
for (int i = 1; i < dataset.numDevices; i++)
|
||||
{
|
||||
int8_t* arrayI1 = (int8_t *)dataset.expected[i];
|
||||
uint8_t* arrayU1 = (uint8_t *)arrayI1;
|
||||
int32_t* arrayI4 = (int32_t *)arrayI1;
|
||||
uint32_t* arrayU4 = (uint32_t *)arrayI1;
|
||||
int64_t* arrayI8 = (int64_t *)arrayI1;
|
||||
uint64_t* arrayU8 = (uint64_t *)arrayI1;
|
||||
float* arrayF4 = (float *)arrayI1;
|
||||
double* arrayF8 = (double *)arrayI1;
|
||||
rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1;
|
||||
|
||||
for (int j = 0; j < dataset.numElements; j++)
|
||||
{
|
||||
switch (dataset.dataType)
|
||||
{
|
||||
case ncclInt8: resultI1[j] = ReduceOp(red_op, resultI1[j], arrayI1[j]); break;
|
||||
case ncclUint8: resultU1[j] = ReduceOp(red_op, resultU1[j], arrayU1[j]); break;
|
||||
case ncclInt32: resultI4[j] = ReduceOp(red_op, resultI4[j], arrayI4[j]); break;
|
||||
case ncclUint32: resultU4[j] = ReduceOp(red_op, resultU4[j], arrayU4[j]); break;
|
||||
case ncclInt64: resultI8[j] = ReduceOp(red_op, resultI8[j], arrayI8[j]); break;
|
||||
case ncclUint64: resultU8[j] = ReduceOp(red_op, resultU8[j], arrayU8[j]); break;
|
||||
case ncclFloat32: resultF4[j] = ReduceOp(red_op, resultF4[j], arrayF4[j]); break;
|
||||
case ncclFloat64: resultF8[j] = ReduceOp(red_op, resultF8[j], arrayF8[j]); break;
|
||||
case ncclBfloat16: resultB2[j] = ReduceOp(red_op, resultB2[j], arrayB2[j]); break;
|
||||
default:
|
||||
fprintf(stderr, "[ERROR] Unsupported datatype\n");
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (op == ncclAvg)
|
||||
Average(dataset, resultI1);
|
||||
|
||||
// Copy results into expected arrays
|
||||
for (int i = 0; i < dataset.numDevices; i++)
|
||||
memcpy(dataset.expected[i], resultI1, dataset.NumBytes());
|
||||
|
||||
free(resultI1);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,138 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "test_AllReduceAbort.hpp"
|
||||
#include "../include/comm.h"
|
||||
|
||||
#define NUM_ITER 8
|
||||
#define FAKE_OP_COUNT NUM_ITER+1
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
#define HIPCHECK(cmd) \
|
||||
do { \
|
||||
hipError_t error = (cmd); \
|
||||
if (error != hipSuccess) { \
|
||||
std::cerr << "Encountered HIP error (" << error << ") at line " \
|
||||
<< __LINE__ << " in file " << __FILE__ << "\n"; \
|
||||
exit(-1); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
|
||||
#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
|
||||
|
||||
TEST_P(AllReduceAbortTest, Correctness) {
|
||||
if (numDevices > numDevicesAvailable) return;
|
||||
|
||||
// Prepare input / output / expected results
|
||||
Dataset dataset;
|
||||
dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllReduce);
|
||||
FillDatasetWithPattern(dataset);
|
||||
|
||||
int gpu = 0; // GPU number to trigger abort
|
||||
ncclComm_t comm = comms[gpu];
|
||||
|
||||
HIPCHECK(hipSetDevice(gpu));
|
||||
hipStream_t stream;
|
||||
HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
|
||||
struct ncclChannel* channel = comm->channels;
|
||||
uint64_t **p_dev_head = (uint64_t **)((uint8_t*)(channel->devPeers + channel->ring.next) + offsetof(struct ncclPeer, send[0].conn.head));
|
||||
uint64_t *real_head, *fake_head, *fake_h;
|
||||
|
||||
// get original head
|
||||
HIPCHECK(hipMemcpy(&real_head, p_dev_head, sizeof(uint64_t*), hipMemcpyDefault));
|
||||
// allocate and install fakes
|
||||
HIPCHECK(hipHostMalloc(&fake_head, sizeof(uint64_t*), hipHostMallocMapped));
|
||||
HIPCHECK(hipMemcpy(p_dev_head, &fake_head, sizeof(uint64_t*), hipMemcpyDefault));
|
||||
*fake_head = 0;
|
||||
// read back fakes to confirm
|
||||
HIPCHECK(hipMemcpy(&fake_h, p_dev_head, sizeof(uint64_t*), hipMemcpyDefault));
|
||||
//std::cerr << "[ ] replaced gpu " << gpu << " real_opCount = " << real_opCount << " to fake_opCount = " << fake_o << std::endl;
|
||||
//std::cerr << "[ ] replaced gpu " << gpu << " real_head = " << real_head << " to fake_head = " << fake_h << std::endl;
|
||||
|
||||
// Perform a number of iterations and introduce abort
|
||||
for (int j = 0; j < NUM_ITER; j++) {
|
||||
//std::cerr << "[ ] iter = " << j << std::endl;
|
||||
// Start a group call
|
||||
ncclGroupStart();
|
||||
for (int i = 0; i < numDevices; i++) {
|
||||
ncclAllReduce(dataset.inputs[i], dataset.outputs[i],
|
||||
numElements, dataType, op, comms[i], streams[i]);
|
||||
}
|
||||
// Signal end of group call
|
||||
ncclGroupEnd();
|
||||
}
|
||||
|
||||
// Wait for reduction to complete
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
hipError_t hipErr;
|
||||
int remaining = numDevices;
|
||||
int* done = (int*)malloc(sizeof(int)*numDevices);
|
||||
memset(done, 0, sizeof(int)*numDevices);
|
||||
bool timeout = false, abort_called = false;
|
||||
while (remaining) {
|
||||
int idle = 1;
|
||||
for (int i=0; i<numDevices; i++) {
|
||||
if (done[i]) continue;
|
||||
|
||||
hipErr = hipStreamQuery(streams[i]);
|
||||
if (hipErr == hipSuccess) {
|
||||
done[i] = 1;
|
||||
remaining--;
|
||||
idle = 0;
|
||||
continue;
|
||||
}
|
||||
|
||||
#if NCCL_MAJOR >= 2
|
||||
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0)
|
||||
auto delta = std::chrono::high_resolution_clock::now() - start;
|
||||
double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();
|
||||
if (deltaSec > 10.0 && !timeout) {
|
||||
std::cerr << "[ ] timeout condition, calling ncclCommAbort ... " << std::endl;
|
||||
timeout = true;
|
||||
}
|
||||
ncclResult_t ncclAsyncErr;
|
||||
ncclCommGetAsyncError(comms[i], &ncclAsyncErr);
|
||||
if ((ncclAsyncErr != ncclSuccess || timeout) && !abort_called) {
|
||||
// An asynchronous error happened. Stop the operation and destroy
|
||||
// the communicator
|
||||
std::cerr << "[ ] ncclAsyncErr = " << ncclAsyncErr << std::endl;
|
||||
for (int i=0; i<numDevices; i++)
|
||||
ncclCommAbort(comms[i]);
|
||||
// Abort the perf test
|
||||
abort_called = true;
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
// We might want to let other threads (including NCCL threads) use the CPU.
|
||||
if (idle) pthread_yield();
|
||||
}
|
||||
|
||||
free(done);
|
||||
HIPCHECK(hipHostFree(fake_head));
|
||||
HIPCHECK(hipStreamDestroy(stream));
|
||||
dataset.Release();
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(AllReduceAbortSweep,
|
||||
AllReduceAbortTest,
|
||||
testing::Combine(
|
||||
// Reduction operator
|
||||
testing::Values(ncclSum),
|
||||
// Data types
|
||||
testing::Values(ncclFloat32),
|
||||
// Number of elements
|
||||
testing::Values(1024, 1048576),
|
||||
// Number of devices
|
||||
testing::Values(2, 4),
|
||||
// In-place or not
|
||||
testing::Values(false),
|
||||
testing::Values("")),
|
||||
CorrectnessTest::PrintToStringParamName());
|
||||
} // namespace
|
||||
@@ -1,20 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#ifndef TEST_ALLREDUCE_HPP
|
||||
#define TEST_ALLREDUCE_HPP
|
||||
|
||||
#include "CorrectnessTest.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
class AllReduceAbortTest : public CorrectnessTest
|
||||
{
|
||||
protected:
|
||||
public:
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,83 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "test_AllReduceGroup.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
// This tests aggregated AllReduce calls within a group
|
||||
TEST_P(AllReduceGroupCorrectnessTest, Correctness)
|
||||
{
|
||||
if (numDevices > numDevicesAvailable) return;
|
||||
|
||||
// Prepare input / output / expected results
|
||||
Dataset dataset1, dataset2, dataset3;
|
||||
dataset1.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllReduce);
|
||||
dataset2.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllReduce);
|
||||
dataset3.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllReduce);
|
||||
FillDatasetWithPattern(dataset1);
|
||||
FillDatasetWithPattern(dataset2);
|
||||
FillDatasetWithPattern(dataset3);
|
||||
ComputeExpectedResults(dataset1, op);
|
||||
ComputeExpectedResults(dataset2, op);
|
||||
ComputeExpectedResults(dataset3, op);
|
||||
|
||||
// Launch the reduction (1 thread per GPU)
|
||||
ncclGroupStart();
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
ncclAllReduce(dataset1.inputs[i], dataset1.outputs[i], numElements, dataType, op, comms[i], streams[i]);
|
||||
ncclAllReduce(dataset2.inputs[i], dataset2.outputs[i], numElements, dataType, op, comms[i], streams[i]);
|
||||
ncclAllReduce(dataset3.inputs[i], dataset3.outputs[i], numElements, dataType, op, comms[i], streams[i]);
|
||||
}
|
||||
ncclGroupEnd();
|
||||
|
||||
// Wait for reduction to complete
|
||||
Synchronize();
|
||||
|
||||
// Check results
|
||||
ValidateResults(dataset1);
|
||||
ValidateResults(dataset2);
|
||||
ValidateResults(dataset3);
|
||||
|
||||
dataset1.Release();
|
||||
dataset2.Release();
|
||||
dataset3.Release();
|
||||
}
|
||||
#if defined(BUILD_ALLREDUCE_ONLY)
|
||||
INSTANTIATE_TEST_SUITE_P(AllReduceGroupCorrectnessSweep,
|
||||
AllReduceGroupCorrectnessTest,
|
||||
testing::Combine(
|
||||
// Reduction operator
|
||||
testing::Values(ncclSum),
|
||||
// Data types
|
||||
testing::Values(ncclFloat32),
|
||||
// Number of elements
|
||||
testing::Values(1024, 1048576),
|
||||
// Number of devices
|
||||
testing::Range(2,(GTESTS_NUM_GPUS+1)),
|
||||
// In-place or not
|
||||
testing::Values(false, true),
|
||||
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
|
||||
CorrectnessTest::PrintToStringParamName());
|
||||
#else
|
||||
INSTANTIATE_TEST_SUITE_P(AllReduceGroupCorrectnessSweep,
|
||||
AllReduceGroupCorrectnessTest,
|
||||
testing::Combine(
|
||||
// Reduction operator
|
||||
testing::Values(ncclSum),
|
||||
// Data types
|
||||
testing::Values(ncclFloat32, ncclFloat64),
|
||||
// Number of elements
|
||||
testing::Values(1024, 1048576),
|
||||
// Number of devices
|
||||
testing::Range(2,(GTESTS_NUM_GPUS+1)),
|
||||
// In-place or not
|
||||
testing::Values(false, true),
|
||||
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
|
||||
CorrectnessTest::PrintToStringParamName());
|
||||
#endif
|
||||
} // namespace
|
||||
@@ -1,79 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#ifndef TEST_ALLREDUCEGROUP_HPP
|
||||
#define TEST_ALLREDUCEGROUP_HPP
|
||||
|
||||
#include "CorrectnessTest.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
class AllReduceGroupCorrectnessTest : public CorrectnessTest
|
||||
{
|
||||
public:
|
||||
static void ComputeExpectedResults(Dataset& dataset, ncclRedOp_t const op)
|
||||
{
|
||||
// Copy all inputs to expected arrays temporarily to perform reduction on host
|
||||
for (int i = 0; i < dataset.numDevices; i++)
|
||||
HIP_CALL(hipMemcpy(dataset.expected[i], dataset.inputs[i],
|
||||
dataset.NumBytes(), hipMemcpyDeviceToHost));
|
||||
|
||||
// Allocate temporary host array to accumulate results
|
||||
int8_t* resultI1 = (int8_t *)malloc(dataset.NumBytes());
|
||||
uint8_t* resultU1 = (uint8_t *)resultI1;
|
||||
int32_t* resultI4 = (int32_t *)resultI1;
|
||||
uint32_t* resultU4 = (uint32_t *)resultI1;
|
||||
int64_t* resultI8 = (int64_t *)resultI1;
|
||||
uint64_t* resultU8 = (uint64_t *)resultI1;
|
||||
float* resultF4 = (float *)resultI1;
|
||||
double* resultF8 = (double *)resultI1;
|
||||
rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1;
|
||||
|
||||
// Initialize the result with the first device's array
|
||||
memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
|
||||
|
||||
// Perform reduction on the other device arrays
|
||||
for (int i = 1; i < dataset.numDevices; i++)
|
||||
{
|
||||
int8_t* arrayI1 = (int8_t *)dataset.expected[i];
|
||||
uint8_t* arrayU1 = (uint8_t *)arrayI1;
|
||||
int32_t* arrayI4 = (int32_t *)arrayI1;
|
||||
uint32_t* arrayU4 = (uint32_t *)arrayI1;
|
||||
int64_t* arrayI8 = (int64_t *)arrayI1;
|
||||
uint64_t* arrayU8 = (uint64_t *)arrayI1;
|
||||
float* arrayF4 = (float *)arrayI1;
|
||||
double* arrayF8 = (double *)arrayI1;
|
||||
rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1;
|
||||
|
||||
for (int j = 0; j < dataset.numElements; j++)
|
||||
{
|
||||
switch (dataset.dataType)
|
||||
{
|
||||
case ncclInt8: resultI1[j] = ReduceOp(op, resultI1[j], arrayI1[j]); break;
|
||||
case ncclUint8: resultU1[j] = ReduceOp(op, resultU1[j], arrayU1[j]); break;
|
||||
case ncclInt32: resultI4[j] = ReduceOp(op, resultI4[j], arrayI4[j]); break;
|
||||
case ncclUint32: resultU4[j] = ReduceOp(op, resultU4[j], arrayU4[j]); break;
|
||||
case ncclInt64: resultI8[j] = ReduceOp(op, resultI8[j], arrayI8[j]); break;
|
||||
case ncclUint64: resultU8[j] = ReduceOp(op, resultU8[j], arrayU8[j]); break;
|
||||
case ncclFloat32: resultF4[j] = ReduceOp(op, resultF4[j], arrayF4[j]); break;
|
||||
case ncclFloat64: resultF8[j] = ReduceOp(op, resultF8[j], arrayF8[j]); break;
|
||||
case ncclBfloat16: resultB2[j] = ReduceOp(op, resultB2[j], arrayB2[j]); break;
|
||||
default:
|
||||
fprintf(stderr, "[ERROR] Unsupported datatype\n");
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Copy results into expected arrays
|
||||
for (int i = 0; i < dataset.numDevices; i++)
|
||||
memcpy(dataset.expected[i], resultI1, dataset.NumBytes());
|
||||
|
||||
free(resultI1);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,82 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "test_AllReduceGroupMultiProcess.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
TEST_P(AllReduceGroupMultiProcessCorrectnessTest, Correctness)
|
||||
{
|
||||
// Important: Make sure the order of ncclFunc_t's here match the order of ncclFunc_ts
|
||||
// as they appear in TestGroupCalls()
|
||||
std::vector<ncclFunc_t> ncclFuncs;
|
||||
ncclFuncs.push_back(ncclCollAllReduce);
|
||||
ncclFuncs.push_back(ncclCollAllReduce);
|
||||
ncclFuncs.push_back(ncclCollAllReduce);
|
||||
|
||||
// Create multiple datasets for combined operation
|
||||
std::vector<Dataset*> datasets(ncclFuncs.size());
|
||||
for (int i = 0; i < datasets.size(); i++)
|
||||
{
|
||||
datasets[i] = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
|
||||
datasets[i]->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclFuncs[i]);
|
||||
}
|
||||
|
||||
int const numGpusPerProcess = 2;
|
||||
int const numProcesses = numDevices / numGpusPerProcess;
|
||||
std::vector<int> pids(numProcesses);
|
||||
int process = -1;
|
||||
|
||||
for (int i = 0; i < numDevices; i+= numGpusPerProcess)
|
||||
{
|
||||
process++;
|
||||
int pid = fork();
|
||||
if (pid == 0)
|
||||
{
|
||||
int gpuIdx = i;
|
||||
int maxIdx = gpuIdx + (numGpusPerProcess - 1) >= numDevices ? numDevices : gpuIdx + numGpusPerProcess;
|
||||
|
||||
std::vector<int> ranks;
|
||||
for (; gpuIdx < maxIdx; gpuIdx++)
|
||||
{
|
||||
ranks.push_back(gpuIdx);
|
||||
}
|
||||
|
||||
bool pass;
|
||||
TestGroupCalls(process, ranks, datasets, ncclFuncs, pass);
|
||||
TerminateChildProcess(pass);
|
||||
}
|
||||
else
|
||||
{
|
||||
pids[process] = pid;
|
||||
}
|
||||
}
|
||||
|
||||
ValidateProcesses(pids);
|
||||
|
||||
for (int i = 0; i < datasets.size(); i++)
|
||||
{
|
||||
datasets[i]->ReleaseRootProcess();
|
||||
munmap(datasets[i], sizeof(Dataset));
|
||||
}
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(AllReduceGroupMultiProcessCorrectnessSweep,
|
||||
AllReduceGroupMultiProcessCorrectnessTest,
|
||||
testing::Combine(
|
||||
// Reduction operator (not used)
|
||||
testing::Values(ncclSum),
|
||||
// Data types
|
||||
testing::Values(ncclFloat32,
|
||||
ncclFloat64),
|
||||
// Number of elements
|
||||
testing::Values(3072, 3145728),
|
||||
// Number of devices
|
||||
testing::Values(4,8),
|
||||
// In-place or not
|
||||
testing::Values(false, true),
|
||||
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
|
||||
CorrectnessTest::PrintToStringParamName());
|
||||
} // namespace
|
||||
@@ -1,105 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef TEST_ALLREDUCEGROUP_MULTI_PROCESS_HPP
|
||||
#define TEST_ALLREDUCEGROUP_MULTI_PROCESS_HPP
|
||||
|
||||
#include "CorrectnessTest.hpp"
|
||||
#include "test_AllReduceMultiProcess.hpp"
|
||||
#include <string>
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
class AllReduceGroupMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
|
||||
{
|
||||
public:
|
||||
void TestGroupCalls(int process, std::vector<int> const& ranks, std::vector<Dataset*>& datasets, std::vector<ncclFunc_t> const& funcs, bool& pass)
|
||||
{
|
||||
ncclGroupStart();
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
SetUpPerProcess(ranks[i], funcs, comms[ranks[i]], streams[ranks[i]], datasets);
|
||||
if (numDevices > numDevicesAvailable)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
ncclGroupEnd();
|
||||
|
||||
if (numDevices > numDevicesAvailable)
|
||||
{
|
||||
pass = true;
|
||||
return;
|
||||
}
|
||||
|
||||
int numProcesses = numDevices / ranks.size();
|
||||
Barrier barrier(process, numProcesses, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID"))));
|
||||
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
for (int j = 0; j < datasets.size(); j++)
|
||||
{
|
||||
FillDatasetWithPattern(*datasets[j], ranks[i]);
|
||||
}
|
||||
}
|
||||
|
||||
int const root = 0;
|
||||
|
||||
for (int i = 0; i < 3; i++)
|
||||
{
|
||||
AllReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[i], barrier, op, ranks);
|
||||
}
|
||||
barrier.Wait();
|
||||
|
||||
size_t const byteCount = datasets[0]->NumBytes() / numDevices;
|
||||
size_t const elemCount = numElements / numDevices;
|
||||
|
||||
ncclGroupStart();
|
||||
// AllReduce
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
int rank = ranks[i];
|
||||
for (int j = 0; j < 3; j++)
|
||||
{
|
||||
ncclAllReduce(datasets[j]->inputs[rank], datasets[j]->outputs[rank],
|
||||
numElements, dataType, op, comms[rank], streams[rank]);
|
||||
}
|
||||
}
|
||||
// Signal end of group call
|
||||
ncclGroupEnd();
|
||||
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
HIP_CALL(hipSetDevice(ranks[i]));
|
||||
HIP_CALL(hipStreamSynchronize(streams[ranks[i]]));
|
||||
}
|
||||
|
||||
for (int i = 0; i < funcs.size(); i++)
|
||||
{
|
||||
for (int j = 0; j < ranks.size(); j++)
|
||||
{
|
||||
pass = ValidateResults(*datasets[i], ranks[j], root);
|
||||
if (!pass)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
barrier.Wait();
|
||||
for (int j = 0; j < ranks.size(); j++)
|
||||
{
|
||||
datasets[i]->Release(ranks[j]);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
TearDownPerProcess(comms[ranks[i]], streams[ranks[i]]);
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,61 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "test_AllReduceMultiProcess.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
TEST_P(AllReduceMultiProcessCorrectnessTest, Correctness)
|
||||
{
|
||||
dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollAllReduce);
|
||||
std::vector<int> pids(numDevices);
|
||||
|
||||
int gpu = -1;
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
gpu++;
|
||||
int pid = fork();
|
||||
if (pid == 0)
|
||||
{
|
||||
bool pass;
|
||||
TestAllReduce(gpu, *dataset, pass);
|
||||
TerminateChildProcess(pass);
|
||||
}
|
||||
else
|
||||
{
|
||||
pids[gpu] = pid;
|
||||
}
|
||||
}
|
||||
|
||||
ValidateProcesses(pids);
|
||||
dataset->ReleaseRootProcess();
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(AllReduceMultiProcessCorrectnessSweep,
|
||||
AllReduceMultiProcessCorrectnessTest,
|
||||
testing::Combine(
|
||||
// Reduction operator
|
||||
testing::Values(ncclSum, ncclProd, ncclMax, ncclMin),
|
||||
// Data types
|
||||
testing::Values(ncclInt8,
|
||||
ncclUint8,
|
||||
ncclInt32,
|
||||
ncclUint32,
|
||||
ncclInt64,
|
||||
ncclUint64,
|
||||
//ncclFloat16,
|
||||
ncclFloat32,
|
||||
ncclFloat64,
|
||||
ncclBfloat16),
|
||||
// Number of elements
|
||||
testing::Values(1024, 1048576),
|
||||
// Number of devices
|
||||
testing::Values(2,3,4,8),
|
||||
// In-place or not
|
||||
testing::Values(false, true),
|
||||
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
|
||||
CorrectnessTest::PrintToStringParamName());
|
||||
} // namespace
|
||||
@@ -1,117 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#ifndef TEST_ALLREDUCE_MULTI_PROCESS_HPP
|
||||
#define TEST_ALLREDUCE_MULTI_PROCESS_HPP
|
||||
|
||||
#include "CorrectnessTest.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
class AllReduceMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
|
||||
{
|
||||
public:
|
||||
static void ComputeExpectedResults(Dataset& dataset, Barrier& barrier, ncclRedOp_t const op, std::vector<int> const& ranks)
|
||||
{
|
||||
// Copy all inputs to expected arrays temporarily to perform reduction on host
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
int rank = ranks[i];
|
||||
HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.inputs[rank],
|
||||
dataset.NumBytes(), hipMemcpyDeviceToHost));
|
||||
}
|
||||
barrier.Wait();
|
||||
|
||||
// Allocate temporary host array to accumulate results
|
||||
int8_t* resultI1 = (int8_t *)malloc(dataset.NumBytes());
|
||||
uint8_t* resultU1 = (uint8_t *)resultI1;
|
||||
int32_t* resultI4 = (int32_t *)resultI1;
|
||||
uint32_t* resultU4 = (uint32_t *)resultI1;
|
||||
int64_t* resultI8 = (int64_t *)resultI1;
|
||||
uint64_t* resultU8 = (uint64_t *)resultI1;
|
||||
float* resultF4 = (float *)resultI1;
|
||||
double* resultF8 = (double *)resultI1;
|
||||
rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1;
|
||||
|
||||
// Initialize the result with the first device's array
|
||||
memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
|
||||
barrier.Wait();
|
||||
|
||||
// Perform reduction
|
||||
for (int i = 1; i < dataset.numDevices; i++)
|
||||
{
|
||||
int8_t* arrayI1 = (int8_t *)dataset.expected[i];
|
||||
uint8_t* arrayU1 = (uint8_t *)arrayI1;
|
||||
int32_t* arrayI4 = (int32_t *)arrayI1;
|
||||
uint32_t* arrayU4 = (uint32_t *)arrayI1;
|
||||
int64_t* arrayI8 = (int64_t *)arrayI1;
|
||||
uint64_t* arrayU8 = (uint64_t *)arrayI1;
|
||||
float* arrayF4 = (float *)arrayI1;
|
||||
double* arrayF8 = (double *)arrayI1;
|
||||
rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1;
|
||||
|
||||
for (int j = 0; j < dataset.numElements; j++)
|
||||
{
|
||||
switch (dataset.dataType)
|
||||
{
|
||||
case ncclInt8: resultI1[j] = ReduceOp(op, resultI1[j], arrayI1[j]); break;
|
||||
case ncclUint8: resultU1[j] = ReduceOp(op, resultU1[j], arrayU1[j]); break;
|
||||
case ncclInt32: resultI4[j] = ReduceOp(op, resultI4[j], arrayI4[j]); break;
|
||||
case ncclUint32: resultU4[j] = ReduceOp(op, resultU4[j], arrayU4[j]); break;
|
||||
case ncclInt64: resultI8[j] = ReduceOp(op, resultI8[j], arrayI8[j]); break;
|
||||
case ncclUint64: resultU8[j] = ReduceOp(op, resultU8[j], arrayU8[j]); break;
|
||||
case ncclFloat32: resultF4[j] = ReduceOp(op, resultF4[j], arrayF4[j]); break;
|
||||
case ncclFloat64: resultF8[j] = ReduceOp(op, resultF8[j], arrayF8[j]); break;
|
||||
case ncclBfloat16: resultB2[j] = ReduceOp(op, resultB2[j], arrayB2[j]); break;
|
||||
default:
|
||||
fprintf(stderr, "[ERROR] Unsupported datatype\n");
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
barrier.Wait();
|
||||
|
||||
// Copy results into expected array
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
int rank = ranks[i];
|
||||
memcpy(dataset.expected[rank], resultI1, dataset.NumBytes());
|
||||
}
|
||||
free(resultI1);
|
||||
}
|
||||
|
||||
void TestAllReduce(int rank, Dataset& dataset, bool& pass)
|
||||
{
|
||||
SetUpPerProcess(rank, ncclCollAllReduce, comms[rank], streams[rank], dataset);
|
||||
|
||||
if (numDevices > numDevicesAvailable)
|
||||
{
|
||||
pass = true;
|
||||
return;
|
||||
}
|
||||
|
||||
Barrier barrier(rank, numDevices, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID"))));
|
||||
|
||||
// Prepare input / output / expected results
|
||||
FillDatasetWithPattern(dataset, rank);
|
||||
ComputeExpectedResults(dataset, barrier, op, std::vector<int>(1, rank));
|
||||
|
||||
// Launch the reduction
|
||||
ncclAllReduce(dataset.inputs[rank], dataset.outputs[rank],
|
||||
numElements, dataType, op, comms[rank], streams[rank]);
|
||||
|
||||
// Wait for reduction to complete
|
||||
HIP_CALL(hipStreamSynchronize(streams[rank]));
|
||||
|
||||
// Check results
|
||||
pass = ValidateResults(dataset, rank);
|
||||
|
||||
TearDownPerProcess(comms[rank], streams[rank]);
|
||||
dataset.Release(rank);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,67 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "test_AllToAll.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
TEST_P(AllToAllCorrectnessTest, Correctness)
|
||||
{
|
||||
if (numDevices > numDevicesAvailable) return;
|
||||
|
||||
// Allocate data
|
||||
Dataset dataset;
|
||||
dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllToAll);
|
||||
|
||||
// Prepare input / output / expected results
|
||||
FillDatasetWithPattern(dataset);
|
||||
ComputeExpectedResults(dataset);
|
||||
|
||||
// Launch the reduction (1 thread per GPU)
|
||||
ncclGroupStart();
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
ncclAllToAll(dataset.inputs[i],
|
||||
dataset.outputs[i],
|
||||
numElements, dataType,
|
||||
comms[i], streams[i]);
|
||||
}
|
||||
ncclGroupEnd();
|
||||
|
||||
// Wait for reduction to complete
|
||||
Synchronize();
|
||||
|
||||
// Check results
|
||||
ValidateResults(dataset);
|
||||
|
||||
dataset.Release();
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(AllToAllCorrectnessSweep,
|
||||
AllToAllCorrectnessTest,
|
||||
testing::Combine(
|
||||
// Reduction operator is not used
|
||||
testing::Values(ncclSum),
|
||||
// Data types
|
||||
testing::Values(ncclInt8,
|
||||
ncclUint8,
|
||||
ncclInt32,
|
||||
ncclUint32,
|
||||
ncclInt64,
|
||||
ncclUint64,
|
||||
//ncclFloat16,
|
||||
ncclFloat32,
|
||||
ncclFloat64,
|
||||
ncclBfloat16),
|
||||
// Number of elements
|
||||
testing::Values(1024, 1048576),
|
||||
// Number of devices
|
||||
testing::Range(2,(GTESTS_NUM_GPUS+1)),
|
||||
// In-place or not
|
||||
testing::Values(false),
|
||||
testing::Values("")),
|
||||
CorrectnessTest::PrintToStringParamName());
|
||||
} // namespace
|
||||
@@ -1,26 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#ifndef TEST_ALLTOALL_HPP
|
||||
#define TEST_ALLTOALL_HPP
|
||||
|
||||
#include "CorrectnessTest.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
class AllToAllCorrectnessTest : public CorrectnessTest
|
||||
{
|
||||
public:
|
||||
static void ComputeExpectedResults(Dataset& dataset)
|
||||
{
|
||||
for (int i = 0; i < dataset.numDevices; i++)
|
||||
for (int j = 0; j < dataset.numDevices; j++)
|
||||
HIP_CALL(hipMemcpy((int8_t *)dataset.expected[i]+dataset.NumBytes()*j, (int8_t *)dataset.inputs[j]+dataset.NumBytes()*i,
|
||||
dataset.NumBytes(), hipMemcpyDeviceToHost));
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,61 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "test_AllToAllMultiProcess.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
TEST_P(AllToAllMultiProcessCorrectnessTest, Correctness)
|
||||
{
|
||||
dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollAllToAll);
|
||||
std::vector<int> pids(numDevices);
|
||||
|
||||
int gpu = -1;
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
gpu++;
|
||||
int pid = fork();
|
||||
if (pid == 0)
|
||||
{
|
||||
bool pass;
|
||||
TestAllToAll(gpu, *dataset, pass);
|
||||
TerminateChildProcess(pass);
|
||||
}
|
||||
else
|
||||
{
|
||||
pids[gpu] = pid;
|
||||
}
|
||||
}
|
||||
|
||||
ValidateProcesses(pids);
|
||||
dataset->ReleaseRootProcess();
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(AllToAllMultiProcessCorrectnessSweep,
|
||||
AllToAllMultiProcessCorrectnessTest,
|
||||
testing::Combine(
|
||||
// Reduction operator is not used
|
||||
testing::Values(ncclSum),
|
||||
// Data types
|
||||
testing::Values(ncclInt8,
|
||||
ncclUint8,
|
||||
ncclInt32,
|
||||
ncclUint32,
|
||||
ncclInt64,
|
||||
ncclUint64,
|
||||
//ncclFloat16,
|
||||
ncclFloat32,
|
||||
ncclFloat64,
|
||||
ncclBfloat16),
|
||||
// Number of elements
|
||||
testing::Values(1024, 1048576),
|
||||
// Number of devices
|
||||
testing::Values(2,3,4,8),
|
||||
// In-place or not
|
||||
testing::Values(false),
|
||||
testing::Values("")),
|
||||
CorrectnessTest::PrintToStringParamName());
|
||||
} // namespace
|
||||
@@ -1,61 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#ifndef TEST_ALLTOALL_MULTI_PROCESS_HPP
|
||||
#define TEST_ALLTOALL_MULTI_PROCESS_HPP
|
||||
|
||||
#include "CorrectnessTest.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
class AllToAllMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
|
||||
{
|
||||
public:
|
||||
static void ComputeExpectedResults(Dataset& dataset, std::vector<int> const& ranks)
|
||||
{
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
int rank = ranks[i];
|
||||
for (int j = 0; j < dataset.numDevices; j++)
|
||||
{
|
||||
HIP_CALL(hipMemcpy((int8_t *)dataset.expected[j]+dataset.NumBytes()*rank, (int8_t *)dataset.inputs[rank]+dataset.NumBytes()*j,
|
||||
dataset.NumBytes(), hipMemcpyDeviceToHost));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void TestAllToAll(int rank, Dataset& dataset, bool& pass)
|
||||
{
|
||||
SetUpPerProcess(rank, ncclCollAllToAll, comms[rank], streams[rank], dataset);
|
||||
|
||||
if (numDevices > numDevicesAvailable)
|
||||
{
|
||||
pass = true;
|
||||
return;
|
||||
}
|
||||
|
||||
// Prepare input / output / expected results
|
||||
FillDatasetWithPattern(dataset, rank);
|
||||
ComputeExpectedResults(dataset, std::vector<int>(1, rank));
|
||||
|
||||
// Launch the reduction
|
||||
ncclAllToAll(dataset.inputs[rank],
|
||||
dataset.outputs[rank],
|
||||
numElements, dataType,
|
||||
comms[rank], streams[rank]);
|
||||
|
||||
// Wait for reduction to complete
|
||||
HIP_CALL(hipStreamSynchronize(streams[rank]));
|
||||
|
||||
// Check results
|
||||
pass = ValidateResults(dataset, rank);
|
||||
|
||||
TearDownPerProcess(comms[rank], streams[rank]);
|
||||
dataset.Release(rank);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,75 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "test_AllToAllv.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
TEST_P(AllToAllvCorrectnessTest, Correctness)
|
||||
{
|
||||
if (numDevices > numDevicesAvailable) return;
|
||||
|
||||
// Allocate data
|
||||
Dataset dataset;
|
||||
dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllToAll);
|
||||
|
||||
// Prepare input / output / expected results
|
||||
FillDatasetWithPattern(dataset);
|
||||
ComputeExpectedResults(dataset);
|
||||
|
||||
size_t chunksize = numElements*2/numDevices;
|
||||
#define MAX_ALLTOALLV_RANKS 16
|
||||
static size_t sendcounts[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS], recvcounts[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS], sdispls[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS], rdispls[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS];
|
||||
// Launch the reduction (1 thread per GPU)
|
||||
ncclGroupStart();
|
||||
for (int r = 0; r < numDevices; r++) {
|
||||
size_t disp = 0;
|
||||
for (int i = 0; i < numDevices; i++) {
|
||||
size_t scount = ((i+r)%numDevices)*chunksize;
|
||||
if (i+r == numDevices-1)
|
||||
scount += (numElements*numDevices-chunksize*(numDevices-1)*numDevices/2);
|
||||
sendcounts[i+r*MAX_ALLTOALLV_RANKS] = recvcounts[i+r*MAX_ALLTOALLV_RANKS] = scount;
|
||||
sdispls[i+r*MAX_ALLTOALLV_RANKS] = rdispls[i+r*MAX_ALLTOALLV_RANKS] = disp;
|
||||
disp += scount;
|
||||
}
|
||||
ncclAllToAllv((char*)dataset.inputs[r], sendcounts+r*MAX_ALLTOALLV_RANKS, sdispls+r*MAX_ALLTOALLV_RANKS,
|
||||
(char*)dataset.outputs[r], recvcounts+r*MAX_ALLTOALLV_RANKS, rdispls+r*MAX_ALLTOALLV_RANKS, dataType, comms[r], streams[r]);
|
||||
}
|
||||
ncclGroupEnd();
|
||||
// Wait for reduction to complete
|
||||
Synchronize();
|
||||
|
||||
// Check results
|
||||
ValidateResults(dataset);
|
||||
|
||||
dataset.Release();
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(AllToAllvCorrectnessSweep,
|
||||
AllToAllvCorrectnessTest,
|
||||
testing::Combine(
|
||||
// Reduction operator is not used
|
||||
testing::Values(ncclSum),
|
||||
// Data types
|
||||
testing::Values(ncclInt8,
|
||||
ncclUint8,
|
||||
ncclInt32,
|
||||
ncclUint32,
|
||||
ncclInt64,
|
||||
ncclUint64,
|
||||
//ncclFloat16,
|
||||
ncclFloat32,
|
||||
ncclFloat64,
|
||||
ncclBfloat16),
|
||||
// Number of elements
|
||||
testing::Values(2520, 3026520),
|
||||
// Number of devices
|
||||
testing::Range(2,(GTESTS_NUM_GPUS+1)),
|
||||
// In-place or not
|
||||
testing::Values(false),
|
||||
testing::Values("")),
|
||||
CorrectnessTest::PrintToStringParamName());
|
||||
} // namespace
|
||||
@@ -1,44 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#ifndef TEST_ALLTOALLV_HPP
|
||||
#define TEST_ALLTOALLV_HPP
|
||||
|
||||
#include "CorrectnessTest.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
class AllToAllvCorrectnessTest : public CorrectnessTest
|
||||
{
|
||||
public:
|
||||
static void ComputeExpectedResults(Dataset& dataset)
|
||||
{
|
||||
for (int i = 0; i < dataset.numDevices; i++) {
|
||||
size_t rdisp = 0;
|
||||
size_t chunksize = dataset.numElements*2/dataset.numDevices;
|
||||
for (int j = 0; j < dataset.numDevices; j++) {
|
||||
size_t scount = 0, rcount = ((j+i)%dataset.numDevices)*chunksize;
|
||||
if (j+i == dataset.numDevices-1)
|
||||
rcount += (dataset.numElements*dataset.numDevices-chunksize*(dataset.numDevices-1)*dataset.numDevices/2);
|
||||
size_t sdisp = 0;
|
||||
for (int k=0; k<dataset.numDevices; k++) {
|
||||
scount = ((k+j)%dataset.numDevices)*chunksize;
|
||||
if (k+j == dataset.numDevices-1)
|
||||
scount += (dataset.numElements*dataset.numDevices-chunksize*(dataset.numDevices-1)*dataset.numDevices/2);
|
||||
if (k == i)
|
||||
break;
|
||||
sdisp += scount;
|
||||
}
|
||||
HIP_CALL(hipMemcpy((int8_t *)dataset.expected[i]+rdisp*DataTypeToBytes(dataset.dataType),
|
||||
(int8_t *)dataset.inputs[j]+sdisp*DataTypeToBytes(dataset.dataType),
|
||||
rcount*DataTypeToBytes(dataset.dataType), hipMemcpyDeviceToHost));
|
||||
rdisp += rcount;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,71 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "test_Broadcast.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
TEST_P(BroadcastCorrectnessTest, Correctness)
|
||||
{
|
||||
if (numDevices > numDevicesAvailable) return;
|
||||
|
||||
// Allocate data
|
||||
Dataset dataset;
|
||||
dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollBroadcast);
|
||||
|
||||
// Test each possible root
|
||||
for (int root = 0; root < numDevices; root++)
|
||||
{
|
||||
// Prepare input / output / expected results
|
||||
FillDatasetWithPattern(dataset);
|
||||
ComputeExpectedResults(dataset, root);
|
||||
|
||||
// Launch the reduction (1 thread per GPU)
|
||||
ncclGroupStart();
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
ncclBroadcast(dataset.inputs[i],
|
||||
dataset.outputs[i],
|
||||
numElements, dataType,
|
||||
root, comms[i], streams[i]);
|
||||
}
|
||||
ncclGroupEnd();
|
||||
|
||||
// Wait for reduction to complete
|
||||
Synchronize();
|
||||
|
||||
// Check results
|
||||
ValidateResults(dataset);
|
||||
}
|
||||
|
||||
dataset.Release();
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(BroadcastCorrectnessSweep,
|
||||
BroadcastCorrectnessTest,
|
||||
testing::Combine(
|
||||
// Reduction operator is not used
|
||||
testing::Values(ncclSum),
|
||||
// Data types
|
||||
testing::Values(ncclInt8,
|
||||
ncclUint8,
|
||||
ncclInt32,
|
||||
ncclUint32,
|
||||
ncclInt64,
|
||||
ncclUint64,
|
||||
//ncclFloat16,
|
||||
ncclFloat32,
|
||||
ncclFloat64,
|
||||
ncclBfloat16),
|
||||
// Number of elements
|
||||
testing::Values(1024, 1048576),
|
||||
// Number of devices
|
||||
testing::Range(2,(GTESTS_NUM_GPUS+1)),
|
||||
// In-place or not
|
||||
testing::Values(false, true),
|
||||
testing::Values("")),
|
||||
CorrectnessTest::PrintToStringParamName());
|
||||
} // namespace
|
||||
@@ -1,25 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#ifndef TEST_BROADCAST_HPP
|
||||
#define TEST_BROADCAST_HPP
|
||||
|
||||
#include "CorrectnessTest.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
class BroadcastCorrectnessTest : public CorrectnessTest
|
||||
{
|
||||
public:
|
||||
static void ComputeExpectedResults(Dataset& dataset, int const root)
|
||||
{
|
||||
for (int i = 0; i < dataset.numDevices; i++)
|
||||
HIP_CALL(hipMemcpy(dataset.expected[i], dataset.inputs[root],
|
||||
dataset.NumBytes(), hipMemcpyDeviceToHost));
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,140 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "test_BroadcastAbort.hpp"
|
||||
#include "../include/comm.h"
|
||||
|
||||
#define NUM_ITER 8
|
||||
#define FAKE_OP_COUNT NUM_ITER+1
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
#define HIPCHECK(cmd) \
|
||||
do { \
|
||||
hipError_t error = (cmd); \
|
||||
if (error != hipSuccess) { \
|
||||
std::cerr << "Encountered HIP error (" << error << ") at line " \
|
||||
<< __LINE__ << " in file " << __FILE__ << "\n"; \
|
||||
exit(-1); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
|
||||
#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
|
||||
|
||||
TEST_P(BroadcastAbortTest, Correctness) {
|
||||
if (numDevices > numDevicesAvailable) return;
|
||||
|
||||
// Prepare input / output / expected results
|
||||
Dataset dataset;
|
||||
dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollBroadcast);
|
||||
FillDatasetWithPattern(dataset);
|
||||
|
||||
int root = 0;
|
||||
int gpu = 0; // GPU number to trigger abort
|
||||
ncclComm_t comm = comms[gpu];
|
||||
|
||||
HIPCHECK(hipSetDevice(gpu));
|
||||
hipStream_t stream;
|
||||
HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
|
||||
struct ncclChannel* channel = comm->channels;
|
||||
uint64_t **p_dev_head = (uint64_t **)((uint8_t*)(channel->devPeers + channel->ring.next) + offsetof(struct ncclPeer, send[0].conn.head));
|
||||
uint64_t *real_head, *fake_head, *fake_h;
|
||||
|
||||
// get original head
|
||||
HIPCHECK(hipMemcpy(&real_head, p_dev_head, sizeof(uint64_t*), hipMemcpyDefault));
|
||||
// allocate and install fakes
|
||||
HIPCHECK(hipHostMalloc(&fake_head, sizeof(uint64_t*), hipHostMallocMapped));
|
||||
HIPCHECK(hipMemcpy(p_dev_head, &fake_head, sizeof(uint64_t*), hipMemcpyDefault));
|
||||
*fake_head = 0;
|
||||
// read back fakes to confirm
|
||||
HIPCHECK(hipMemcpy(&fake_h, p_dev_head, sizeof(uint64_t*), hipMemcpyDefault));
|
||||
//std::cerr << "[ ] replaced gpu " << gpu << " real_head = " << real_head << " to fake_head = " << fake_h << std::endl;
|
||||
|
||||
// Perform a number of iterations and introduce abort
|
||||
for (int j = 0; j < NUM_ITER; j++) {
|
||||
//std::cerr << "[ ] iter = " << j << std::endl;
|
||||
// Start a group call
|
||||
ncclGroupStart();
|
||||
for (int i = 0; i < numDevices; i++) {
|
||||
ncclBroadcast(dataset.inputs[i],
|
||||
dataset.outputs[i],
|
||||
numElements, dataType,
|
||||
root, comms[i], streams[i]);
|
||||
}
|
||||
// Signal end of group call
|
||||
ncclGroupEnd();
|
||||
}
|
||||
|
||||
// Wait for reduction to complete
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
hipError_t hipErr;
|
||||
int remaining = numDevices;
|
||||
int* done = (int*)malloc(sizeof(int)*numDevices);
|
||||
memset(done, 0, sizeof(int)*numDevices);
|
||||
bool timeout = false, abort_called = false;
|
||||
while (remaining) {
|
||||
int idle = 1;
|
||||
for (int i=0; i<numDevices; i++) {
|
||||
if (done[i]) continue;
|
||||
|
||||
hipErr = hipStreamQuery(streams[i]);
|
||||
if (hipErr == hipSuccess) {
|
||||
done[i] = 1;
|
||||
remaining--;
|
||||
idle = 0;
|
||||
continue;
|
||||
}
|
||||
|
||||
#if NCCL_MAJOR >= 2
|
||||
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0)
|
||||
auto delta = std::chrono::high_resolution_clock::now() - start;
|
||||
double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();
|
||||
if (deltaSec > 10.0 && !timeout) {
|
||||
std::cerr << "[ ] timeout condition, calling ncclCommAbort ... " << std::endl;
|
||||
timeout = true;
|
||||
}
|
||||
ncclResult_t ncclAsyncErr;
|
||||
ncclCommGetAsyncError(comms[i], &ncclAsyncErr);
|
||||
if ((ncclAsyncErr != ncclSuccess || timeout) && !abort_called) {
|
||||
// An asynchronous error happened. Stop the operation and destroy
|
||||
// the communicator
|
||||
std::cerr << "[ ] ncclAsyncErr = " << ncclAsyncErr << std::endl;
|
||||
for (int i=0; i<numDevices; i++)
|
||||
ncclCommAbort(comms[i]);
|
||||
// Abort the perf test
|
||||
abort_called = true;
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
// We might want to let other threads (including NCCL threads) use the CPU.
|
||||
if (idle) pthread_yield();
|
||||
}
|
||||
|
||||
free(done);
|
||||
HIPCHECK(hipHostFree(fake_head));
|
||||
HIPCHECK(hipStreamDestroy(stream));
|
||||
dataset.Release();
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(BroadcastAbortSweep,
|
||||
BroadcastAbortTest,
|
||||
testing::Combine(
|
||||
// Reduction operator
|
||||
testing::Values(ncclSum),
|
||||
// Data types
|
||||
testing::Values(ncclFloat32),
|
||||
// Number of elements
|
||||
testing::Values(1048576),
|
||||
// Number of devices
|
||||
testing::Values(2, 4),
|
||||
// In-place or not
|
||||
testing::Values(false),
|
||||
testing::Values("")),
|
||||
CorrectnessTest::PrintToStringParamName());
|
||||
} // namespace
|
||||
@@ -1,20 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#ifndef TEST_ALLREDUCE_HPP
|
||||
#define TEST_ALLREDUCE_HPP
|
||||
|
||||
#include "CorrectnessTest.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
class BroadcastAbortTest : public CorrectnessTest
|
||||
{
|
||||
protected:
|
||||
public:
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,68 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "test_BroadcastMultiProcess.hpp"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/wait.h>
|
||||
#include <iostream>
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
TEST_P(BroadcastMultiProcessCorrectnessTest, Correctness)
|
||||
{
|
||||
dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollBroadcast);
|
||||
std::vector<int> pids(numDevices);
|
||||
|
||||
int gpu = -1;
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
gpu++;
|
||||
int pid = fork();
|
||||
if (pid == 0)
|
||||
{
|
||||
bool pass;
|
||||
TestBroadcast(gpu, *dataset, pass);
|
||||
TerminateChildProcess(pass);
|
||||
}
|
||||
else
|
||||
{
|
||||
pids[gpu] = pid;
|
||||
}
|
||||
}
|
||||
|
||||
ValidateProcesses(pids);
|
||||
dataset->ReleaseRootProcess();
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(BroadcastMultiProcessCorrectnessSweep,
|
||||
BroadcastMultiProcessCorrectnessTest,
|
||||
testing::Combine(
|
||||
// Reduction operator is not used
|
||||
testing::Values(ncclSum),
|
||||
// Data types
|
||||
testing::Values(ncclInt8,
|
||||
ncclUint8,
|
||||
ncclInt32,
|
||||
ncclUint32,
|
||||
ncclInt64,
|
||||
ncclUint64,
|
||||
//ncclFloat16,
|
||||
ncclFloat32,
|
||||
ncclFloat64,
|
||||
ncclBfloat16),
|
||||
// Number of elements
|
||||
testing::Values(1024, 1048576),
|
||||
// Number of devices
|
||||
testing::Values(2,3,4,8),
|
||||
// In-place or not
|
||||
testing::Values(false, true),
|
||||
testing::Values("")),
|
||||
CorrectnessTest::PrintToStringParamName());
|
||||
} // namespace
|
||||
@@ -1,77 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#ifndef TEST_BROADCAST_MULTI_PROCESS_HPP
|
||||
#define TEST_BROADCAST_MULTI_PROCESS_HPP
|
||||
|
||||
#include "CorrectnessTest.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
class BroadcastMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
|
||||
{
|
||||
public:
|
||||
static void ComputeExpectedResults(Dataset& dataset, int const root, std::vector<int> const& ranks)
|
||||
{
|
||||
for (int h = 0; h < ranks.size(); h++)
|
||||
{
|
||||
int rank = ranks[h];
|
||||
// Root has the answer; share it via host memcpy's
|
||||
if (rank == root)
|
||||
{
|
||||
HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.inputs[rank],
|
||||
dataset.NumBytes(), hipMemcpyDeviceToHost));
|
||||
for (int i = 0; i < dataset.numDevices; i++)
|
||||
{
|
||||
if (i == rank) continue;
|
||||
memcpy(dataset.expected[i], dataset.expected[root], dataset.NumBytes());
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void TestBroadcast(int rank, Dataset& dataset, bool& pass)
|
||||
{
|
||||
SetUpPerProcess(rank, ncclCollBroadcast, comms[rank], streams[rank], dataset);
|
||||
|
||||
if (numDevices > numDevicesAvailable)
|
||||
{
|
||||
pass = true;
|
||||
return;
|
||||
}
|
||||
|
||||
Barrier barrier(rank, numDevices, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID"))));
|
||||
|
||||
// Test each possible root
|
||||
for (int root = 0; root < numDevices; root++)
|
||||
{
|
||||
// Prepare input / output / expected results
|
||||
FillDatasetWithPattern(dataset, rank);
|
||||
ComputeExpectedResults(dataset, root, std::vector<int>(1, rank));
|
||||
|
||||
// Launch the reduction (1 process per GPU)
|
||||
ncclResult_t res = ncclBroadcast(dataset.inputs[rank],
|
||||
dataset.outputs[rank],
|
||||
numElements, dataType,
|
||||
root, comms[rank], streams[rank]);
|
||||
|
||||
// Wait for reduction to complete
|
||||
HIP_CALL(hipStreamSynchronize(streams[rank]));
|
||||
|
||||
// Check results
|
||||
pass = ValidateResults(dataset, rank);
|
||||
|
||||
// Ensure all processes have finished current iteration before proceeding
|
||||
barrier.Wait();
|
||||
}
|
||||
|
||||
TearDownPerProcess(comms[rank], streams[rank]);
|
||||
dataset.Release(rank);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,129 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "test_CombinedCalls.hpp"
|
||||
|
||||
#include "test_AllGather.hpp"
|
||||
#include "test_AllReduce.hpp"
|
||||
#include "test_Broadcast.hpp"
|
||||
#include "test_Reduce.hpp"
|
||||
#include "test_ReduceScatter.hpp"
|
||||
#include "test_Scatter.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
TEST_P(CombinedCallsCorrectnessTest, Correctness)
|
||||
{
|
||||
if (numDevices > numDevicesAvailable) return;
|
||||
|
||||
// Create multiple datasets for combined operation
|
||||
std::vector<Dataset> datasets(5);
|
||||
std::vector<ncclFunc_t> ncclFuncs(5);
|
||||
ncclFuncs.push_back(ncclCollAllGather);
|
||||
ncclFuncs.push_back(ncclCollAllReduce);
|
||||
ncclFuncs.push_back(ncclCollBroadcast);
|
||||
ncclFuncs.push_back(ncclCollReduce);
|
||||
ncclFuncs.push_back(ncclCollReduceScatter);
|
||||
|
||||
// Adjust numElements to be multiple of numDevices
|
||||
numElements = (numElements/numDevices)*numDevices;
|
||||
for (int i = 0; i < datasets.size(); i++)
|
||||
{
|
||||
datasets[i].Initialize(numDevices, numElements, dataType, inPlace, ncclFuncs[i]);
|
||||
FillDatasetWithPattern(datasets[i]);
|
||||
}
|
||||
|
||||
Dataset scatter_dataset;
|
||||
scatter_dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollScatter);
|
||||
FillDatasetWithPattern(scatter_dataset);
|
||||
|
||||
// Compute expected results for each dataset in combined
|
||||
int const root = 0;
|
||||
AllGatherCorrectnessTest::ComputeExpectedResults(datasets[0]);
|
||||
AllReduceCorrectnessTest::ComputeExpectedResults(datasets[1], op);
|
||||
BroadcastCorrectnessTest::ComputeExpectedResults(datasets[2], root);
|
||||
ReduceCorrectnessTest::ComputeExpectedResults(datasets[3], op, root);
|
||||
ReduceScatterCorrectnessTest::ComputeExpectedResults(datasets[4], op);
|
||||
ScatterCorrectnessTest::ComputeExpectedResults(scatter_dataset, root);
|
||||
|
||||
size_t const byteCount = datasets[0].NumBytes() / numDevices;
|
||||
size_t const elemCount = numElements / numDevices;
|
||||
|
||||
for (int j = 0; j < 10; j++)
|
||||
{
|
||||
ncclGroupStart();
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
ncclScatter(scatter_dataset.inputs[i],
|
||||
scatter_dataset.outputs[i],
|
||||
numElements, dataType,
|
||||
root, comms[i], streams[i]);
|
||||
}
|
||||
ncclGroupEnd();
|
||||
ncclGroupStart();
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
ncclAllGather((int8_t *)datasets[0].inputs[i] + (i * byteCount),
|
||||
datasets[0].outputs[i], elemCount,
|
||||
dataType, comms[i], streams[i]);
|
||||
|
||||
ncclAllReduce(datasets[1].inputs[i], datasets[1].outputs[i],
|
||||
numElements, dataType, op, comms[i], streams[i]);
|
||||
|
||||
ncclBroadcast(datasets[2].inputs[i],
|
||||
datasets[2].outputs[i],
|
||||
numElements, dataType,
|
||||
root, comms[i], streams[i]);
|
||||
|
||||
ncclReduce(datasets[3].inputs[i],
|
||||
datasets[3].outputs[i],
|
||||
numElements, dataType, op,
|
||||
root, comms[i], streams[i]);
|
||||
|
||||
ncclReduceScatter(datasets[4].inputs[i],
|
||||
(int8_t *)datasets[4].outputs[i] + (i * byteCount),
|
||||
elemCount, dataType, op,
|
||||
comms[i], streams[i]);
|
||||
}
|
||||
ncclGroupEnd();
|
||||
// Wait for reduction to complete
|
||||
Synchronize();
|
||||
// Check results for each collective in the combined
|
||||
for (int i = 0; i < 5; i++)
|
||||
ValidateResults(datasets[i]);
|
||||
|
||||
ValidateResults(scatter_dataset);
|
||||
}
|
||||
|
||||
for (int i = 0; i < 5; i++)
|
||||
datasets[i].Release();
|
||||
scatter_dataset.Release();
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(CombinedCallsCorrectnessSweep,
|
||||
CombinedCallsCorrectnessTest,
|
||||
testing::Combine(
|
||||
// Reduction operator (not used)
|
||||
testing::Values(ncclSum),
|
||||
// Data types
|
||||
testing::Values(ncclInt8,
|
||||
ncclUint8,
|
||||
ncclInt32,
|
||||
ncclUint32,
|
||||
ncclInt64,
|
||||
ncclUint64,
|
||||
//ncclFloat16,
|
||||
ncclFloat32,
|
||||
ncclFloat64,
|
||||
ncclBfloat16),
|
||||
// Number of elements
|
||||
testing::Values(2520, 3026520),
|
||||
// Number of devices
|
||||
testing::Range(2,(GTESTS_NUM_GPUS+1)),
|
||||
// In-place or not
|
||||
testing::Values(false),
|
||||
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1", "RCCL_P2P_NET_DISABLE=0", "RCCL_P2P_NET_DISABLE=1")),
|
||||
CorrectnessTest::PrintToStringParamName());
|
||||
} // namespace
|
||||
@@ -1,17 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef TEST_COMBINEDCALLS_HPP
|
||||
#define TEST_COMBINEDCALLS_HPP
|
||||
|
||||
#include "CorrectnessTest.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
class CombinedCallsCorrectnessTest : public CorrectnessTest {};
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,81 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "test_CombinedCallsMultiProcess.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
TEST_P(CombinedCallsMultiProcessCorrectnessTest, Correctness)
|
||||
{
|
||||
// Important: Make sure the order of ncclFunc_t's here match the order of ncclFunc_ts
|
||||
// as they appear in TestCombinedCalls()
|
||||
std::vector<ncclFunc_t> ncclFuncs;
|
||||
ncclFuncs.push_back(ncclCollAllGather);
|
||||
ncclFuncs.push_back(ncclCollAllReduce);
|
||||
ncclFuncs.push_back(ncclCollBroadcast);
|
||||
ncclFuncs.push_back(ncclCollReduce);
|
||||
ncclFuncs.push_back(ncclCollReduceScatter);
|
||||
|
||||
// Create multiple datasets for combined operation
|
||||
std::vector<Dataset*> datasets(ncclFuncs.size());
|
||||
for (int i = 0; i < datasets.size(); i++)
|
||||
{
|
||||
datasets[i] = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
|
||||
datasets[i]->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclFuncs[i]);
|
||||
}
|
||||
|
||||
std::vector<int> pids(numDevices);
|
||||
|
||||
int gpu = -1;
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
gpu++;
|
||||
int pid = fork();
|
||||
if (pid == 0)
|
||||
{
|
||||
bool pass;
|
||||
TestCombinedCalls(gpu, datasets, ncclFuncs, pass);
|
||||
TerminateChildProcess(pass);
|
||||
}
|
||||
else
|
||||
{
|
||||
pids[gpu] = pid;
|
||||
}
|
||||
}
|
||||
|
||||
ValidateProcesses(pids);
|
||||
|
||||
for (int i = 0; i < datasets.size(); i++)
|
||||
{
|
||||
datasets[i]->ReleaseRootProcess();
|
||||
munmap(datasets[i], sizeof(Dataset));
|
||||
}
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(CombinedCallsMultiProcessCorrectnessSweep,
|
||||
CombinedCallsMultiProcessCorrectnessTest,
|
||||
testing::Combine(
|
||||
// Reduction operator (not used)
|
||||
testing::Values(ncclSum),
|
||||
// Data types
|
||||
testing::Values(ncclInt8,
|
||||
ncclUint8,
|
||||
ncclInt32,
|
||||
ncclUint32,
|
||||
ncclInt64,
|
||||
ncclUint64,
|
||||
//ncclFloat16,
|
||||
ncclFloat32,
|
||||
ncclFloat64,
|
||||
ncclBfloat16),
|
||||
// Number of elements
|
||||
testing::Values(3072, 3145728),
|
||||
// Number of devices
|
||||
testing::Values(2,3,4,8),
|
||||
// In-place or not
|
||||
testing::Values(false, true),
|
||||
testing::Values("")),
|
||||
CorrectnessTest::PrintToStringParamName());
|
||||
} // namespace
|
||||
@@ -1,97 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef TEST_COMBINEDCALLS_MULTI_PROCESS_HPP
|
||||
#define TEST_COMBINEDCALLS_MULTI_PROCESS_HPP
|
||||
|
||||
#include "CorrectnessTest.hpp"
|
||||
|
||||
#include "test_AllGatherMultiProcess.hpp"
|
||||
#include "test_AllReduceMultiProcess.hpp"
|
||||
#include "test_BroadcastMultiProcess.hpp"
|
||||
#include "test_ReduceMultiProcess.hpp"
|
||||
#include "test_ReduceScatterMultiProcess.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
class CombinedCallsMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
|
||||
{
|
||||
public:
|
||||
void TestCombinedCalls(int rank, std::vector<Dataset*>& datasets, std::vector<ncclFunc_t> const& funcs, bool& pass)
|
||||
{
|
||||
SetUpPerProcess(rank, funcs, comms[rank], streams[rank], datasets);
|
||||
|
||||
if (numDevices > numDevicesAvailable)
|
||||
{
|
||||
pass = true;
|
||||
return;
|
||||
}
|
||||
|
||||
Barrier barrier(rank, numDevices, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID"))));
|
||||
|
||||
// Compute expected results for each dataset in combined
|
||||
int const root = 0;
|
||||
std::vector<int> ranks(1, rank);
|
||||
AllGatherMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[0], barrier, numDevices, ranks);
|
||||
AllReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[1], barrier, op, ranks);
|
||||
BroadcastMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[2], root, ranks);
|
||||
ReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[3], barrier, op, root, ranks);
|
||||
ReduceScatterMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[4], barrier, op, ranks);
|
||||
|
||||
size_t const byteCount = datasets[0]->NumBytes() / numDevices;
|
||||
size_t const elemCount = numElements / numDevices;
|
||||
|
||||
ncclAllGather((int8_t *)datasets[0]->inputs[rank] + (rank * byteCount),
|
||||
datasets[0]->outputs[rank], elemCount,
|
||||
dataType, comms[rank], streams[rank]);
|
||||
|
||||
ncclAllReduce(datasets[1]->inputs[rank], datasets[1]->outputs[rank],
|
||||
numElements, dataType, op, comms[rank], streams[rank]);
|
||||
|
||||
ncclBroadcast(datasets[2]->inputs[rank],
|
||||
datasets[2]->outputs[rank],
|
||||
numElements, dataType,
|
||||
root, comms[rank], streams[rank]);
|
||||
|
||||
ncclReduce(datasets[3]->inputs[rank],
|
||||
datasets[3]->outputs[rank],
|
||||
numElements, dataType, op,
|
||||
root, comms[rank], streams[rank]);
|
||||
|
||||
ncclReduceScatter(datasets[4]->inputs[rank],
|
||||
(int8_t *)datasets[4]->outputs[rank] + (rank * byteCount),
|
||||
elemCount, dataType, op,
|
||||
comms[rank], streams[rank]);
|
||||
|
||||
// Wait for reduction to complete
|
||||
HIP_CALL(hipStreamSynchronize(streams[rank]));
|
||||
|
||||
for (int i = 0; i < funcs.size(); i++)
|
||||
{
|
||||
for (int j = 0; j < ranks.size(); j++)
|
||||
{
|
||||
pass = ValidateResults(*datasets[i], ranks[j], root);
|
||||
if (!pass)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
barrier.Wait();
|
||||
for (int j = 0; j < ranks.size(); j++)
|
||||
{
|
||||
datasets[i]->Release(ranks[j]);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
TearDownPerProcess(comms[ranks[i]], streams[ranks[i]]);
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,71 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "test_Gather.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
TEST_P(GatherCorrectnessTest, Correctness)
|
||||
{
|
||||
if (numDevices > numDevicesAvailable) return;
|
||||
|
||||
// Allocate data
|
||||
Dataset dataset;
|
||||
dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollGather);
|
||||
|
||||
// Test each possible root
|
||||
for (int root = 0; root < numDevices; root++)
|
||||
{
|
||||
// Prepare input / output / expected results
|
||||
FillDatasetWithPattern(dataset);
|
||||
ComputeExpectedResults(dataset, root);
|
||||
|
||||
// Launch the reduction (1 thread per GPU)
|
||||
ncclGroupStart();
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
ncclGather(dataset.inputs[i],
|
||||
dataset.outputs[i],
|
||||
numElements, dataType,
|
||||
root, comms[i], streams[i]);
|
||||
}
|
||||
ncclGroupEnd();
|
||||
|
||||
// Wait for reduction to complete
|
||||
Synchronize();
|
||||
|
||||
// Check results
|
||||
ValidateResults(dataset, root);
|
||||
}
|
||||
|
||||
dataset.Release();
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(GatherCorrectnessSweep,
|
||||
GatherCorrectnessTest,
|
||||
testing::Combine(
|
||||
// Reduction operator is not used
|
||||
testing::Values(ncclSum),
|
||||
// Data types
|
||||
testing::Values(ncclInt8,
|
||||
ncclUint8,
|
||||
ncclInt32,
|
||||
ncclUint32,
|
||||
ncclInt64,
|
||||
ncclUint64,
|
||||
//ncclFloat16,
|
||||
ncclFloat32,
|
||||
ncclFloat64,
|
||||
ncclBfloat16),
|
||||
// Number of elements
|
||||
testing::Values(1024, 1048576),
|
||||
// Number of devices
|
||||
testing::Range(2,(GTESTS_NUM_GPUS+1)),
|
||||
// In-place or not
|
||||
testing::Values(false),
|
||||
testing::Values("")),
|
||||
CorrectnessTest::PrintToStringParamName());
|
||||
} // namespace
|
||||
@@ -1,25 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#ifndef TEST_GATHER_HPP
|
||||
#define TEST_GATHER_HPP
|
||||
|
||||
#include "CorrectnessTest.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
class GatherCorrectnessTest : public CorrectnessTest
|
||||
{
|
||||
public:
|
||||
static void ComputeExpectedResults(Dataset& dataset, int const root)
|
||||
{
|
||||
for (int i = 0; i < dataset.numDevices; i++)
|
||||
HIP_CALL(hipMemcpy((int8_t *)dataset.expected[root]+dataset.NumBytes()*i, dataset.inputs[i],
|
||||
dataset.NumBytes(), hipMemcpyDeviceToHost));
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,61 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "test_GatherMultiProcess.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
TEST_P(GatherMultiProcessCorrectnessTest, Correctness)
|
||||
{
|
||||
dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollGather);
|
||||
std::vector<int> pids(numDevices);
|
||||
|
||||
int gpu = -1;
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
gpu++;
|
||||
int pid = fork();
|
||||
if (pid == 0)
|
||||
{
|
||||
bool pass;
|
||||
TestGather(gpu, *dataset, pass);
|
||||
TerminateChildProcess(pass);
|
||||
}
|
||||
else
|
||||
{
|
||||
pids[gpu] = pid;
|
||||
}
|
||||
}
|
||||
|
||||
ValidateProcesses(pids);
|
||||
dataset->ReleaseRootProcess();
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(GatherMultiProcessCorrectnessSweep,
|
||||
GatherMultiProcessCorrectnessTest,
|
||||
testing::Combine(
|
||||
// Reduction operator is not used
|
||||
testing::Values(ncclSum),
|
||||
// Data types
|
||||
testing::Values(ncclInt8,
|
||||
ncclUint8,
|
||||
ncclInt32,
|
||||
ncclUint32,
|
||||
ncclInt64,
|
||||
ncclUint64,
|
||||
//ncclFloat16,
|
||||
ncclFloat32,
|
||||
ncclFloat64,
|
||||
ncclBfloat16),
|
||||
// Number of elements
|
||||
testing::Values(1024, 1048576),
|
||||
// Number of devices
|
||||
testing::Values(2,3,4,8),
|
||||
// In-place or not
|
||||
testing::Values(false),
|
||||
testing::Values("")),
|
||||
CorrectnessTest::PrintToStringParamName());
|
||||
} // namespace
|
||||
@@ -1,63 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#ifndef TEST_GATHER_MULTI_PROCESS_HPP
|
||||
#define TEST_GATHER_MULTI_PROCESS_HPP
|
||||
|
||||
#include "CorrectnessTest.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
class GatherMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
|
||||
{
|
||||
public:
|
||||
static void ComputeExpectedResults(Dataset& dataset, int const root, int const rank)
|
||||
{
|
||||
HIP_CALL(hipMemcpy((int8_t *)dataset.expected[root]+dataset.NumBytes()*rank, dataset.inputs[rank],
|
||||
dataset.NumBytes(), hipMemcpyDeviceToHost));
|
||||
}
|
||||
|
||||
void TestGather(int rank, Dataset& dataset, bool& pass)
|
||||
{
|
||||
SetUpPerProcess(rank, ncclCollGather, comms[rank], streams[rank], dataset);
|
||||
|
||||
if (numDevices > numDevicesAvailable)
|
||||
{
|
||||
pass = true;
|
||||
return;
|
||||
}
|
||||
|
||||
Barrier barrier(rank, numDevices, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID"))));
|
||||
|
||||
// Test each possible root
|
||||
for (int root = 0; root < numDevices; root++)
|
||||
{
|
||||
// Prepare input / output / expected results
|
||||
FillDatasetWithPattern(dataset, rank);
|
||||
ComputeExpectedResults(dataset, root, rank);
|
||||
|
||||
// Launch the reduction (1 process per GPU)
|
||||
ncclGather(dataset.inputs[rank],
|
||||
dataset.outputs[rank],
|
||||
numElements, dataType,
|
||||
root, comms[rank], streams[rank]);
|
||||
|
||||
// Wait for reduction to complete
|
||||
HIP_CALL(hipStreamSynchronize(streams[rank]));
|
||||
|
||||
// Check results
|
||||
pass = ValidateResults(dataset, rank, root);
|
||||
|
||||
// Ensure all processes have finished current iteration before proceeding
|
||||
barrier.Wait();
|
||||
}
|
||||
|
||||
TearDownPerProcess(comms[rank], streams[rank]);
|
||||
dataset.Release(rank);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,130 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "test_GroupCalls.hpp"
|
||||
|
||||
#include "test_AllGather.hpp"
|
||||
#include "test_AllReduce.hpp"
|
||||
#include "test_Broadcast.hpp"
|
||||
#include "test_Reduce.hpp"
|
||||
#include "test_ReduceScatter.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
TEST_P(GroupCallsCorrectnessTest, Correctness)
|
||||
{
|
||||
if (numDevices > numDevicesAvailable) return;
|
||||
|
||||
// Create multiple datasets for group operation
|
||||
std::vector<Dataset> datasets(5);
|
||||
std::vector<ncclFunc_t> ncclFuncs(5);
|
||||
ncclFuncs.push_back(ncclCollAllGather);
|
||||
ncclFuncs.push_back(ncclCollAllReduce);
|
||||
ncclFuncs.push_back(ncclCollBroadcast);
|
||||
ncclFuncs.push_back(ncclCollReduce);
|
||||
ncclFuncs.push_back(ncclCollReduceScatter);
|
||||
|
||||
// Adjust numElements to be multiple of numDevices
|
||||
numElements = (numElements/numDevices)*numDevices;
|
||||
for (int i = 0; i < datasets.size(); i++)
|
||||
{
|
||||
datasets[i].Initialize(numDevices, numElements, dataType, inPlace, ncclFuncs[i]);
|
||||
FillDatasetWithPattern(datasets[i]);
|
||||
}
|
||||
|
||||
// Compute expected results for each dataset in group
|
||||
int const root = 0;
|
||||
AllGatherCorrectnessTest::ComputeExpectedResults(datasets[0]);
|
||||
AllReduceCorrectnessTest::ComputeExpectedResults(datasets[1], op);
|
||||
BroadcastCorrectnessTest::ComputeExpectedResults(datasets[2], root);
|
||||
ReduceCorrectnessTest::ComputeExpectedResults(datasets[3], op, root);
|
||||
ReduceScatterCorrectnessTest::ComputeExpectedResults(datasets[4], op);
|
||||
|
||||
// Start a group call
|
||||
ncclGroupStart();
|
||||
|
||||
// AllGather
|
||||
size_t const byteCount = datasets[0].NumBytes() / numDevices;
|
||||
size_t const elemCount = numElements / numDevices;
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
ncclAllGather((int8_t *)datasets[0].inputs[i] + (i * byteCount),
|
||||
datasets[0].outputs[i], elemCount,
|
||||
dataType, comms[i], streams[i]);
|
||||
}
|
||||
|
||||
// AllReduce
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
ncclAllReduce(datasets[1].inputs[i], datasets[1].outputs[i],
|
||||
numElements, dataType, op, comms[i], streams[i]);
|
||||
}
|
||||
|
||||
// Broadcast
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
ncclBroadcast(datasets[2].inputs[i],
|
||||
datasets[2].outputs[i],
|
||||
numElements, dataType,
|
||||
root, comms[i], streams[i]);
|
||||
}
|
||||
|
||||
// Reduce
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
ncclReduce(datasets[3].inputs[i],
|
||||
datasets[3].outputs[i],
|
||||
numElements, dataType, op,
|
||||
root, comms[i], streams[i]);
|
||||
}
|
||||
|
||||
// ReduceScatter
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
ncclReduceScatter(datasets[4].inputs[i],
|
||||
(int8_t *)datasets[4].outputs[i] + (i * byteCount),
|
||||
elemCount, dataType, op,
|
||||
comms[i], streams[i]);
|
||||
}
|
||||
|
||||
// Signal end of group call
|
||||
ncclGroupEnd();
|
||||
|
||||
// Wait for reduction to complete
|
||||
Synchronize();
|
||||
|
||||
// Check results for each collective in the group
|
||||
for (int i = 0; i < 5; i++)
|
||||
{
|
||||
ValidateResults(datasets[i]);
|
||||
datasets[i].Release();
|
||||
}
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(GroupCallsCorrectnessSweep,
|
||||
GroupCallsCorrectnessTest,
|
||||
testing::Combine(
|
||||
// Reduction operator (not used)
|
||||
testing::Values(ncclSum),
|
||||
// Data types
|
||||
testing::Values(ncclInt8,
|
||||
ncclUint8,
|
||||
ncclInt32,
|
||||
ncclUint32,
|
||||
ncclInt64,
|
||||
ncclUint64,
|
||||
//ncclFloat16,
|
||||
ncclFloat32,
|
||||
ncclFloat64,
|
||||
ncclBfloat16),
|
||||
// Number of elements
|
||||
testing::Values(2520, 3026520),
|
||||
// Number of devices
|
||||
testing::Range(2,(GTESTS_NUM_GPUS+1)),
|
||||
// In-place or not
|
||||
testing::Values(false, true),
|
||||
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
|
||||
CorrectnessTest::PrintToStringParamName());
|
||||
} // namespace
|
||||
@@ -1,17 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef TEST_GROUPCALLS_HPP
|
||||
#define TEST_GROUPCALLS_HPP
|
||||
|
||||
#include "CorrectnessTest.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
class GroupCallsCorrectnessTest : public CorrectnessTest {};
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,92 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include "test_GroupCallsMultiProcess.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
TEST_P(GroupCallsMultiProcessCorrectnessTest, Correctness)
|
||||
{
|
||||
// Important: Make sure the order of ncclFunc_t's here match the order of ncclFunc_ts
|
||||
// as they appear in TestGroupCalls()
|
||||
std::vector<ncclFunc_t> ncclFuncs;
|
||||
ncclFuncs.push_back(ncclCollAllGather);
|
||||
ncclFuncs.push_back(ncclCollAllReduce);
|
||||
ncclFuncs.push_back(ncclCollBroadcast);
|
||||
ncclFuncs.push_back(ncclCollReduce);
|
||||
ncclFuncs.push_back(ncclCollReduceScatter);
|
||||
|
||||
// Create multiple datasets for combined operation
|
||||
std::vector<Dataset*> datasets(ncclFuncs.size());
|
||||
for (int i = 0; i < datasets.size(); i++)
|
||||
{
|
||||
datasets[i] = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
|
||||
datasets[i]->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclFuncs[i]);
|
||||
}
|
||||
|
||||
int const numGpusPerProcess = 2;
|
||||
int const numProcesses = numDevices / numGpusPerProcess;
|
||||
std::vector<int> pids(numProcesses);
|
||||
int process = -1;
|
||||
|
||||
for (int i = 0; i < numDevices; i+= numGpusPerProcess)
|
||||
{
|
||||
process++;
|
||||
int pid = fork();
|
||||
if (pid == 0)
|
||||
{
|
||||
int gpuIdx = i;
|
||||
int maxIdx = gpuIdx + (numGpusPerProcess - 1) >= numDevices ? numDevices : gpuIdx + numGpusPerProcess;
|
||||
|
||||
std::vector<int> ranks;
|
||||
for (; gpuIdx < maxIdx; gpuIdx++)
|
||||
{
|
||||
ranks.push_back(gpuIdx);
|
||||
}
|
||||
|
||||
bool pass;
|
||||
TestGroupCalls(process, ranks, datasets, ncclFuncs, pass);
|
||||
TerminateChildProcess(pass);
|
||||
}
|
||||
else
|
||||
{
|
||||
pids[process] = pid;
|
||||
}
|
||||
}
|
||||
|
||||
ValidateProcesses(pids);
|
||||
|
||||
for (int i = 0; i < datasets.size(); i++)
|
||||
{
|
||||
datasets[i]->ReleaseRootProcess();
|
||||
munmap(datasets[i], sizeof(Dataset));
|
||||
}
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(GroupCallsMultiProcessCorrectnessSweep,
|
||||
GroupCallsMultiProcessCorrectnessTest,
|
||||
testing::Combine(
|
||||
// Reduction operator (not used)
|
||||
testing::Values(ncclSum),
|
||||
// Data types
|
||||
testing::Values(ncclInt8,
|
||||
ncclUint8,
|
||||
ncclInt32,
|
||||
ncclUint32,
|
||||
ncclInt64,
|
||||
ncclUint64,
|
||||
//ncclFloat16,
|
||||
ncclFloat32,
|
||||
ncclFloat64,
|
||||
ncclBfloat16),
|
||||
// Number of elements
|
||||
testing::Values(3072, 3145728),
|
||||
// Number of devices
|
||||
testing::Values(4,8),
|
||||
// In-place or not
|
||||
testing::Values(false, true),
|
||||
testing::Values("")),
|
||||
CorrectnessTest::PrintToStringParamName());
|
||||
} // namespace
|
||||
@@ -1,148 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef TEST_GROUPCALLS_MULTI_PROCESS_HPP
|
||||
#define TEST_GROUPCALLS_MULTI_PROCESS_HPP
|
||||
|
||||
#include "CorrectnessTest.hpp"
|
||||
#include "test_AllGatherMultiProcess.hpp"
|
||||
#include "test_AllReduceMultiProcess.hpp"
|
||||
#include "test_BroadcastMultiProcess.hpp"
|
||||
#include "test_ReduceMultiProcess.hpp"
|
||||
#include "test_ReduceScatterMultiProcess.hpp"
|
||||
|
||||
#include <string>
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
class GroupCallsMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
|
||||
{
|
||||
public:
|
||||
void TestGroupCalls(int process, std::vector<int> const& ranks, std::vector<Dataset*>& datasets, std::vector<ncclFunc_t> const& funcs, bool& pass)
|
||||
{
|
||||
ncclGroupStart();
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
SetUpPerProcess(ranks[i], funcs, comms[ranks[i]], streams[ranks[i]], datasets);
|
||||
if (numDevices > numDevicesAvailable)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
ncclGroupEnd();
|
||||
|
||||
if (numDevices > numDevicesAvailable)
|
||||
{
|
||||
pass = true;
|
||||
return;
|
||||
}
|
||||
|
||||
int numProcesses = numDevices / ranks.size();
|
||||
Barrier barrier(process, numProcesses, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID"))));
|
||||
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
for (int j = 0; j < datasets.size(); j++)
|
||||
{
|
||||
FillDatasetWithPattern(*datasets[j], ranks[i]);
|
||||
}
|
||||
}
|
||||
|
||||
int const root = 0;
|
||||
|
||||
AllGatherMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[0], barrier, numDevices, ranks);
|
||||
AllReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[1], barrier, op, ranks);
|
||||
BroadcastMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[2], root, ranks);
|
||||
ReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[3], barrier, op, root, ranks);
|
||||
ReduceScatterMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[4], barrier, op, ranks);
|
||||
barrier.Wait();
|
||||
|
||||
size_t const byteCount = datasets[0]->NumBytes() / numDevices;
|
||||
size_t const elemCount = numElements / numDevices;
|
||||
|
||||
ncclGroupStart();
|
||||
// AllGather
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
int rank = ranks[i];
|
||||
ncclAllGather((int8_t *)datasets[0]->inputs[rank] + (rank * byteCount),
|
||||
datasets[0]->outputs[rank], elemCount,
|
||||
dataType, comms[rank], streams[rank]);
|
||||
}
|
||||
|
||||
// AllReduce
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
int rank = ranks[i];
|
||||
ncclAllReduce(datasets[1]->inputs[rank], datasets[1]->outputs[rank],
|
||||
numElements, dataType, op, comms[rank], streams[rank]);
|
||||
}
|
||||
|
||||
// Broadcast
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
int rank = ranks[i];
|
||||
ncclBroadcast(datasets[2]->inputs[rank],
|
||||
datasets[2]->outputs[rank],
|
||||
numElements, dataType,
|
||||
root, comms[rank], streams[rank]);
|
||||
}
|
||||
|
||||
// Reduce
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
int rank = ranks[i];
|
||||
ncclReduce(datasets[3]->inputs[rank],
|
||||
datasets[3]->outputs[rank],
|
||||
numElements, dataType, op,
|
||||
root, comms[rank], streams[rank]);
|
||||
}
|
||||
|
||||
// ReduceScatter
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
int rank = ranks[i];
|
||||
ncclReduceScatter(datasets[4]->inputs[rank],
|
||||
(int8_t *)datasets[4]->outputs[rank] + (rank * byteCount),
|
||||
elemCount, dataType, op,
|
||||
comms[rank], streams[rank]);
|
||||
}
|
||||
|
||||
// Signal end of group call
|
||||
ncclGroupEnd();
|
||||
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
HIP_CALL(hipSetDevice(ranks[i]));
|
||||
HIP_CALL(hipStreamSynchronize(streams[ranks[i]]));
|
||||
}
|
||||
|
||||
for (int i = 0; i < funcs.size(); i++)
|
||||
{
|
||||
for (int j = 0; j < ranks.size(); j++)
|
||||
{
|
||||
pass = ValidateResults(*datasets[i], ranks[j], root);
|
||||
if (!pass)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
barrier.Wait();
|
||||
for (int j = 0; j < ranks.size(); j++)
|
||||
{
|
||||
datasets[i]->Release(ranks[j]);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
TearDownPerProcess(comms[ranks[i]], streams[ranks[i]]);
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,71 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "test_Reduce.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
TEST_P(ReduceCorrectnessTest, Correctness)
|
||||
{
|
||||
if (numDevices > numDevicesAvailable) return;
|
||||
|
||||
// Allocate data
|
||||
Dataset dataset;
|
||||
dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollReduce);
|
||||
|
||||
// Test each possible root
|
||||
for (int root = 0; root < numDevices; root++)
|
||||
{
|
||||
// Prepare input / output / expected results
|
||||
FillDatasetWithPattern(dataset);
|
||||
ComputeExpectedResults(dataset, op, root);
|
||||
|
||||
// Launch the reduction (1 thread per GPU)
|
||||
ncclGroupStart();
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
ncclReduce(dataset.inputs[i],
|
||||
dataset.outputs[i],
|
||||
numElements, dataType, op,
|
||||
root, comms[i], streams[i]);
|
||||
}
|
||||
ncclGroupEnd();
|
||||
|
||||
// Wait for reduction to complete
|
||||
Synchronize();
|
||||
|
||||
// Check results
|
||||
ValidateResults(dataset);
|
||||
}
|
||||
|
||||
dataset.Release();
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(ReduceCorrectnessSweep,
|
||||
ReduceCorrectnessTest,
|
||||
testing::Combine(
|
||||
// Reduction operator
|
||||
testing::Values(ncclSum, ncclProd, ncclMax, ncclMin, ncclAvg),
|
||||
// Data types
|
||||
testing::Values(ncclInt8,
|
||||
ncclUint8,
|
||||
ncclInt32,
|
||||
ncclUint32,
|
||||
ncclInt64,
|
||||
ncclUint64,
|
||||
//ncclFloat16,
|
||||
ncclFloat32,
|
||||
ncclFloat64,
|
||||
ncclBfloat16),
|
||||
// Number of elements
|
||||
testing::Values(1024, 1048576),
|
||||
// Number of devices
|
||||
testing::Range(2,(GTESTS_NUM_GPUS+1)),
|
||||
// In-place or not
|
||||
testing::Values(false, true),
|
||||
testing::Values("")),
|
||||
CorrectnessTest::PrintToStringParamName());
|
||||
} // namespace
|
||||
@@ -1,87 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#ifndef TEST_REDUCE_HPP
|
||||
#define TEST_REDUCE_HPP
|
||||
|
||||
#include "CorrectnessTest.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
class ReduceCorrectnessTest : public CorrectnessTest
|
||||
{
|
||||
public:
|
||||
static void ComputeExpectedResults(Dataset& dataset, ncclRedOp_t const op, int const root)
|
||||
{
|
||||
// Copy all inputs to expected arrays temporarily to perform reduction on host
|
||||
for (int i = 0; i < dataset.numDevices; i++)
|
||||
HIP_CALL(hipMemcpy(dataset.expected[i], dataset.inputs[i],
|
||||
dataset.NumBytes(), hipMemcpyDeviceToHost));
|
||||
|
||||
// Allocate temporary host array to accumulate results
|
||||
int8_t* resultI1 = (int8_t *)malloc(dataset.NumBytes());
|
||||
uint8_t* resultU1 = (uint8_t *)resultI1;
|
||||
int32_t* resultI4 = (int32_t *)resultI1;
|
||||
uint32_t* resultU4 = (uint32_t *)resultI1;
|
||||
int64_t* resultI8 = (int64_t *)resultI1;
|
||||
uint64_t* resultU8 = (uint64_t *)resultI1;
|
||||
float* resultF4 = (float *)resultI1;
|
||||
double* resultF8 = (double *)resultI1;
|
||||
rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1;
|
||||
|
||||
// Initialize the result with the first device's array
|
||||
memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
|
||||
ncclRedOp_t red_op = ((op == ncclAvg) ? ncclSum : op);
|
||||
|
||||
// Perform reduction on the other device arrays
|
||||
for (int i = 1; i < dataset.numDevices; i++)
|
||||
{
|
||||
int8_t* arrayI1 = (int8_t *)dataset.expected[i];
|
||||
uint8_t* arrayU1 = (uint8_t *)arrayI1;
|
||||
int32_t* arrayI4 = (int32_t *)arrayI1;
|
||||
uint32_t* arrayU4 = (uint32_t *)arrayI1;
|
||||
int64_t* arrayI8 = (int64_t *)arrayI1;
|
||||
uint64_t* arrayU8 = (uint64_t *)arrayI1;
|
||||
float* arrayF4 = (float *)arrayI1;
|
||||
double* arrayF8 = (double *)arrayI1;
|
||||
rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1;
|
||||
|
||||
for (int j = 0; j < dataset.numElements; j++)
|
||||
{
|
||||
switch (dataset.dataType)
|
||||
{
|
||||
case ncclInt8: resultI1[j] = ReduceOp(red_op, resultI1[j], arrayI1[j]); break;
|
||||
case ncclUint8: resultU1[j] = ReduceOp(red_op, resultU1[j], arrayU1[j]); break;
|
||||
case ncclInt32: resultI4[j] = ReduceOp(red_op, resultI4[j], arrayI4[j]); break;
|
||||
case ncclUint32: resultU4[j] = ReduceOp(red_op, resultU4[j], arrayU4[j]); break;
|
||||
case ncclInt64: resultI8[j] = ReduceOp(red_op, resultI8[j], arrayI8[j]); break;
|
||||
case ncclUint64: resultU8[j] = ReduceOp(red_op, resultU8[j], arrayU8[j]); break;
|
||||
case ncclFloat32: resultF4[j] = ReduceOp(red_op, resultF4[j], arrayF4[j]); break;
|
||||
case ncclFloat64: resultF8[j] = ReduceOp(red_op, resultF8[j], arrayF8[j]); break;
|
||||
case ncclBfloat16: resultB2[j] = ReduceOp(red_op, resultB2[j], arrayB2[j]); break;
|
||||
default:
|
||||
fprintf(stderr, "[ERROR] Unsupported datatype\n");
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (op == ncclAvg)
|
||||
Average(dataset, resultI1);
|
||||
|
||||
// Copy results into expected arrays
|
||||
for (int i = 0; i < dataset.numDevices; i++)
|
||||
{
|
||||
if (i == root)
|
||||
memcpy(dataset.expected[root], resultI1, dataset.NumBytes());
|
||||
else
|
||||
HIP_CALL(hipMemcpy(dataset.expected[i], dataset.outputs[i], dataset.NumBytes(), hipMemcpyDeviceToHost));
|
||||
}
|
||||
free(resultI1);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,61 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "test_ReduceMultiProcess.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
TEST_P(ReduceMultiProcessCorrectnessTest, Correctness)
|
||||
{
|
||||
dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollReduce);
|
||||
std::vector<int> pids(numDevices);
|
||||
|
||||
int gpu = -1;
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
gpu++;
|
||||
int pid = fork();
|
||||
if (pid == 0)
|
||||
{
|
||||
bool pass;
|
||||
TestReduce(gpu, *dataset, pass);
|
||||
TerminateChildProcess(pass);
|
||||
}
|
||||
else
|
||||
{
|
||||
pids[gpu] = pid;
|
||||
}
|
||||
}
|
||||
|
||||
ValidateProcesses(pids);
|
||||
dataset->ReleaseRootProcess();
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(ReduceMultiProcessCorrectnessSweep,
|
||||
ReduceMultiProcessCorrectnessTest,
|
||||
testing::Combine(
|
||||
// Reduction operator
|
||||
testing::Values(ncclSum, ncclProd, ncclMax, ncclMin),
|
||||
// Data types
|
||||
testing::Values(ncclInt8,
|
||||
ncclUint8,
|
||||
ncclInt32,
|
||||
ncclUint32,
|
||||
ncclInt64,
|
||||
ncclUint64,
|
||||
//ncclFloat16,
|
||||
ncclFloat32,
|
||||
ncclFloat64,
|
||||
ncclBfloat16),
|
||||
// Number of elements
|
||||
testing::Values(1024, 1048576),
|
||||
// Number of devices
|
||||
testing::Values(2,3,4,8),
|
||||
// In-place or not
|
||||
testing::Values(false, true),
|
||||
testing::Values("")),
|
||||
CorrectnessTest::PrintToStringParamName());
|
||||
} // namespace
|
||||
@@ -1,131 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#ifndef TEST_REDUCE_MULTI_PROCESS_HPP
|
||||
#define TEST_REDUCE_MULTI_PROCESS_HPP
|
||||
|
||||
#include "CorrectnessTest.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
class ReduceMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
|
||||
{
|
||||
public:
|
||||
static void ComputeExpectedResults(Dataset& dataset, Barrier& barrier, ncclRedOp_t const op, int const root, std::vector<int> const& ranks)
|
||||
{
|
||||
// Copy all inputs to expected arrays temporarily to perform reduction on host
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
int rank = ranks[i];
|
||||
HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.inputs[rank],
|
||||
dataset.NumBytes(), hipMemcpyDeviceToHost));
|
||||
}
|
||||
barrier.Wait();
|
||||
|
||||
for (int h = 0; h < ranks.size(); h++)
|
||||
{
|
||||
int rank = ranks[h];
|
||||
if (rank == root)
|
||||
{
|
||||
// Allocate temporary host array to accumulate results
|
||||
int8_t* resultI1 = (int8_t *)malloc(dataset.NumBytes());
|
||||
uint8_t* resultU1 = (uint8_t *)resultI1;
|
||||
int32_t* resultI4 = (int32_t *)resultI1;
|
||||
uint32_t* resultU4 = (uint32_t *)resultI1;
|
||||
int64_t* resultI8 = (int64_t *)resultI1;
|
||||
uint64_t* resultU8 = (uint64_t *)resultI1;
|
||||
float* resultF4 = (float *)resultI1;
|
||||
double* resultF8 = (double *)resultI1;
|
||||
rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1;
|
||||
|
||||
// Initialize the result with the first device's array
|
||||
memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
|
||||
|
||||
// Perform reduction on the other device arrays
|
||||
for (int i = 1; i < dataset.numDevices; i++)
|
||||
{
|
||||
int8_t* arrayI1 = (int8_t *)dataset.expected[i];
|
||||
uint8_t* arrayU1 = (uint8_t *)arrayI1;
|
||||
int32_t* arrayI4 = (int32_t *)arrayI1;
|
||||
uint32_t* arrayU4 = (uint32_t *)arrayI1;
|
||||
int64_t* arrayI8 = (int64_t *)arrayI1;
|
||||
uint64_t* arrayU8 = (uint64_t *)arrayI1;
|
||||
float* arrayF4 = (float *)arrayI1;
|
||||
double* arrayF8 = (double *)arrayI1;
|
||||
rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1;
|
||||
|
||||
for (int j = 0; j < dataset.numElements; j++)
|
||||
{
|
||||
switch (dataset.dataType)
|
||||
{
|
||||
case ncclInt8: resultI1[j] = ReduceOp(op, resultI1[j], arrayI1[j]); break;
|
||||
case ncclUint8: resultU1[j] = ReduceOp(op, resultU1[j], arrayU1[j]); break;
|
||||
case ncclInt32: resultI4[j] = ReduceOp(op, resultI4[j], arrayI4[j]); break;
|
||||
case ncclUint32: resultU4[j] = ReduceOp(op, resultU4[j], arrayU4[j]); break;
|
||||
case ncclInt64: resultI8[j] = ReduceOp(op, resultI8[j], arrayI8[j]); break;
|
||||
case ncclUint64: resultU8[j] = ReduceOp(op, resultU8[j], arrayU8[j]); break;
|
||||
case ncclFloat32: resultF4[j] = ReduceOp(op, resultF4[j], arrayF4[j]); break;
|
||||
case ncclFloat64: resultF8[j] = ReduceOp(op, resultF8[j], arrayF8[j]); break;
|
||||
case ncclBfloat16: resultB2[j] = ReduceOp(op, resultB2[j], arrayB2[j]); break;
|
||||
default:
|
||||
fprintf(stderr, "[ERROR] Unsupported datatype\n");
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
memcpy(dataset.expected[root], resultI1, dataset.NumBytes());
|
||||
free(resultI1);
|
||||
}
|
||||
}
|
||||
barrier.Wait();
|
||||
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
int rank = ranks[i];
|
||||
if (rank != root)
|
||||
{
|
||||
HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.outputs[rank], dataset.NumBytes(), hipMemcpyDeviceToHost));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void TestReduce(int rank, Dataset& dataset, bool& pass)
|
||||
{
|
||||
SetUpPerProcess(rank, ncclCollReduce, comms[rank], streams[rank], dataset);
|
||||
|
||||
if (numDevices > numDevicesAvailable)
|
||||
{
|
||||
pass = true;
|
||||
return;
|
||||
}
|
||||
|
||||
Barrier barrier(rank, numDevices, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID"))));
|
||||
|
||||
// Test each possible root
|
||||
for (int root = 0; root < numDevices; root++)
|
||||
{
|
||||
// Prepare input / output / expected results
|
||||
FillDatasetWithPattern(dataset, rank);
|
||||
ComputeExpectedResults(dataset, barrier, op, root, std::vector<int>(1, rank));
|
||||
// Launch the reduction (1 process per GPU)
|
||||
ncclResult_t res = ncclReduce(dataset.inputs[rank],
|
||||
dataset.outputs[rank],
|
||||
numElements, dataType, op,
|
||||
root, comms[rank], streams[rank]);
|
||||
// Wait for reduction to complete
|
||||
HIP_CALL(hipStreamSynchronize(streams[rank]));
|
||||
// Check results
|
||||
pass = ValidateResults(dataset, rank);
|
||||
// Ensure all processes have finished current iteration before proceeding
|
||||
barrier.Wait();
|
||||
}
|
||||
|
||||
TearDownPerProcess(comms[rank], streams[rank]);
|
||||
dataset.Release(rank);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,71 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "test_ReduceScatter.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
TEST_P(ReduceScatterCorrectnessTest, Correctness)
|
||||
{
|
||||
// Adjust numElements to be multiple of numDevices
|
||||
numElements = (numElements/numDevices)*numDevices;
|
||||
if (numDevices > numDevicesAvailable) return;
|
||||
if (numElements % numDevices != 0) return;
|
||||
|
||||
// Prepare input / output / expected results
|
||||
Dataset dataset;
|
||||
dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollReduceScatter);
|
||||
FillDatasetWithPattern(dataset);
|
||||
ComputeExpectedResults(dataset, op);
|
||||
|
||||
size_t const byteCount = dataset.NumBytes() / dataset.numDevices;
|
||||
size_t const recvCount = dataset.numElements / dataset.numDevices;
|
||||
|
||||
// Launch the reduction (1 thread per GPU)
|
||||
ncclGroupStart();
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
ncclReduceScatter(dataset.inputs[i],
|
||||
(int8_t *)dataset.outputs[i] + (i * byteCount),
|
||||
recvCount, dataType, op,
|
||||
comms[i], streams[i]);
|
||||
}
|
||||
ncclGroupEnd();
|
||||
|
||||
// Wait for reduction to complete
|
||||
Synchronize();
|
||||
|
||||
// Check results
|
||||
ValidateResults(dataset);
|
||||
|
||||
dataset.Release();
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(ReduceScatterCorrectnessSweep,
|
||||
ReduceScatterCorrectnessTest,
|
||||
testing::Combine(
|
||||
// Reduction operator
|
||||
testing::Values(ncclSum, ncclProd, ncclMax, ncclMin, ncclAvg),
|
||||
// Data types
|
||||
testing::Values(ncclInt8,
|
||||
ncclUint8,
|
||||
ncclInt32,
|
||||
ncclUint32,
|
||||
ncclInt64,
|
||||
ncclUint64,
|
||||
//ncclFloat16,
|
||||
ncclFloat32,
|
||||
ncclFloat64,
|
||||
ncclBfloat16),
|
||||
// Number of elements
|
||||
testing::Values(2520, 3026520),
|
||||
// Number of devices
|
||||
testing::Range(2,(GTESTS_NUM_GPUS+1)),
|
||||
// In-place or not
|
||||
testing::Values(false, true),
|
||||
testing::Values("")),
|
||||
CorrectnessTest::PrintToStringParamName());
|
||||
} // namespace
|
||||
@@ -1,90 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#ifndef TEST_REDUCE_SCATTER_HPP
|
||||
#define TEST_REDUCE_SCATTER_HPP
|
||||
|
||||
#include "CorrectnessTest.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
class ReduceScatterCorrectnessTest : public CorrectnessTest
|
||||
{
|
||||
public:
|
||||
static void ComputeExpectedResults(Dataset& dataset, ncclRedOp_t const op)
|
||||
{
|
||||
// Copy all inputs to expected arrays temporarily to perform reduction on host
|
||||
for (int i = 0; i < dataset.numDevices; i++)
|
||||
HIP_CALL(hipMemcpy(dataset.expected[i], dataset.inputs[i],
|
||||
dataset.NumBytes(), hipMemcpyDeviceToHost));
|
||||
|
||||
// Allocate temporary host array to accumulate results
|
||||
int8_t* resultI1 = (int8_t *)malloc(dataset.NumBytes());
|
||||
uint8_t* resultU1 = (uint8_t *)resultI1;
|
||||
int32_t* resultI4 = (int32_t *)resultI1;
|
||||
uint32_t* resultU4 = (uint32_t *)resultI1;
|
||||
int64_t* resultI8 = (int64_t *)resultI1;
|
||||
uint64_t* resultU8 = (uint64_t *)resultI1;
|
||||
float* resultF4 = (float *)resultI1;
|
||||
double* resultF8 = (double *)resultI1;
|
||||
rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1;
|
||||
|
||||
// Initialize the result with the first device's array
|
||||
memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
|
||||
ncclRedOp_t red_op = ((op == ncclAvg) ? ncclSum : op);
|
||||
|
||||
// Perform reduction on the other device arrays
|
||||
for (int i = 1; i < dataset.numDevices; i++)
|
||||
{
|
||||
int8_t* arrayI1 = (int8_t *)dataset.expected[i];
|
||||
uint8_t* arrayU1 = (uint8_t *)arrayI1;
|
||||
int32_t* arrayI4 = (int32_t *)arrayI1;
|
||||
uint32_t* arrayU4 = (uint32_t *)arrayI1;
|
||||
int64_t* arrayI8 = (int64_t *)arrayI1;
|
||||
uint64_t* arrayU8 = (uint64_t *)arrayI1;
|
||||
float* arrayF4 = (float *)arrayI1;
|
||||
double* arrayF8 = (double *)arrayI1;
|
||||
rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1;
|
||||
|
||||
for (int j = 0; j < dataset.numElements; j++)
|
||||
{
|
||||
switch (dataset.dataType)
|
||||
{
|
||||
case ncclInt8: resultI1[j] = ReduceOp(red_op, resultI1[j], arrayI1[j]); break;
|
||||
case ncclUint8: resultU1[j] = ReduceOp(red_op, resultU1[j], arrayU1[j]); break;
|
||||
case ncclInt32: resultI4[j] = ReduceOp(red_op, resultI4[j], arrayI4[j]); break;
|
||||
case ncclUint32: resultU4[j] = ReduceOp(red_op, resultU4[j], arrayU4[j]); break;
|
||||
case ncclInt64: resultI8[j] = ReduceOp(red_op, resultI8[j], arrayI8[j]); break;
|
||||
case ncclUint64: resultU8[j] = ReduceOp(red_op, resultU8[j], arrayU8[j]); break;
|
||||
case ncclFloat32: resultF4[j] = ReduceOp(red_op, resultF4[j], arrayF4[j]); break;
|
||||
case ncclFloat64: resultF8[j] = ReduceOp(red_op, resultF8[j], arrayF8[j]); break;
|
||||
case ncclBfloat16: resultB2[j] = ReduceOp(red_op, resultB2[j], arrayB2[j]); break;
|
||||
default:
|
||||
fprintf(stderr, "[ERROR] Unsupported datatype\n");
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (op == ncclAvg)
|
||||
Average(dataset, resultI1);
|
||||
|
||||
// Copy results into expected arrays
|
||||
size_t const byteCount = dataset.NumBytes() / dataset.numDevices;
|
||||
|
||||
for (int i = 0; i < dataset.numDevices; i++)
|
||||
HIP_CALL(hipMemcpy(dataset.expected[i], dataset.outputs[i],
|
||||
dataset.NumBytes(), hipMemcpyDeviceToHost));
|
||||
|
||||
for (int i = 0; i < dataset.numDevices; i++)
|
||||
memcpy((int8_t *)dataset.expected[i] + (i * byteCount),
|
||||
resultI1 + (i * byteCount), byteCount);
|
||||
|
||||
free(resultI1);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,61 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "test_ReduceScatterMultiProcess.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
TEST_P(ReduceScatterMultiProcessCorrectnessTest, Correctness)
|
||||
{
|
||||
dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollReduceScatter);
|
||||
std::vector<int> pids(numDevices);
|
||||
|
||||
int gpu = -1;
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
gpu++;
|
||||
int pid = fork();
|
||||
if (pid == 0)
|
||||
{
|
||||
bool pass;
|
||||
TestReduceScatter(gpu, *dataset, pass);
|
||||
TerminateChildProcess(pass);
|
||||
}
|
||||
else
|
||||
{
|
||||
pids[gpu] = pid;
|
||||
}
|
||||
}
|
||||
|
||||
ValidateProcesses(pids);
|
||||
dataset->ReleaseRootProcess();
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(ReduceScatterMultiProcessCorrectnessSweep,
|
||||
ReduceScatterMultiProcessCorrectnessTest,
|
||||
testing::Combine(
|
||||
// Reduction operator
|
||||
testing::Values(ncclSum, ncclProd, ncclMax, ncclMin),
|
||||
// Data types
|
||||
testing::Values(ncclInt8,
|
||||
ncclUint8,
|
||||
ncclInt32,
|
||||
ncclUint32,
|
||||
ncclInt64,
|
||||
ncclUint64,
|
||||
//ncclFloat16,
|
||||
ncclFloat32,
|
||||
ncclFloat64,
|
||||
ncclBfloat16),
|
||||
// Number of elements
|
||||
testing::Values(3072, 3145728),
|
||||
// Number of devices
|
||||
testing::Values(2,3,4,8),
|
||||
// In-place or not
|
||||
testing::Values(false, true),
|
||||
testing::Values("")),
|
||||
CorrectnessTest::PrintToStringParamName());
|
||||
} // namespace
|
||||
@@ -1,146 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#ifndef TEST_REDUCE_SCATTER_MULTI_PROCESS_HPP
|
||||
#define TEST_REDUCE_SCATTER_MULTI_PROCESS_HPP
|
||||
|
||||
#include "CorrectnessTest.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
class ReduceScatterMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
|
||||
{
|
||||
public:
|
||||
static void ComputeExpectedResults(Dataset& dataset, Barrier& barrier, ncclRedOp_t const op, std::vector<int> const& ranks)
|
||||
{
|
||||
// Copy all inputs to expected arrays temporarily to perform reduction on host
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
int rank = ranks[i];
|
||||
HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.inputs[rank],
|
||||
dataset.NumBytes(), hipMemcpyDeviceToHost));
|
||||
}
|
||||
barrier.Wait();
|
||||
|
||||
// Have rank 0 do the expected calculation, then send results to other processes
|
||||
int8_t* resultI1;
|
||||
for (int h = 0; h < ranks.size(); h++)
|
||||
{
|
||||
int rank = ranks[h];
|
||||
if (rank == 0)
|
||||
{
|
||||
// Allocate temporary host array to accumulate results
|
||||
resultI1 = (int8_t *)malloc(dataset.NumBytes());
|
||||
uint8_t* resultU1 = (uint8_t *)resultI1;
|
||||
int32_t* resultI4 = (int32_t *)resultI1;
|
||||
uint32_t* resultU4 = (uint32_t *)resultI1;
|
||||
int64_t* resultI8 = (int64_t *)resultI1;
|
||||
uint64_t* resultU8 = (uint64_t *)resultI1;
|
||||
float* resultF4 = (float *)resultI1;
|
||||
double* resultF8 = (double *)resultI1;
|
||||
rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1;
|
||||
|
||||
// Initialize the result with the first device's array
|
||||
memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
|
||||
|
||||
// Perform reduction on the other device arrays
|
||||
for (int i = 1; i < dataset.numDevices; i++)
|
||||
{
|
||||
int8_t* arrayI1 = (int8_t *)dataset.expected[i];
|
||||
uint8_t* arrayU1 = (uint8_t *)arrayI1;
|
||||
int32_t* arrayI4 = (int32_t *)arrayI1;
|
||||
uint32_t* arrayU4 = (uint32_t *)arrayI1;
|
||||
int64_t* arrayI8 = (int64_t *)arrayI1;
|
||||
uint64_t* arrayU8 = (uint64_t *)arrayI1;
|
||||
float* arrayF4 = (float *)arrayI1;
|
||||
double* arrayF8 = (double *)arrayI1;
|
||||
rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1;
|
||||
|
||||
for (int j = 0; j < dataset.numElements; j++)
|
||||
{
|
||||
switch (dataset.dataType)
|
||||
{
|
||||
case ncclInt8: resultI1[j] = ReduceOp(op, resultI1[j], arrayI1[j]); break;
|
||||
case ncclUint8: resultU1[j] = ReduceOp(op, resultU1[j], arrayU1[j]); break;
|
||||
case ncclInt32: resultI4[j] = ReduceOp(op, resultI4[j], arrayI4[j]); break;
|
||||
case ncclUint32: resultU4[j] = ReduceOp(op, resultU4[j], arrayU4[j]); break;
|
||||
case ncclInt64: resultI8[j] = ReduceOp(op, resultI8[j], arrayI8[j]); break;
|
||||
case ncclUint64: resultU8[j] = ReduceOp(op, resultU8[j], arrayU8[j]); break;
|
||||
case ncclFloat32: resultF4[j] = ReduceOp(op, resultF4[j], arrayF4[j]); break;
|
||||
case ncclFloat64: resultF8[j] = ReduceOp(op, resultF8[j], arrayF8[j]); break;
|
||||
case ncclBfloat16: resultB2[j] = ReduceOp(op, resultB2[j], arrayB2[j]); break;
|
||||
default:
|
||||
fprintf(stderr, "[ERROR] Unsupported datatype\n");
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
barrier.Wait();
|
||||
// Copy results into expected arrays
|
||||
size_t const byteCount = dataset.NumBytes() / dataset.numDevices;
|
||||
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
int rank = ranks[i];
|
||||
HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.outputs[rank],
|
||||
dataset.NumBytes(), hipMemcpyDeviceToHost));
|
||||
}
|
||||
barrier.Wait();
|
||||
|
||||
for (int h = 0; h < ranks.size(); h++)
|
||||
{
|
||||
int rank = ranks[h];
|
||||
if (rank == 0)
|
||||
{
|
||||
for (int i = 0; i < dataset.numDevices; i++)
|
||||
memcpy((int8_t *)dataset.expected[i] + (i * byteCount),
|
||||
resultI1 + (i * byteCount), byteCount);
|
||||
|
||||
free(resultI1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void TestReduceScatter(int rank, Dataset& dataset, bool& pass)
|
||||
{
|
||||
// Prepare input / output / expected results
|
||||
SetUpPerProcess(rank, ncclCollAllGather, comms[rank], streams[rank], dataset);
|
||||
|
||||
if (numDevices > numDevicesAvailable || numElements % numDevices != 0)
|
||||
{
|
||||
pass = true;
|
||||
return;
|
||||
}
|
||||
|
||||
Barrier barrier(rank, numDevices, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID"))));
|
||||
|
||||
// Prepare input / output / expected results
|
||||
FillDatasetWithPattern(dataset, rank);
|
||||
ComputeExpectedResults(dataset, barrier, op, std::vector<int>(1, rank));
|
||||
|
||||
size_t const byteCount = dataset.NumBytes() / numDevices;
|
||||
size_t const recvCount = dataset.numElements / numDevices;
|
||||
|
||||
// Launch the reduction (1 process per GPU)
|
||||
ncclReduceScatter(dataset.inputs[rank],
|
||||
(int8_t *)dataset.outputs[rank] + (rank * byteCount),
|
||||
recvCount, dataType, op,
|
||||
comms[rank], streams[rank]);
|
||||
|
||||
// Wait for reduction to complete
|
||||
HIP_CALL(hipStreamSynchronize(streams[rank]));
|
||||
|
||||
// Check results
|
||||
pass = ValidateResults(dataset, rank);
|
||||
|
||||
TearDownPerProcess(comms[rank], streams[rank]);
|
||||
dataset.Release(rank);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,71 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "test_Scatter.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
TEST_P(ScatterCorrectnessTest, Correctness)
|
||||
{
|
||||
if (numDevices > numDevicesAvailable) return;
|
||||
|
||||
// Allocate data
|
||||
Dataset dataset;
|
||||
dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollScatter);
|
||||
|
||||
// Test each possible root
|
||||
for (int root = 0; root < numDevices; root++)
|
||||
{
|
||||
// Prepare input / output / expected results
|
||||
FillDatasetWithPattern(dataset);
|
||||
ComputeExpectedResults(dataset, root);
|
||||
|
||||
// Launch the reduction (1 thread per GPU)
|
||||
ncclGroupStart();
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
ncclScatter(dataset.inputs[i],
|
||||
dataset.outputs[i],
|
||||
numElements, dataType,
|
||||
root, comms[i], streams[i]);
|
||||
}
|
||||
ncclGroupEnd();
|
||||
|
||||
// Wait for reduction to complete
|
||||
Synchronize();
|
||||
|
||||
// Check results
|
||||
ValidateResults(dataset);
|
||||
}
|
||||
|
||||
dataset.Release();
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(ScatterCorrectnessSweep,
|
||||
ScatterCorrectnessTest,
|
||||
testing::Combine(
|
||||
// Reduction operator is not used
|
||||
testing::Values(ncclSum),
|
||||
// Data types
|
||||
testing::Values(ncclInt8,
|
||||
ncclUint8,
|
||||
ncclInt32,
|
||||
ncclUint32,
|
||||
ncclInt64,
|
||||
ncclUint64,
|
||||
//ncclFloat16,
|
||||
ncclFloat32,
|
||||
ncclFloat64,
|
||||
ncclBfloat16),
|
||||
// Number of elements
|
||||
testing::Values(1024, 1048576),
|
||||
// Number of devices
|
||||
testing::Range(2,(GTESTS_NUM_GPUS+1)),
|
||||
// In-place or not
|
||||
testing::Values(false),
|
||||
testing::Values("")),
|
||||
CorrectnessTest::PrintToStringParamName());
|
||||
} // namespace
|
||||
@@ -1,25 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#ifndef TEST_SCATTER_HPP
|
||||
#define TEST_SCATTER_HPP
|
||||
|
||||
#include "CorrectnessTest.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
class ScatterCorrectnessTest : public CorrectnessTest
|
||||
{
|
||||
public:
|
||||
static void ComputeExpectedResults(Dataset& dataset, int const root)
|
||||
{
|
||||
for (int i = 0; i < dataset.numDevices; i++)
|
||||
HIP_CALL(hipMemcpy(dataset.expected[i], (int8_t *)dataset.inputs[root]+dataset.NumBytes()*i,
|
||||
dataset.NumBytes(), hipMemcpyDeviceToHost));
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,61 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "test_ScatterMultiProcess.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
TEST_P(ScatterMultiProcessCorrectnessTest, Correctness)
|
||||
{
|
||||
dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollScatter);
|
||||
std::vector<int> pids(numDevices);
|
||||
|
||||
int gpu = -1;
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
gpu++;
|
||||
int pid = fork();
|
||||
if (pid == 0)
|
||||
{
|
||||
bool pass;
|
||||
TestScatter(gpu, *dataset, pass);
|
||||
TerminateChildProcess(pass);
|
||||
}
|
||||
else
|
||||
{
|
||||
pids[gpu] = pid;
|
||||
}
|
||||
}
|
||||
|
||||
ValidateProcesses(pids);
|
||||
dataset->ReleaseRootProcess();
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(ScatterMultiProcessCorrectnessSweep,
|
||||
ScatterMultiProcessCorrectnessTest,
|
||||
testing::Combine(
|
||||
// Reduction operator is not used
|
||||
testing::Values(ncclSum),
|
||||
// Data types
|
||||
testing::Values(ncclInt8,
|
||||
ncclUint8,
|
||||
ncclInt32,
|
||||
ncclUint32,
|
||||
ncclInt64,
|
||||
ncclUint64,
|
||||
//ncclFloat16,
|
||||
ncclFloat32,
|
||||
ncclFloat64,
|
||||
ncclBfloat16),
|
||||
// Number of elements
|
||||
testing::Values(1024, 1048576),
|
||||
// Number of devices
|
||||
testing::Values(2,3,4,8),
|
||||
// In-place or not
|
||||
testing::Values(false),
|
||||
testing::Values("")),
|
||||
CorrectnessTest::PrintToStringParamName());
|
||||
} // namespace
|
||||
@@ -1,68 +0,0 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#ifndef TEST_SCATTER_MULTI_PROCESS_HPP
|
||||
#define TEST_SCATTER_MULTI_PROCESS_HPP
|
||||
|
||||
#include "CorrectnessTest.hpp"
|
||||
|
||||
namespace CorrectnessTests
|
||||
{
|
||||
class ScatterMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
|
||||
{
|
||||
public:
|
||||
static void ComputeExpectedResults(Dataset& dataset, int const root, int const rank)
|
||||
{
|
||||
if (rank == root)
|
||||
{
|
||||
for (int i = 0; i < dataset.numDevices; i++)
|
||||
HIP_CALL(hipMemcpy(dataset.expected[i], (int8_t *)dataset.inputs[root]+dataset.NumBytes()*i,
|
||||
dataset.NumBytes(), hipMemcpyDeviceToHost));
|
||||
}
|
||||
}
|
||||
|
||||
void TestScatter(int rank, Dataset& dataset, bool& pass)
|
||||
{
|
||||
// Prepare input / output / expected results
|
||||
SetUpPerProcess(rank, ncclCollScatter, comms[rank], streams[rank], dataset);
|
||||
|
||||
if (numDevices > numDevicesAvailable)
|
||||
{
|
||||
pass = true;
|
||||
return;
|
||||
}
|
||||
|
||||
Barrier barrier(rank, numDevices, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID"))));
|
||||
|
||||
// Test each possible root
|
||||
for (int root = 0; root < numDevices; root++)
|
||||
{
|
||||
// Prepare input / output / expected results
|
||||
FillDatasetWithPattern(dataset, rank);
|
||||
|
||||
ComputeExpectedResults(dataset, root, rank);
|
||||
|
||||
// Launch the reduction (1 process per GPU)
|
||||
ncclScatter(dataset.inputs[rank],
|
||||
dataset.outputs[rank],
|
||||
numElements, dataType,
|
||||
root, comms[rank], streams[rank]);
|
||||
|
||||
// Wait for reduction to complete
|
||||
HIP_CALL(hipStreamSynchronize(streams[rank]));
|
||||
|
||||
// Check results
|
||||
pass = ValidateResults(dataset, rank);
|
||||
|
||||
barrier.Wait();
|
||||
}
|
||||
|
||||
TearDownPerProcess(comms[rank], streams[rank]);
|
||||
dataset.Release(rank);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
Viittaa uudesa ongelmassa
Block a user