Unit test refactor (#500)

Refactoring and consolidating single-process / multi-process unit testing
This commit is contained in:
gilbertlee-amd
2022-02-25 08:59:07 -07:00
committed by GitHub
vanhempi b569c0a1db
commit 29ad0f5fbe
98 muutettua tiedostoa jossa 3684 lisäystä ja 5094 poistoa
+1 -1
Näytä tiedosto
@@ -21,7 +21,7 @@ def runTestCommand (platform, project, gfilter)
def command = """#!/usr/bin/env bash
set -x
cd ${project.paths.project_build_prefix}/build/release/test
${sudo} NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 ./UnitTests --gtest_filter=${gfilter} --gtest_output=xml --gtest_color=yes
${sudo} UT_SHOW_NAMES=1 HSA_FORCE_FINE_GRAIN_PCIE=1 ./UnitTests --gtest_filter=${gfilter} --gtest_output=xml --gtest_color=yes
"""
platform.runCommand(this, command)
+10 -10
Näytä tiedosto
@@ -9,12 +9,12 @@ import com.amd.project.*
import com.amd.docker.*
import java.nio.file.Path
def runCI =
def runCI =
{
nodeDetails, jobName->
def prj = new rocProject('rccl', 'PreCheckin')
prj.timeout.test = 1440
prj.paths.build_command = './install.sh -t '
@@ -32,25 +32,25 @@ def runCI =
commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
commonGroovy.runCompileCommand(platform, project, jobName)
}
def testCommand =
{
platform, project->
commonGroovy.runTestCommand(platform, project, "*sum_float32*")
commonGroovy.runTestCommand(platform, project, "*")
}
def packageCommand =
{
platform, project->
commonGroovy.runPackageCommand(platform, project, jobName)
}
buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
}
ci: {
ci: {
String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)
def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]]
@@ -58,17 +58,17 @@ ci: {
propertyList = auxiliary.appendPropertyList(propertyList)
def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([sles15sp1:['4gfx906'],centos8:['8gfx908'],centos7:['8gfx906'],ubuntu18:['4gfx906', '4gfx908']])]
jobNameList = auxiliary.appendJobNameList(jobNameList)
propertyList.each
propertyList.each
{
jobName, property->
if (urlJobName == jobName)
properties(auxiliary.addCommonProperties(property))
}
jobNameList.each
jobNameList.each
{
jobName, nodeDetails->
if (urlJobName == jobName)
+1 -1
Näytä tiedosto
@@ -229,7 +229,7 @@ if ($run_tests); then
if ($run_tests_all); then
./test/UnitTests
else
./test/UnitTests --gtest_filter="BroadcastCorrectnessSweep*:*float32*"
./test/UnitTests --gtest_filter="AllReduce.*"
fi
else
echo "Unit tests have not been built yet; please re-run script with -t to build unit tests."
+15 -1
Näytä tiedosto
@@ -96,6 +96,7 @@ void CliqueManager::CleanUp()
if (m_cliqueMode == CLIQUE_SINGLE_NODE)
{
// Release caches
INFO(NCCL_COLL, "Rank %d deleting IPC caches", m_rank);
if (m_ipcHandleSendCache) delete m_ipcHandleSendCache;
if (m_ipcHandleRecvCache) delete m_ipcHandleRecvCache;
@@ -494,18 +495,24 @@ ncclResult_t CliqueManager::CheckCacheForPtr(void* devPtr,
uint64_t realAddr = (uint64_t)devPtr;
handlePair->second = realAddr - baseAddr;
CUDACHECK(hipIpcGetMemHandle(&handlePair->first, (void*)baseAddr));
/* Disabling cache until proper deallocation methods are available
// IPC handles are only supported for base address pointers
NcclIpcHandleSendCache::iterator it = cache->find(baseAddr);
if (it == cache->end())
{
INFO(NCCL_COLL, "Rank %d searching IPC handle cache for %p (not found)", rank, devPtr);
CUDACHECK(hipIpcGetMemHandle(&handlePair->first, (void*)baseAddr));
cache->insert(baseAddr, handlePair->first);
}
else
{
INFO(NCCL_COLL, "Rank %d searching IPC handle cache for %p (found!)", rank, devPtr);
handlePair->first = (it->second).first;
}
*/
return ncclSuccess;
}
@@ -513,10 +520,16 @@ ncclResult_t CliqueManager::CheckCacheForHandle(std::pair<hipIpcMemHandle_t, siz
NcclIpcHandleRecvCache* cache,
void** ptr)
{
// Until proper deallocation hooks are implemented, receive cache can not be used
// Handles will need to be extract each time
void* baseAddr;
CUDACHECK(hipIpcOpenMemHandle(&baseAddr, handlePair.first, hipIpcMemLazyEnablePeerAccess));
/*
NcclIpcHandleRecvCache::iterator it = cache->find(handlePair.first);
// Get base address pointer from cache if it exists
void* baseAddr;
if (it == cache->end())
{
CUDACHECK(hipIpcOpenMemHandle(&baseAddr, handlePair.first, hipIpcMemLazyEnablePeerAccess));
@@ -526,6 +539,7 @@ ncclResult_t CliqueManager::CheckCacheForHandle(std::pair<hipIpcMemHandle_t, siz
{
baseAddr = (it->second).first;
}
*/
// Modify base address pointer with offset
uint64_t realAddr = (uint64_t)baseAddr + handlePair.second;
+7 -7
Näytä tiedosto
@@ -79,14 +79,14 @@ public:
static ncclResult_t BootstrapRootInit(int pid, unsigned long hash);
protected:
static ncclResult_t CheckCacheForPtr(void* devPtr,
NcclIpcHandleSendCache* cache,
int rank,
std::pair<hipIpcMemHandle_t, size_t>* handlePair);
ncclResult_t CheckCacheForPtr(void* devPtr,
NcclIpcHandleSendCache* cache,
int rank,
std::pair<hipIpcMemHandle_t, size_t>* handlePair);
static ncclResult_t CheckCacheForHandle(std::pair<hipIpcMemHandle_t, size_t> const& handlePair,
NcclIpcHandleRecvCache* cache,
void** ptr);
ncclResult_t CheckCacheForHandle(std::pair<hipIpcMemHandle_t, size_t> const& handlePair,
NcclIpcHandleRecvCache* cache,
void** ptr);
int m_rank; // Associated rank
int m_numRanks; // Total number of ranks
+26
Näytä tiedosto
@@ -0,0 +1,26 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "TestBed.hpp"
namespace RcclUnitTesting
{
TEST(AllGather, InPlace)
{
TestBed testBed;
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllGather};
std::vector<ncclDataType_t> const dataTypes = {ncclInt8, ncclInt32, ncclInt64};
std::vector<ncclRedOp_t> const redOps = {ncclSum};
std::vector<int> const roots = {0};
std::vector<int> const numElements = {1048576, 53327, 1024};
std::vector<bool> const inPlaceList = {true};
std::vector<bool> const managedMemList = {false};
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
testBed.Finalize();
}
}
+26
Näytä tiedosto
@@ -0,0 +1,26 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "TestBed.hpp"
namespace RcclUnitTesting
{
TEST(AllGather, ManagedMem)
{
TestBed testBed;
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllGather};
std::vector<ncclDataType_t> const dataTypes = {ncclUint8, ncclUint32, ncclUint64};
std::vector<ncclRedOp_t> const redOps = {ncclSum};
std::vector<int> const roots = {0};
std::vector<int> const numElements = {1048576, 53327, 1024};
std::vector<bool> const inPlaceList = {false};
std::vector<bool> const managedMemList = {true};
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
testBed.Finalize();
}
}
+26
Näytä tiedosto
@@ -0,0 +1,26 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "TestBed.hpp"
namespace RcclUnitTesting
{
TEST(AllGather, OutOfPlace)
{
TestBed testBed;
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllGather};
std::vector<ncclDataType_t> const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16};
std::vector<ncclRedOp_t> const redOps = {ncclSum};
std::vector<int> const roots = {0};
std::vector<int> const numElements = {1048576, 53327, 1024};
std::vector<bool> const inPlaceList = {false};
std::vector<bool> const managedMemList = {false};
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
testBed.Finalize();
}
}
+31
Näytä tiedosto
@@ -0,0 +1,31 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "TestBed.hpp"
#include <cstdlib>
namespace RcclUnitTesting
{
TEST(AllReduce, Clique)
{
// Set clique env var prior to TestBed
setenv("RCCL_ENABLE_CLIQUE", "1", 1);
TestBed testBed;
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllReduce};
std::vector<ncclDataType_t> const dataTypes = testBed.GetAllSupportedDataTypes();
std::vector<ncclRedOp_t> const redOps = testBed.GetAllSupportedRedOps();
std::vector<int> const roots = {0};
std::vector<int> const numElements = {1048576, 53327, 1024};
std::vector<bool> const inPlaceList = {false, true};
std::vector<bool> const managedMemList = {false};
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
testBed.Finalize();
unsetenv("RCCL_ENABLE_CLIQUE");
}
}
+63
Näytä tiedosto
@@ -0,0 +1,63 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "TestBed.hpp"
namespace RcclUnitTesting
{
TEST(AllReduce, GroupCall)
{
TestBed testBed;
// Configuration
ncclFunc_t const funcType = ncclCollAllReduce;
std::vector<ncclDataType_t> const& dataTypes = {ncclFloat};
std::vector<ncclRedOp_t> const& redOps = {ncclSum};
std::vector<int> const numElements = {1048576, 53327, 1024};
int const root = 0;
bool const inPlace = false;
bool const useManagedMem = false;
int const numCollPerGroup = numElements.size();
// This tests runs 3 collectives in the same group call
bool isCorrect = true;
for (int totalRanks = testBed.ev.minGpus; totalRanks <= testBed.ev.maxGpus && isCorrect; ++totalRanks)
for (int isMultiProcess = 0; isMultiProcess <= 1 && isCorrect; ++isMultiProcess)
{
// Test either single process all GPUs, or 1 process per GPU
int const numProcesses = isMultiProcess ? totalRanks : 1;
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup);
for (int redOpIdx = 0; redOpIdx < redOps.size() && isCorrect; ++redOpIdx)
for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
{
if (testBed.ev.showNames)
INFO("%s process %2d-ranks AllReduce %d Grouped Calls (%s-%s)\n",
isMultiProcess ? "Multi " : "Single",
totalRanks, numCollPerGroup,
ncclRedOpNames[redOps[redOpIdx]], ncclDataTypeNames[dataTypes[dataIdx]]);
// Run all element sizes in parallel as single group
for (int collIdx = 0; collIdx < numCollPerGroup; ++collIdx)
{
testBed.SetCollectiveArgs(funcType,
dataTypes[dataIdx],
redOps[redOpIdx],
root,
numElements[collIdx],
numElements[collIdx],
collIdx);
}
testBed.AllocateMem(inPlace, useManagedMem);
testBed.PrepareData();
testBed.ExecuteCollectives();
testBed.ValidateResults(isCorrect);
testBed.DeallocateMem();
}
testBed.DestroyComms();
}
testBed.Finalize();
}
}
+26
Näytä tiedosto
@@ -0,0 +1,26 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "TestBed.hpp"
namespace RcclUnitTesting
{
TEST(AllReduce, InPlace)
{
TestBed testBed;
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllReduce};
std::vector<ncclDataType_t> const dataTypes = {ncclInt8, ncclInt32, ncclFloat32};
std::vector<ncclRedOp_t> const redOps = {ncclSum, ncclProd};
std::vector<int> const roots = {0};
std::vector<int> const numElements = {1048576, 53327, 1024};
std::vector<bool> const inPlaceList = {true};
std::vector<bool> const managedMemList = {false};
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
testBed.Finalize();
}
}
+26
Näytä tiedosto
@@ -0,0 +1,26 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "TestBed.hpp"
namespace RcclUnitTesting
{
TEST(AllReduce, ManagedMem)
{
TestBed testBed;
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllReduce};
std::vector<ncclDataType_t> const dataTypes = {ncclFloat32, ncclUint8, ncclUint64};
std::vector<ncclRedOp_t> const redOps = {ncclSum, ncclMax};
std::vector<int> const roots = {0};
std::vector<int> const numElements = {1048576, 53327, 1024};
std::vector<bool> const inPlaceList = {false};
std::vector<bool> const managedMemList = {true};
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
testBed.Finalize();
}
}
+26
Näytä tiedosto
@@ -0,0 +1,26 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "TestBed.hpp"
namespace RcclUnitTesting
{
TEST(AllReduce, OutOfPlace)
{
TestBed testBed;
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllReduce};
std::vector<ncclDataType_t> const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16};
std::vector<ncclRedOp_t> const redOps = {ncclSum, ncclMin};
std::vector<int> const roots = {0};
std::vector<int> const numElements = {1048576, 53327, 1024};
std::vector<bool> const inPlaceList = {false};
std::vector<bool> const managedMemList = {false};
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
testBed.Finalize();
}
}
+74
Näytä tiedosto
@@ -0,0 +1,74 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "TestBed.hpp"
namespace RcclUnitTesting
{
// This tests using custom pre-mult scalars reductions
TEST(AllReduce, PreMultScalar)
{
TestBed testBed;
// Configuration
ncclFunc_t const funcType = ncclCollAllReduce;
std::vector<ncclDataType_t> const& dataTypes = {ncclInt32, ncclFloat32, ncclFloat64};
ncclRedOp_t const redOp = ncclSum;
std::vector<int> const numElements = {1048576, 1024};
int const root = 0;
bool const inPlace = false;
bool const useManagedMem = false;
// Terminate the test as soon as first failure occurs
bool isCorrect = true;
for (int totalRanks = testBed.ev.minGpus; totalRanks <= testBed.ev.maxGpus && isCorrect; ++totalRanks)
for (int isMultiProcess = 0; isMultiProcess <= 1; ++isMultiProcess)
{
int const numProcesses = isMultiProcess ? totalRanks : 1;
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks));
for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
{
ncclDataType_t const dataType = dataTypes[dataIdx];
// Set scalars per rank
PtrUnion scalarsPerRank;
scalarsPerRank.AllocateCpuMem(totalRanks * DataTypeToBytes(dataType));
for (int i = 0; i < totalRanks; i++)
{
double F = i;
scalarsPerRank.Set(dataType, i, i, F);
}
// Test various scalar residence modes
for (int scalarMode = 0; scalarMode <= 1 && isCorrect; ++scalarMode)
{
if (testBed.ev.showNames)
INFO("%s process %2d-ranks AllReduce (custom-scalar Mode %d %s)\n",
isMultiProcess ? "Multi " : "Single",
totalRanks, scalarMode, ncclDataTypeNames[dataType]);
for (int i = 0; i < numElements.size() && isCorrect; ++i)
{
testBed.SetCollectiveArgs(funcType, dataType, redOp, root,
numElements[i], numElements[i],
-1, -1, scalarsPerRank, scalarMode);
// For performance, only allocate and prepare data on largest size
if (i == 0)
{
testBed.AllocateMem(inPlace, useManagedMem);
testBed.PrepareData();
}
testBed.ExecuteCollectives();
testBed.ValidateResults(isCorrect);
}
testBed.DeallocateMem();
}
}
testBed.DestroyComms();
}
testBed.Finalize();
}
}
+26
Näytä tiedosto
@@ -0,0 +1,26 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "TestBed.hpp"
namespace RcclUnitTesting
{
TEST(AllToAll, ManagedMem)
{
TestBed testBed;
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllToAll};
std::vector<ncclDataType_t> const dataTypes = {ncclUint8, ncclUint32, ncclUint64};
std::vector<ncclRedOp_t> const redOps = {ncclSum};
std::vector<int> const roots = {0};
std::vector<int> const numElements = {1048576, 53327, 1024};
std::vector<bool> const inPlaceList = {false};
std::vector<bool> const managedMemList = {true};
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
testBed.Finalize();
}
}
+26
Näytä tiedosto
@@ -0,0 +1,26 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "TestBed.hpp"
namespace RcclUnitTesting
{
TEST(AllToAll, OutOfPlace)
{
TestBed testBed;
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllToAll};
std::vector<ncclDataType_t> const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16};
std::vector<ncclRedOp_t> const redOps = {ncclSum};
std::vector<int> const roots = {0};
std::vector<int> const numElements = {1048576, 53327, 1024};
std::vector<bool> const inPlaceList = {false};
std::vector<bool> const managedMemList = {false};
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
testBed.Finalize();
}
}
+26
Näytä tiedosto
@@ -0,0 +1,26 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "TestBed.hpp"
namespace RcclUnitTesting
{
TEST(Broadcast, InPlace)
{
TestBed testBed;
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollBroadcast};
std::vector<ncclDataType_t> const dataTypes = {ncclInt8, ncclInt32, ncclInt64};
std::vector<ncclRedOp_t> const redOps = {ncclSum};
std::vector<int> const roots = {0};
std::vector<int> const numElements = {1048576, 53327, 1024};
std::vector<bool> const inPlaceList = {true};
std::vector<bool> const managedMemList = {false};
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
testBed.Finalize();
}
}
+26
Näytä tiedosto
@@ -0,0 +1,26 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "TestBed.hpp"
namespace RcclUnitTesting
{
TEST(Broadcast, ManagedMem)
{
TestBed testBed;
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollBroadcast};
std::vector<ncclDataType_t> const dataTypes = {ncclUint8, ncclUint32, ncclUint64};
std::vector<ncclRedOp_t> const redOps = {ncclSum};
std::vector<int> const roots = {0};
std::vector<int> const numElements = {1048576, 53327, 1024};
std::vector<bool> const inPlaceList = {false};
std::vector<bool> const managedMemList = {true};
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
testBed.Finalize();
}
}
+26
Näytä tiedosto
@@ -0,0 +1,26 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "TestBed.hpp"
namespace RcclUnitTesting
{
TEST(Broadcast, OutOfPlace)
{
TestBed testBed;
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollBroadcast};
std::vector<ncclDataType_t> const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16};
std::vector<ncclRedOp_t> const redOps = {ncclSum};
std::vector<int> const roots = {1};
std::vector<int> const numElements = {1048576, 53327, 1024};
std::vector<bool> const inPlaceList = {false};
std::vector<bool> const managedMemList = {false};
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
testBed.Finalize();
}
}
+76 -62
Näytä tiedosto
@@ -3,87 +3,103 @@ cmake_minimum_required(VERSION 2.8.12)
if(BUILD_TESTS)
message("Going to build unit tests (Installed in /test/UnitTests)")
message("Building unit tests (Installed in /test/UnitTests)")
find_program(CHRPATH chrpath)
if(NOT CHRPATH)
message(FATAL_ERROR "chrpath is required for UnitTests. Please install (e.g. sudo apt-get install chrpath)")
endif()
include_directories(${GTEST_INCLUDE_DIRS})
find_package(hsa-runtime64 PATHS /opt/rocm )
if(${hsa-runtime64_FOUND})
message("hsa-runtime64 found @ ${hsa-runtime64_DIR} ")
else()
message("find_package did NOT find hsa-runtime64, finding it the OLD Way")
message("Looking for header files in ${ROCR_INC_DIR}")
message("Looking for library files in ${ROCR_LIB_DIR}")
# Search for ROCr header file in user defined locations
find_path(ROCR_HDR hsa.h PATHS ${ROCR_INC_DIR} "/opt/rocm" PATH_SUFFIXES include/hsa REQUIRED)
INCLUDE_DIRECTORIES(${ROCR_HDR})
# Search for ROCr library file in user defined locations
find_library(ROCR_LIB ${CORE_RUNTIME_TARGET} PATHS ${ROCR_LIB_DIR} "/opt/rocm" PATH_SUFFIXES lib lib64 REQUIRED)
endif()
include_directories(${GTEST_INCLUDE_DIRS} ./common)
# Collect testing framework source files
set (COMMON_SOURCE_FILES
common/main.cpp
common/CollectiveArgs.cpp
common/EnvVars.cpp
common/PrepDataFuncs.cpp
common/PtrUnion.cpp
common/TestBed.cpp
common/TestBedChild.cpp
)
# Collect source files for tests
if(BUILD_ALLREDUCE_ONLY)
set(TEST_SOURCES_SINGLE_PROCESS
test_AllReduce.cpp
test_AllReduceAbort.cpp
test_AllReduceGroup.cpp
set(TEST_SOURCE_FILES
AllReduce_Clique.cpp
AllReduce_GroupCall.cpp
AllReduce_InPlace.cpp
AllReduce_ManagedMem.cpp
AllReduce_OutOfPlace.cpp
AllReduce_PreMultScalar.cpp
)
else()
# Collect source files for tests
set(TEST_SOURCES_SINGLE_PROCESS
test_AllGather.cpp
test_AllReduce.cpp
test_AllReduceGroup.cpp
test_Broadcast.cpp
test_Reduce.cpp
test_ReduceScatter.cpp
test_GroupCalls.cpp
test_CombinedCalls.cpp
test_AllReduceAbort.cpp
test_BroadcastAbort.cpp
test_Scatter.cpp
test_Gather.cpp
test_AllToAll.cpp
test_AllToAllv.cpp
)
set(TEST_SOURCE_FILES
#AllReduce
AllReduce_Clique.cpp
AllReduce_GroupCall.cpp
AllReduce_InPlace.cpp
AllReduce_ManagedMem.cpp
AllReduce_OutOfPlace.cpp
AllReduce_PreMultScalar.cpp
#AllGather
AllGather_InPlace.cpp
AllGather_ManagedMem.cpp
AllGather_OutOfPlace.cpp
#AllToAll
AllToAll_OutOfPlace.cpp
AllToAll_ManagedMem.cpp
#Broadcast
Broadcast_InPlace.cpp
Broadcast_ManagedMem.cpp
Broadcast_OutOfPlace.cpp
#Reduce
Reduce_InPlace.cpp
Reduce_ManagedMem.cpp
Reduce_OutOfPlace.cpp
#ReduceScatter
ReduceScatter_InPlace.cpp
ReduceScatter_ManagedMem.cpp
ReduceScatter_OutOfPlace.cpp
#Scatter
Scatter_InPlace.cpp
Scatter_ManagedMem.cpp
Scatter_OutOfPlace.cpp
#Gather
Gather_InPlace.cpp
Gather_ManagedMem.cpp
Gather_OutOfPlace.cpp
)
endif()
if(BUILD_ALLREDUCE_ONLY)
set(TEST_SOURCES_MULTI_PROCESS
test_AllReduceMultiProcess.cpp
test_AllReduceGroupMultiProcess.cpp
)
else()
set(TEST_SOURCES_MULTI_PROCESS
test_AllGatherMultiProcess.cpp
test_AllReduceMultiProcess.cpp
test_AllReduceGroupMultiProcess.cpp
test_AllToAllMultiProcess.cpp
test_BroadcastMultiProcess.cpp
test_CombinedCallsMultiProcess.cpp
test_GatherMultiProcess.cpp
test_GroupCallsMultiProcess.cpp
test_ReduceMultiProcess.cpp
test_ReduceScatterMultiProcess.cpp
test_ScatterMultiProcess.cpp
)
endif()
add_executable(UnitTests ${TEST_SOURCES_SINGLE_PROCESS})
add_executable(UnitTests ${COMMON_SOURCE_FILES} ${TEST_SOURCE_FILES})
target_include_directories(UnitTests PRIVATE ${ROCM_PATH} ${GTEST_INCLUDE_DIRS})
target_link_libraries(UnitTests PRIVATE ${GTEST_BOTH_LIBRARIES})
target_link_libraries(UnitTests PRIVATE hip::host hip::device)
add_executable(UnitTestsMultiProcess ${TEST_SOURCES_MULTI_PROCESS})
target_include_directories(UnitTestsMultiProcess PRIVATE ${ROCM_PATH} ${GTEST_INCLUDE_DIRS})
target_link_libraries(UnitTestsMultiProcess PRIVATE ${GTEST_BOTH_LIBRARIES})
target_link_libraries(UnitTestsMultiProcess PRIVATE hip::host hip::device)
find_program( rocminfo_executable rocminfo )
execute_process(COMMAND bash "-c" "${rocminfo_executable} | grep 'Device Type' | grep GPU | wc -l | tr -d '\n'" OUTPUT_VARIABLE gtest_num_gpus)
if(${gtest_num_gpus} EQUAL "0" OR ${gtest_num_gpus} EQUAL "1")
set(gtest_num_gpus "2")
endif()
target_compile_options(UnitTests PRIVATE -DGTESTS_NUM_GPUS=${gtest_num_gpus})
target_link_libraries(UnitTests PRIVATE hip::host hip::device hsa-runtime64::hsa-runtime64)
# UnitTests using static library of rccl requires passing rccl
# through -l and -L instead of command line input.
if(BUILD_STATIC)
add_dependencies(UnitTests rccl)
target_link_libraries(UnitTests PRIVATE dl rt numa -lrccl -L${CMAKE_BINARY_DIR} -lrocm_smi64 -L${ROCM_PATH}/rocm_smi/lib)
add_dependencies(UnitTestsMultiProcess rccl)
target_link_libraries(UnitTestsMultiProcess PRIVATE dl rt numa -lrccl -L${CMAKE_BINARY_DIR} -lrocm_smi64 -L${ROCM_PATH}/rocm_smi/lib)
else()
target_link_libraries(UnitTests PRIVATE rccl)
target_link_libraries(UnitTestsMultiProcess PRIVATE rt rccl)
endif()
# HIPCC adds /opt/rocm/lib as RPATH, even though the install process is supposed to
# remove RPATH. It also occurs before any user-specified rpath, which effectively overrides the user rpath.
@@ -91,10 +107,8 @@ if(BUILD_TESTS)
if (CMAKE_INSTALL_PREFIX MATCHES "${ROCM_PATH}")
# install_prefix/CMAKE_INSTALL_PREFIX was not explicitly specified, so look in build/release
add_custom_command( TARGET UnitTests POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_BINARY_DIR}:${ROCM_PATH}/lib ${CMAKE_BINARY_DIR}/test/UnitTests)
add_custom_command( TARGET UnitTestsMultiProcess POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_BINARY_DIR}:${ROCM_PATH}/lib ${CMAKE_BINARY_DIR}/test/UnitTestsMultiProcess)
else()
add_custom_command( TARGET UnitTests POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_INSTALL_PREFIX}/lib:${ROCM_PATH}/lib ${CMAKE_INSTALL_PREFIX}/test/UnitTests)
add_custom_command( TARGET UnitTestsMultiProcess POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_INSTALL_PREFIX}/lib:${ROCM_PATH}/lib ${CMAKE_INSTALL_PREFIX}/test/UnitTestsMultiProcess)
endif()
else()
message("Not building unit tests")
File diff suppressed because it is too large Load Diff
+26
Näytä tiedosto
@@ -0,0 +1,26 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "TestBed.hpp"
namespace RcclUnitTesting
{
TEST(Gather, InPlace)
{
TestBed testBed;
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollGather};
std::vector<ncclDataType_t> const dataTypes = {ncclInt8, ncclInt32, ncclInt64};
std::vector<ncclRedOp_t> const redOps = {ncclSum};
std::vector<int> const roots = {0};
std::vector<int> const numElements = {1048576, 53327, 1024};
std::vector<bool> const inPlaceList = {true};
std::vector<bool> const managedMemList = {false};
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
testBed.Finalize();
}
}
+26
Näytä tiedosto
@@ -0,0 +1,26 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "TestBed.hpp"
namespace RcclUnitTesting
{
TEST(Gather, ManagedMem)
{
TestBed testBed;
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollGather};
std::vector<ncclDataType_t> const dataTypes = {ncclUint8, ncclUint32, ncclUint64};
std::vector<ncclRedOp_t> const redOps = {ncclSum};
std::vector<int> const roots = {1};
std::vector<int> const numElements = {1048576, 53327, 1024};
std::vector<bool> const inPlaceList = {false};
std::vector<bool> const managedMemList = {true};
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
testBed.Finalize();
}
}
+26
Näytä tiedosto
@@ -0,0 +1,26 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "TestBed.hpp"
namespace RcclUnitTesting
{
TEST(Gather, OutOfPlace)
{
TestBed testBed;
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollGather};
std::vector<ncclDataType_t> const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16};
std::vector<ncclRedOp_t> const redOps = {ncclSum};
std::vector<int> const roots = {1};
std::vector<int> const numElements = {1048576, 53327, 1024};
std::vector<bool> const inPlaceList = {false};
std::vector<bool> const managedMemList = {false};
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
testBed.Finalize();
}
}
+26
Näytä tiedosto
@@ -0,0 +1,26 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "TestBed.hpp"
namespace RcclUnitTesting
{
TEST(ReduceScatter, InPlace)
{
TestBed testBed;
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollReduceScatter};
std::vector<ncclDataType_t> const dataTypes = {ncclInt8, ncclInt32, ncclInt64};
std::vector<ncclRedOp_t> const redOps = {ncclSum, ncclProd};
std::vector<int> const roots = {0};
std::vector<int> const numElements = {1048576, 53327, 1024};
std::vector<bool> const inPlaceList = {true};
std::vector<bool> const managedMemList = {false};
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
testBed.Finalize();
}
}
+26
Näytä tiedosto
@@ -0,0 +1,26 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "TestBed.hpp"
namespace RcclUnitTesting
{
TEST(ReduceScatter, ManagedMem)
{
TestBed testBed;
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollReduceScatter};
std::vector<ncclDataType_t> const dataTypes = {ncclUint8, ncclUint32, ncclUint64};
std::vector<ncclRedOp_t> const redOps = {ncclSum};
std::vector<int> const roots = {0};
std::vector<int> const numElements = {1048576, 53327, 1024};
std::vector<bool> const inPlaceList = {false};
std::vector<bool> const managedMemList = {true};
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
testBed.Finalize();
}
}
+26
Näytä tiedosto
@@ -0,0 +1,26 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "TestBed.hpp"
namespace RcclUnitTesting
{
TEST(ReduceScatter, OutOfPlace)
{
TestBed testBed;
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollReduceScatter};
std::vector<ncclDataType_t> const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16};
std::vector<ncclRedOp_t> const redOps = {ncclMin, ncclMax, ncclAvg};
std::vector<int> const roots = {0};
std::vector<int> const numElements = {1048576, 53327, 1024};
std::vector<bool> const inPlaceList = {false};
std::vector<bool> const managedMemList = {false};
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
testBed.Finalize();
}
}
+26
Näytä tiedosto
@@ -0,0 +1,26 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "TestBed.hpp"
namespace RcclUnitTesting
{
TEST(Reduce, InPlace)
{
TestBed testBed;
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollReduce};
std::vector<ncclDataType_t> const dataTypes = {ncclInt8, ncclInt32, ncclInt64};
std::vector<ncclRedOp_t> const redOps = {ncclSum, ncclProd};
std::vector<int> const roots = {0};
std::vector<int> const numElements = {1048576, 53327, 1024};
std::vector<bool> const inPlaceList = {true};
std::vector<bool> const managedMemList = {false};
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
testBed.Finalize();
}
}
+26
Näytä tiedosto
@@ -0,0 +1,26 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "TestBed.hpp"
namespace RcclUnitTesting
{
TEST(Reduce, ManagedMem)
{
TestBed testBed;
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollReduce};
std::vector<ncclDataType_t> const dataTypes = {ncclUint8, ncclUint32, ncclUint64};
std::vector<ncclRedOp_t> const redOps = {ncclSum};
std::vector<int> const roots = {0};
std::vector<int> const numElements = {1048576, 53327, 1024};
std::vector<bool> const inPlaceList = {false};
std::vector<bool> const managedMemList = {true};
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
testBed.Finalize();
}
}
+26
Näytä tiedosto
@@ -0,0 +1,26 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "TestBed.hpp"
namespace RcclUnitTesting
{
TEST(Reduce, OutOfPlace)
{
TestBed testBed;
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollReduce};
std::vector<ncclDataType_t> const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16};
std::vector<ncclRedOp_t> const redOps = {ncclMin, ncclMax, ncclAvg};
std::vector<int> const roots = {1};
std::vector<int> const numElements = {1048576, 53327, 1024};
std::vector<bool> const inPlaceList = {false};
std::vector<bool> const managedMemList = {false};
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
testBed.Finalize();
}
}
+26
Näytä tiedosto
@@ -0,0 +1,26 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "TestBed.hpp"
namespace RcclUnitTesting
{
TEST(Scatter, InPlace)
{
TestBed testBed;
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollScatter};
std::vector<ncclDataType_t> const dataTypes = {ncclInt8, ncclInt32, ncclInt64};
std::vector<ncclRedOp_t> const redOps = {ncclSum};
std::vector<int> const roots = {0};
std::vector<int> const numElements = {1048576, 53327, 1024};
std::vector<bool> const inPlaceList = {true};
std::vector<bool> const managedMemList = {false};
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
testBed.Finalize();
}
}
+26
Näytä tiedosto
@@ -0,0 +1,26 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "TestBed.hpp"
namespace RcclUnitTesting
{
TEST(Scatter, ManagedMem)
{
TestBed testBed;
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollScatter};
std::vector<ncclDataType_t> const dataTypes = {ncclUint8, ncclUint32, ncclUint64};
std::vector<ncclRedOp_t> const redOps = {ncclSum};
std::vector<int> const roots = {0};
std::vector<int> const numElements = {1048576, 53327, 1024};
std::vector<bool> const inPlaceList = {false};
std::vector<bool> const managedMemList = {true};
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
testBed.Finalize();
}
}
+26
Näytä tiedosto
@@ -0,0 +1,26 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "TestBed.hpp"
namespace RcclUnitTesting
{
TEST(Scatter, OutOfPlace)
{
TestBed testBed;
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollScatter};
std::vector<ncclDataType_t> const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16};
std::vector<ncclRedOp_t> const redOps = {ncclSum};
std::vector<int> const roots = {1};
std::vector<int> const numElements = {1048576, 53327, 1024};
std::vector<bool> const inPlaceList = {false};
std::vector<bool> const managedMemList = {false};
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
testBed.Finalize();
}
}
-63
Näytä tiedosto
@@ -1,63 +0,0 @@
#ifndef TESTCHECKS_HPP
#define TESTCHECKS_HPP
#define HIP_CALL(x) ASSERT_EQ(x, hipSuccess)
#define NCCL_CALL(x) ASSERT_EQ(x, ncclSuccess)
#define SYSCHECK_TEST(call, name) do { \
int retval; \
SYSCHECKVAL_TEST(call, name, retval); \
} while (false)
#define SYSCHECKVAL_TEST(call, name, retval) do { \
SYSCHECKSYNC_TEST(call, name, retval); \
if (retval == -1) { \
printf("Call to %s failed : %s\n", name, strerror(errno)); \
fflush(stdout); \
return ncclSystemError; \
} \
} while (false)
#define SYSCHECK_GOTO_TEST(call, name, label) do { \
int retval; \
SYSCHECKVAL_GOTO_TEST(call, name, retval, label); \
} while (false)
#define SYSCHECKVAL_GOTO_TEST(call, name, retval, label) do { \
SYSCHECKSYNC_TEST(call, name, retval); \
if (retval == -1) { \
printf("Call to %s failed : %s\n", name, strerror(errno)); \
fflush(stdout); \
goto label; \
} \
} while (false)
#define SYSCHECKSYNC_TEST(call, name, retval) do { \
retval = call; \
if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
} else { \
break; \
} \
} while(true)
#define NCCLCHECK_BARRIER_TEST(call, name, rank) do { \
ncclResult_t retval; \
retval = call; \
if (retval != ncclSuccess) { \
printf("Rank %d call to %s failed : %s\n", rank, name, strerror(errno)); \
fflush(stdout); \
return; \
} \
} while (false)
#define NCCLCHECK_TEST(call, name) do { \
ncclResult_t retval; \
retval = call; \
if (retval != ncclSuccess) { \
printf("Call to %s failed : %s\n", name, strerror(errno)); \
fflush(stdout); \
return retval; \
} \
} while (false)
#endif
+282
Näytä tiedosto
@@ -0,0 +1,282 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "CollectiveArgs.hpp"
#include "gtest/gtest.h"
namespace RcclUnitTesting
{
ErrCode CollectiveArgs::SetArgs(int const globalRank,
int const totalRanks,
int const deviceId,
ncclFunc_t const funcType,
ncclDataType_t const dataType,
ncclRedOp_t const redOp,
int const root,
size_t const numInputElements,
size_t const numOutputElements,
ScalarTransport const scalarTransport,
int const scalarMode)
{
// Free scalar based on previous scalarMode
if (scalarMode != -1)
{
if (this->localScalar.ptr != nullptr)
{
if (this->scalarMode == 0) this->localScalar.FreeGpuMem();
if (this->scalarMode == 1) hipHostFree(this->localScalar.ptr);
}
}
this->globalRank = globalRank;
this->totalRanks = totalRanks;
this->deviceId = deviceId;
this->funcType = funcType;
this->dataType = dataType;
this->redOp = redOp;
this->root = root;
this->numInputElements = numInputElements;
this->numOutputElements = numOutputElements;
this->scalarTransport = scalarTransport;
this->scalarMode = scalarMode;
if (scalarMode != -1)
{
size_t const numBytes = DataTypeToBytes(dataType);
if (scalarMode == ncclScalarDevice)
{
CHECK_CALL(this->localScalar.AllocateGpuMem(numBytes));
CHECK_HIP(hipMemcpy(this->localScalar.ptr, scalarTransport.ptr + (globalRank * numBytes),
numBytes, hipMemcpyHostToDevice));
}
else if (scalarMode == ncclScalarHostImmediate)
{
CHECK_HIP(hipHostMalloc(&this->localScalar.ptr, numBytes, 0));
memcpy(this->localScalar.ptr, scalarTransport.ptr + (globalRank * numBytes), numBytes);
}
}
return TEST_SUCCESS;
}
ErrCode CollectiveArgs::AllocateMem(bool const inPlace,
bool const useManagedMem)
{
this->numInputBytesAllocated = this->numInputElements * DataTypeToBytes(this->dataType);
this->numOutputBytesAllocated = this->numOutputElements * DataTypeToBytes(this->dataType);
this->numInputElementsAllocated = this->numInputElements;
this->numOutputElementsAllocated = this->numOutputElements;
this->inPlace = inPlace;
this->useManagedMem = useManagedMem;
if (hipSetDevice(this->deviceId) != hipSuccess)
{
ERROR("Unable to call hipSetDevice to set to GPU %d\n", this->deviceId);
return TEST_FAIL;
}
if (inPlace)
{
if (this->funcType == ncclCollScatter)
{
CHECK_CALL(this->inputGpu.AllocateGpuMem(this->numInputBytesAllocated, useManagedMem));
this->outputGpu.Attach(this->inputGpu.U1 + (this->globalRank * this->numOutputBytesAllocated));
}
else if (this->funcType == ncclCollGather)
{
CHECK_CALL(this->outputGpu.AllocateGpuMem(this->numOutputBytesAllocated, useManagedMem));
this->inputGpu.Attach(this->outputGpu.U1 + (this->globalRank * this->numInputBytesAllocated));
}
else
{
size_t const numBytes = std::max(this->numInputBytesAllocated, this->numOutputBytesAllocated);
CHECK_CALL(this->inputGpu.AllocateGpuMem(numBytes, useManagedMem));
this->outputGpu.Attach(this->inputGpu.ptr);
}
CHECK_CALL(this->expected.AllocateCpuMem(this->numOutputBytesAllocated));
}
else
{
CHECK_CALL(this->inputGpu.AllocateGpuMem(this->numInputBytesAllocated, useManagedMem));
CHECK_CALL(this->outputGpu.AllocateGpuMem(this->numOutputBytesAllocated, useManagedMem));
CHECK_CALL(this->expected.AllocateCpuMem(this->numOutputBytesAllocated));
}
CHECK_CALL(this->outputCpu.AllocateCpuMem(this->numOutputBytesAllocated));
return TEST_SUCCESS;
}
ErrCode CollectiveArgs::PrepareData(CollFuncPtr const prepareDataFunc)
{
CollFuncPtr prepFunc = (prepareDataFunc == nullptr ? DefaultPrepareDataFunc : prepareDataFunc);
return prepFunc(*this);
}
ErrCode CollectiveArgs::ValidateResults()
{
// Ignore non-root outputs for collectives with a root
if (CollectiveArgs::UsesRoot(this->funcType) && this->root != this->globalRank) return TEST_SUCCESS;
size_t const numOutputBytes = (this->numOutputElements * DataTypeToBytes(this->dataType));
CHECK_HIP(hipMemcpy(this->outputCpu.ptr, this->outputGpu.ptr, numOutputBytes, hipMemcpyDeviceToHost));
bool isMatch = true;
CHECK_CALL(this->outputCpu.IsEqual(this->dataType,
this->numOutputElements,
this->expected,
true,
isMatch));
if (!isMatch) ERROR("Mismatch for %s\n", this->GetDescription().c_str());
return isMatch ? TEST_SUCCESS : TEST_FAIL;
}
ErrCode CollectiveArgs::DeallocateMem()
{
// If in-place, either only inputGpu or outputGpu was allocated
if (this->inPlace)
{
if (this->funcType == ncclCollGather)
this->outputGpu.FreeGpuMem();
else
this->inputGpu.FreeGpuMem();
}
else
{
this->inputGpu.FreeGpuMem();
this->outputGpu.FreeGpuMem();
}
this->outputCpu.FreeCpuMem();
this->expected.FreeCpuMem();
if (this->localScalar.ptr != nullptr)
{
if (this->scalarMode == 0) this->localScalar.FreeGpuMem();
if (this->scalarMode == 1) CHECK_HIP(hipHostFree(this->localScalar.ptr));
}
return TEST_SUCCESS;
}
std::string CollectiveArgs::GetDescription() const
{
std::stringstream ss;
ss << "(Rank " << this->globalRank << ") ";
switch (this->funcType)
{
case ncclCollBroadcast: ss << "ncclBroadcast"; break;
case ncclCollReduce: ss << "ncclReduce"; break;
case ncclCollAllGather: ss << "ncclAllGather"; break;
case ncclCollReduceScatter: ss << "ncclReduceScatter"; break;
case ncclCollAllReduce: ss << "ncclAllReduce"; break;
case ncclCollGather: ss << "ncclGather"; break;
case ncclCollScatter: ss << "ncclScatter"; break;
case ncclCollAllToAll: ss << "ncclAllToAll"; break;
case ncclCollSend: ss << "ncclSend"; break;
case ncclCollRecv: ss << "ncclRevv"; break;
default: ss << "[Unknown]"; break;
}
ss << " " << ncclDataTypeNames[this->dataType] << " ";
if (this->funcType == ncclCollReduce ||
this->funcType == ncclCollReduceScatter ||
this->funcType == ncclCollAllReduce)
{
if (this->redOp < ncclNumOps)
{
ss << ncclRedOpNames[this->redOp] << " ";
}
else
{
ss << "CustomScalar ";
PtrUnion scalarsPerRank;
scalarsPerRank.Attach(scalarsPerRank.ptr);
switch (this->dataType)
{
case ncclInt8: ss << scalarsPerRank.I1[this->globalRank]; break;
case ncclUint8: ss << scalarsPerRank.U1[this->globalRank]; break;
case ncclInt32: ss << scalarsPerRank.I4[this->globalRank]; break;
case ncclUint32: ss << scalarsPerRank.U4[this->globalRank]; break;
case ncclInt64: ss << scalarsPerRank.I8[this->globalRank]; break;
case ncclUint64: ss << scalarsPerRank.U8[this->globalRank]; break;
case ncclFloat32: ss << scalarsPerRank.F4[this->globalRank]; break;
case ncclFloat64: ss << scalarsPerRank.F8[this->globalRank]; break;
case ncclBfloat16: ss << scalarsPerRank.B2[this->globalRank]; break;
default: ss << "(UNKNOWN)";
}
ss << " ";
}
}
if (this->funcType == ncclCollBroadcast ||
this->funcType == ncclCollReduce ||
this->funcType == ncclCollGather ||
this->funcType == ncclCollScatter)
{
ss << "Root " << this->root << " ";
}
if (this->funcType == ncclCollSend ||
this->funcType == ncclCollRecv)
{
ss << "Peer " << this->root << " ";
}
ss << "#In: " << this->numInputElements;
ss << " #Out: " << this->numOutputElements;
return ss.str();
}
void CollectiveArgs::GetNumElementsForFuncType(ncclFunc_t const funcType,
int const N,
int const totalRanks,
int* numInputElements,
int* numOutputElements)
{
switch (funcType)
{
case ncclCollBroadcast:
case ncclCollReduce:
case ncclCollAllReduce:
*numInputElements = N;
*numOutputElements = N;
break;
case ncclCollGather:
case ncclCollAllGather:
*numInputElements = N;
*numOutputElements = totalRanks * N;
break;
case ncclCollScatter:
case ncclCollReduceScatter:
*numInputElements = totalRanks * N;
*numOutputElements = N;
break;
case ncclCollAllToAll:
*numInputElements = totalRanks * N;
*numOutputElements = totalRanks * N;
break;
default:
*numInputElements = N;
*numOutputElements = N;
break;
}
}
bool CollectiveArgs::UsesReduce(ncclFunc_t const funcType)
{
return (funcType == ncclCollReduce ||
funcType == ncclCollAllReduce ||
funcType == ncclCollReduceScatter);
}
bool CollectiveArgs::UsesRoot(ncclFunc_t const funcType)
{
return (funcType == ncclCollBroadcast ||
funcType == ncclCollReduce ||
funcType == ncclCollGather ||
funcType == ncclCollScatter);
}
}
+151
Näytä tiedosto
@@ -0,0 +1,151 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
b
* See LICENSE.txt for license information
************************************************************************/
#pragma once
#include "PtrUnion.hpp"
#include "PrepDataFuncs.hpp"
#include "rccl.h"
namespace RcclUnitTesting
{
// Enumeration of all collective functions currently supported
typedef enum
{
ncclCollBroadcast = 0,
ncclCollReduce,
ncclCollAllGather,
ncclCollReduceScatter,
ncclCollAllReduce,
ncclCollGather,
ncclCollScatter,
ncclCollAllToAll,
ncclCollSend,
ncclCollRecv,
ncclNumFuncs
} ncclFunc_t;
char const ncclFuncNames[ncclNumFuncs][32] =
{
"Broadcast",
"Reduce",
"AllGather",
"ReduceScatter",
"AllReduce",
"Gather",
"Scatter",
"AllToAll",
"Send",
"Recv"
};
char const ncclDataTypeNames[ncclNumTypes][32] =
{
"ncclInt8",
"ncclUint8",
"ncclInt32",
"ncclUint32",
"ncclInt64",
"ncclUint64",
"ncclFloat16",
"ncclFloat32",
"ncclFloat64",
"ncclBfloat16"
};
char const ncclRedOpNames[ncclNumOps][32] =
{
"sum",
"prod",
"max",
"min",
"avg"
};
class CollectiveArgs;
#define MAX_RANKS 32
struct ScalarTransport
{
char ptr[MAX_RANKS * sizeof(double)];
};
// Function pointer for functions that operate on CollectiveArgs
// e.g. For filling input / computing expected results
typedef ErrCode (*CollFuncPtr)(CollectiveArgs &);
class CollectiveArgs
{
public:
// Arguments to execute
int globalRank;
int totalRanks;
int deviceId;
ncclFunc_t funcType;
ncclDataType_t dataType;
ncclRedOp_t redOp;
int root; // Used as "peer" for Send/Recv
size_t numInputElements;
size_t numOutputElements;
ScalarTransport scalarTransport; // Used for custom reduction operators
PtrUnion localScalar;
int scalarMode; // -1 if scalar not used
// Data
PtrUnion inputGpu;
PtrUnion outputGpu;
PtrUnion outputCpu;
PtrUnion expected;
bool inPlace;
bool useManagedMem;
size_t numInputBytesAllocated;
size_t numOutputBytesAllocated;
size_t numInputElementsAllocated;
size_t numOutputElementsAllocated;
// Set collective arguments
ErrCode SetArgs(int const globalRank,
int const totalRanks,
int const deviceId,
ncclFunc_t const funcType,
ncclDataType_t const dataType,
ncclRedOp_t const redOp,
int const root,
size_t const numInputElements,
size_t const numOutputElements,
ScalarTransport const scalarsPerRank,
int const scalarMode = -1);
// Allocates GPU memory for input/output and CPU memory for expected
// When inPlace is true, input and output share the same memory
ErrCode AllocateMem(bool const inPlace,
bool const useManagedMem);
// Execute the provided data preparation function to fill input and compute expected results
ErrCode PrepareData(CollFuncPtr const prepareDataFunc);
// Compare outputs to expected values
ErrCode ValidateResults();
// Deallocate memory
ErrCode DeallocateMem();
// Provide a description for the current collective arguments
std::string GetDescription() const;
// Returns the number of inputs/outputs based on collective function type
static void GetNumElementsForFuncType(ncclFunc_t const funcType,
int const N,
int const totalRanks,
int* numInputElements,
int* numOutputElements);
// Returns true if collective function performs reduction
static bool UsesReduce(ncclFunc_t const funcType);
// Returns true if collective function utilizes a root rank
static bool UsesRoot(ncclFunc_t const funcType);
};
}
+161
Näytä tiedosto
@@ -0,0 +1,161 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "EnvVars.hpp"
#include "CollectiveArgs.hpp"
#include <cstdlib>
namespace RcclUnitTesting
{
int const UT_SINGLE_PROCESS = (1<<0);
int const UT_MULTI_PROCESS = (1<<1);
hsa_status_t CountGpus(hsa_agent_t agent, void* data)
{
int* currCount = (int*)data;
hsa_device_type_t device;
hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device);
if (device == HSA_DEVICE_TYPE_GPU)
*currCount = *currCount + 1;
return HSA_STATUS_SUCCESS;
}
EnvVars::EnvVars()
{
// Collect number of GPUs available
// NOTE: Cannot use HIP call prior to launching child processes via fork so use HSA
int numDevicesAvailable = 0;
hsa_init();
hsa_iterate_agents(CountGpus, &numDevicesAvailable);
hsa_shut_down();
showNames = GetEnvVar("UT_SHOW_NAMES" , 1);
minGpus = GetEnvVar("UT_MIN_GPUS" , 2);
maxGpus = GetEnvVar("UT_MAX_GPUS" , numDevicesAvailable);
processMask = GetEnvVar("UT_PROCESS_MASK", UT_SINGLE_PROCESS | UT_MULTI_PROCESS);
verbose = GetEnvVar("UT_VERBOSE" , 0);
printValues = GetEnvVar("UT_PRINT_VALUES", 0);
// Limit number of supported reduction operators to just ncclSum if only allReduce is built
#ifdef BUILD_ALLREDUCE_ONLY
int numOps = 1;
#else
int numOps = ncclNumOps;
#endif
std::vector<std::string> redOpStrings = GetEnvVarsList("UT_REDOPS");
for (auto s : redOpStrings)
{
for (int i = 0; i < numOps; ++i)
{
if (!strcmp(s.c_str(), ncclRedOpNames[i]))
{
redOps.push_back((ncclRedOp_t)i);
break;
}
}
}
// Default back to all ops if no strings are found
if (redOps.empty())
{
for (int i = 0; i < numOps; i++)
redOps.push_back((ncclRedOp_t)i);
}
// Limit number of supported datatypes if only allReduce is built
std::vector<std::string> dtStrings = GetEnvVarsList("UT_DATATYPES");
for (auto s : dtStrings)
{
for (int i = 0; i < ncclNumTypes; ++i)
{
if (!strcmp(s.c_str(), ncclDataTypeNames[i]))
{
#ifdef BUILD_ALLREDUCE_ONLY
if (i == ncclFloat32)
#endif
{
dataTypes.push_back((ncclDataType_t)i);
}
}
}
}
// Default option if no valid datatypes are found in env var
if (dataTypes.empty())
{
dataTypes.push_back(ncclFloat32);
// Skip all but 32-bit floats if only AllReduce is being built
#ifndef BUILD_ALLREDUCE_ONLY
dataTypes.push_back(ncclInt8);
dataTypes.push_back(ncclUint8);
dataTypes.push_back(ncclInt32);
dataTypes.push_back(ncclUint32);
dataTypes.push_back(ncclInt64);
dataTypes.push_back(ncclUint64);
// Half-precision floats disabled due to lack of host-side support
// dataTypes.push_back(ncclFloat16);
dataTypes.push_back(ncclFloat32);
dataTypes.push_back(ncclFloat64);
dataTypes.push_back(ncclBfloat16);
#endif
}
}
std::vector<ncclRedOp_t> const& EnvVars::GetAllSupportedRedOps()
{
return redOps;
}
std::vector<ncclDataType_t> const& EnvVars::GetAllSupportedDataTypes()
{
return dataTypes;
}
int EnvVars::GetEnvVar(std::string const varname, int defaultValue)
{
if (getenv(varname.c_str()))
return atoi(getenv(varname.c_str()));
return defaultValue;
};
std::vector<std::string> EnvVars::GetEnvVarsList(std::string const varname)
{
std::vector<std::string> result;
if (getenv(varname.c_str()))
{
char* token = strtok(getenv(varname.c_str()), ",;");
while (token != NULL)
{
result.push_back(token);
token = strtok(NULL, ",;");
}
}
return result;
}
void EnvVars::ShowConfig()
{
std::vector<std::pair<std::string, std::string>> supported =
{
std::make_pair("UT_SHOW_NAMES" , "Show test case names"),
std::make_pair("UT_MIN_GPUS" , "Minimum number of GPUs to use"),
std::make_pair("UT_MAX_GPUS" , "Maximum number of GPUs to use"),
std::make_pair("UT_PROCESS_MASK", "Whether to run single/multi process"),
std::make_pair("UT_VERBOSE" , "Show verbose unit test output"),
std::make_pair("UT_REDOPS" , "List of reduction ops to test"),
std::make_pair("UT_DATATYPES" , "List of datatypes to test"),
std::make_pair("UT_PRINT_VALUES", "Print array values (# of values to print, < 0 for all)")
};
printf("================================================================================\n");
printf(" Environment variables:\n");
for (auto p : supported)
{
printf(" - %-20s %-40s %s\n", p.first.c_str(), p.second.c_str(),
getenv(p.first.c_str()) ? getenv(p.first.c_str()) : "<unset>");
}
printf("================================================================================\n");
}
}
+44
Näytä tiedosto
@@ -0,0 +1,44 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#pragma once
#include <hsa/hsa.h>
#include <vector>
#include "rccl.h"
namespace RcclUnitTesting
{
// Helper function to count the number of GPUs on system
static hsa_status_t CountGpus(hsa_agent_t agent, void* data);
// Helper class to track environment variables that affect the unit tests
class EnvVars
{
public:
bool showNames; // List test case names during run [UT_SHOW_NAMES]
int minGpus; // Set the minimum number of GPUs to use [UT_MIN_GPUS]
int maxGpus; // Set the maximum number of GPUs to use [UT_MAX_GPUS]
int processMask; // Filter single/multi process [UT_PROCESS_MASK]
bool verbose; // Show verbose TestBed output for debug [UT_VERBOSE]
int printValues; // Print out input/output/expected arrays [UT_PRINT_VALUES]
// Constructor that parses and collects environment variables
EnvVars();
std::vector<ncclRedOp_t> const& GetAllSupportedRedOps();
std::vector<ncclDataType_t> const& GetAllSupportedDataTypes();
static void ShowConfig();
protected:
std::vector<ncclRedOp_t> redOps; // Supported reduction ops [UT_REDOPS]
std::vector<ncclDataType_t> dataTypes; // Support datatypes [UT_DATATYPES]
// Helper functions to parse environment variables
int GetEnvVar(std::string const varname, int defaultValue);
std::vector<std::string> GetEnvVarsList(std::string const varname);
};
}
+38
Näytä tiedosto
@@ -0,0 +1,38 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#pragma once
namespace RcclUnitTesting
{
typedef enum
{
TEST_SUCCESS = 0,
TEST_FAIL = 1
} ErrCode;
#define ERROR(...) printf("\033[0;31m" "[ ERROR ] " "\033[0m" __VA_ARGS__)
#define INFO(...) printf("[ INFO ] " __VA_ARGS__)
#define CHECK_CALL(func) \
{ \
ErrCode status = func; \
if (status != TEST_SUCCESS) \
{ \
ERROR("Error in call %s\n", #func); \
return status; \
} \
}
#define CHECK_HIP(func) \
{ \
hipError_t error = (func); \
if (error != hipSuccess) \
{ \
fprintf(stderr, "\033[0;33" "[ ERROR ] HIP error: %s\n" "\033[m", hipGetErrorString(error)); \
return TEST_FAIL; \
} \
}
}
+342
Näytä tiedosto
@@ -0,0 +1,342 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "CollectiveArgs.hpp"
#include "PrepDataFuncs.hpp"
#include <cstdio>
#include <hip/hip_runtime.h>
namespace RcclUnitTesting
{
ErrCode DefaultPrepareDataFunc(CollectiveArgs &collArgs)
{
switch (collArgs.funcType)
{
case ncclCollBroadcast: return DefaultPrepData_Broadcast(collArgs);
case ncclCollReduce: return DefaultPrepData_Reduce(collArgs, false);
case ncclCollAllGather: return DefaultPrepData_Gather(collArgs, true);
case ncclCollReduceScatter: return DefaultPrepData_ReduceScatter(collArgs);
case ncclCollAllReduce: return DefaultPrepData_Reduce(collArgs, true);
case ncclCollGather: return DefaultPrepData_Gather(collArgs, false);
case ncclCollScatter: return DefaultPrepData_Scatter(collArgs);
case ncclCollAllToAll: return DefaultPrepData_AllToAll(collArgs);
//case ncclCollSendRecv: return DefaultPrepData_SendRecv(collArgs);
default:
ERROR("Unknown func type %d\n", collArgs.funcType);
return TEST_FAIL;
}
}
ErrCode CheckAllocation(CollectiveArgs const& collArgs)
{
if (collArgs.numInputElements > collArgs.numInputElementsAllocated)
{
ERROR("Number of input elements (%lu) exceeds the number of allocated input elements (%lu)\n",
collArgs.numInputElements, collArgs.numInputElementsAllocated);
return TEST_FAIL;
}
if (collArgs.numOutputElements > collArgs.numOutputElementsAllocated)
{
ERROR("Number of output elements (%lu) exceeds the number of allocated output elements (%lu)\n",
collArgs.numOutputElements, collArgs.numOutputElementsAllocated);
return TEST_FAIL;
}
return TEST_SUCCESS;
}
ErrCode DefaultPrepData_Broadcast(CollectiveArgs &collArgs)
{
CHECK_CALL(CheckAllocation(collArgs));
if (collArgs.numInputElements != collArgs.numOutputElements)
{
ERROR("Number of input elements must match number of output elements for Broadcast\n");
return TEST_FAIL;
}
size_t const numBytes = collArgs.numInputElements * DataTypeToBytes(collArgs.dataType);
// Clear output for all ranks (done before filling input in case of in-place)
CHECK_CALL(collArgs.outputGpu.ClearGpuMem(numBytes));
// Only root needs input pattern
if (collArgs.globalRank == collArgs.root)
CHECK_CALL(collArgs.inputGpu.FillPattern(collArgs.dataType,
collArgs.numInputElements,
collArgs.root, true));
// Otherwise all other ranks expected output is the same as input of root
return collArgs.expected.FillPattern(collArgs.dataType,
collArgs.numInputElements,
collArgs.root,
false);
}
ErrCode DefaultPrepData_Reduce(CollectiveArgs &collArgs, bool const isAllReduce)
{
CHECK_CALL(CheckAllocation(collArgs));
if (collArgs.numInputElements != collArgs.numOutputElements)
{
ERROR("Number of input elements must match number of output elements for Reduce\n");
return TEST_FAIL;
}
size_t const numBytes = collArgs.numInputElements * DataTypeToBytes(collArgs.dataType);
// Clear output for all ranks (done before filling input in case of in-place)
CHECK_CALL(collArgs.outputGpu.ClearGpuMem(numBytes));
// Clear expected buffer for holding reduction
PtrUnion result;
CHECK_CALL(result.Attach(collArgs.expected));
CHECK_CALL(result.ClearCpuMem(numBytes));
// If average or custom reduction operator is used, perform a summation instead
ncclRedOp_t const tempOp = (collArgs.redOp >= ncclAvg ? ncclSum : collArgs.redOp);
// Loop over each rank and generate their input into a temp buffer, then reduce
PtrUnion scalarsPerRank;
scalarsPerRank.Attach(collArgs.scalarTransport.ptr);
PtrUnion tempInputCpu;
CHECK_CALL(tempInputCpu.Attach(collArgs.outputCpu));
for (int rank = 0; rank < collArgs.totalRanks; ++rank)
{
// Generate temporary input for this rank
CHECK_CALL(tempInputCpu.FillPattern(collArgs.dataType, collArgs.numInputElements, rank, false));
// Copy the pre-scaled input into GPU memory for the correct rank
if (rank == collArgs.globalRank)
{
CHECK_HIP(hipMemcpy(collArgs.inputGpu.ptr, tempInputCpu.ptr, numBytes, hipMemcpyHostToDevice));
}
// Scale the temporary input by local scalar for this rank
// (Used by custom reduction ops)
if (collArgs.scalarMode >= 0)
{
CHECK_CALL(tempInputCpu.Scale(collArgs.dataType, collArgs.numInputElements,
scalarsPerRank, rank));
}
// Any rank that requires output reduces the scaled-inputs
if (isAllReduce || collArgs.root == collArgs.globalRank)
{
if (rank == 0)
{
memcpy(result.ptr, tempInputCpu.ptr, numBytes);
}
else
{
CHECK_CALL(result.Reduce(collArgs.dataType, collArgs.numInputElements,
tempInputCpu, tempOp));
}
}
}
// Perform averaging if necessary
if (collArgs.redOp == ncclAvg && (isAllReduce || collArgs.root == collArgs.globalRank))
{
CHECK_CALL(result.DivideByInt(collArgs.dataType, collArgs.numInputElements, collArgs.totalRanks));
}
return TEST_SUCCESS;
}
ErrCode DefaultPrepData_Gather(CollectiveArgs &collArgs, bool const isAllGather)
{
CHECK_CALL(CheckAllocation(collArgs));
if (collArgs.totalRanks * collArgs.numInputElements != collArgs.numOutputElements)
{
ERROR("# of output elements must be total ranks * # input elements for AllGather\n");
return TEST_FAIL;
}
// Clear output for all ranks (done before filling input in case of in-place)
size_t const numInputBytes = collArgs.numInputElements * DataTypeToBytes(collArgs.dataType);
size_t const numOutputBytes = collArgs.numOutputElements * DataTypeToBytes(collArgs.dataType);
CHECK_CALL(collArgs.inputGpu.ClearGpuMem(numInputBytes));
CHECK_CALL(collArgs.outputGpu.ClearGpuMem(numOutputBytes));
PtrUnion result;
CHECK_CALL(result.Attach(collArgs.expected.ptr));
CHECK_CALL(result.ClearCpuMem(numOutputBytes));
// Use outputCpu buffer to store temporary input
PtrUnion tempInputCpu;
CHECK_CALL(tempInputCpu.Attach(collArgs.outputCpu.ptr));
for (int rank = 0; rank < collArgs.totalRanks; ++rank)
{
CHECK_CALL(tempInputCpu.FillPattern(collArgs.dataType, collArgs.numInputElements, rank, false));
if (rank == collArgs.globalRank)
{
CHECK_HIP(hipMemcpy(collArgs.inputGpu.ptr, tempInputCpu.ptr, numInputBytes, hipMemcpyHostToDevice));
}
if (isAllGather || collArgs.root == collArgs.globalRank)
{
memcpy(result.I1 + (rank * numInputBytes), tempInputCpu.ptr, numInputBytes);
}
}
return TEST_SUCCESS;
}
ErrCode DefaultPrepData_ReduceScatter(CollectiveArgs &collArgs)
{
CHECK_CALL(CheckAllocation(collArgs));
if (collArgs.numInputElements != collArgs.numOutputElements * collArgs.totalRanks)
{
ERROR("# of input elements must be total ranks * # output elements for ReduceScatter\n");
return TEST_FAIL;
}
size_t const numInputBytes = collArgs.numInputElements * DataTypeToBytes(collArgs.dataType);
size_t const numOutputBytes = collArgs.numOutputElements * DataTypeToBytes(collArgs.dataType);
// Clear output for all ranks (done before filling input in case of in-place)
CHECK_CALL(collArgs.outputGpu.ClearGpuMem(numOutputBytes));
PtrUnion tempInputCpu;
PtrUnion tempResultCpu;
CHECK_CALL(tempInputCpu.AllocateCpuMem(numInputBytes));
CHECK_CALL(tempResultCpu.AllocateCpuMem(numInputBytes));
CHECK_CALL(tempResultCpu.ClearCpuMem(numInputBytes));
// If average or custom reduction operator is used, perform a summation instead
ncclRedOp_t const tempOp = (collArgs.redOp >= ncclAvg ? ncclSum : collArgs.redOp);
// Loop over each rank and generate the input / scale / reduce
PtrUnion scalarsPerRank;
scalarsPerRank.Attach(collArgs.scalarTransport.ptr);
for (int rank = 0; rank < collArgs.totalRanks; ++rank)
{
CHECK_CALL(tempInputCpu.FillPattern(collArgs.dataType, collArgs.numInputElements, rank, false));
if (rank == collArgs.globalRank)
{
if (hipMemcpy(collArgs.inputGpu.ptr, tempInputCpu.ptr, numInputBytes, hipMemcpyHostToDevice) != hipSuccess)
{
ERROR("hipMemcpy to input failed\n");
CHECK_CALL(tempInputCpu.FreeCpuMem());
CHECK_CALL(tempResultCpu.FreeCpuMem());
return TEST_FAIL;
}
}
// Scale the temporary input by local scalar for this rank
// (Used by custom reduction ops)
if (collArgs.scalarMode >= 0)
{
CHECK_CALL(tempInputCpu.Scale(collArgs.dataType, collArgs.numInputElements,
scalarsPerRank, rank));
}
if (rank == 0)
{
memcpy(tempResultCpu.ptr, tempInputCpu.ptr, numInputBytes);
}
else
{
CHECK_CALL(tempResultCpu.Reduce(collArgs.dataType, collArgs.numInputElements,
tempInputCpu, tempOp));
}
}
// Perform averaging if necessary
if (collArgs.redOp == ncclAvg)
{
CHECK_CALL(tempResultCpu.DivideByInt(collArgs.dataType, collArgs.numInputElements, collArgs.totalRanks));
}
// Copy over portion of result
memcpy(collArgs.expected.I1,
tempResultCpu.I1 + collArgs.globalRank * numOutputBytes,
numOutputBytes);
CHECK_CALL(tempInputCpu.FreeCpuMem());
CHECK_CALL(tempResultCpu.FreeCpuMem());
return TEST_SUCCESS;
}
ErrCode DefaultPrepData_Scatter(CollectiveArgs &collArgs)
{
CHECK_CALL(CheckAllocation(collArgs));
if (collArgs.numInputElements != collArgs.numOutputElements * collArgs.totalRanks)
{
ERROR("# of input elements must be total ranks * # output elements for Scatter\n");
return TEST_FAIL;
}
size_t const numInputBytes = collArgs.numInputElements * DataTypeToBytes(collArgs.dataType);
size_t const numOutputBytes = collArgs.numOutputElements * DataTypeToBytes(collArgs.dataType);
// Clear outputs on all ranks (prior to input in case of in-place)
collArgs.outputGpu.ClearGpuMem(numOutputBytes);
// Generate input as if on root rank - each rank will receive a portion
PtrUnion tempInput;
tempInput.AllocateCpuMem(numInputBytes);
tempInput.FillPattern(collArgs.dataType, collArgs.numInputElements, collArgs.root, false);
// Copy input to root rank
if (collArgs.globalRank == collArgs.root)
{
if (hipMemcpy(collArgs.inputGpu.ptr, tempInput.ptr, numInputBytes, hipMemcpyHostToDevice) != hipSuccess)
{
ERROR("hipMemcpy to input failed\n");
tempInput.FreeCpuMem();
return TEST_FAIL;
}
}
else
{
collArgs.inputGpu.ClearGpuMem(numInputBytes);
}
// Each rank receive a portion of the input
memcpy(collArgs.expected.U1, tempInput.U1 + (collArgs.globalRank * numOutputBytes), numOutputBytes);
tempInput.FreeCpuMem();
return TEST_SUCCESS;
}
ErrCode DefaultPrepData_AllToAll(CollectiveArgs &collArgs)
{
CHECK_CALL(CheckAllocation(collArgs));
if (collArgs.numInputElements != collArgs.numOutputElements)
{
ERROR("Number of input elements must match number of output elements for AllToAll\n");
return TEST_FAIL;
}
if (collArgs.numInputElements % collArgs.totalRanks)
{
ERROR("Input / Output size for AllToAll must be a multiple of %d\n", collArgs.totalRanks);
return TEST_FAIL;
}
size_t const numInputBytes = collArgs.numInputElements * DataTypeToBytes(collArgs.dataType);
size_t const numOutputBytes = collArgs.numOutputElements * DataTypeToBytes(collArgs.dataType);
size_t const numBytes = numInputBytes / collArgs.totalRanks;
// Clear outputs on all ranks (prior to input in case of in-place)
collArgs.outputGpu.ClearGpuMem(numOutputBytes);
// Generate input on root rank - each rank will receive a portion
PtrUnion tempInput;
tempInput.Attach(collArgs.outputCpu);
for (int rank = 0; rank < collArgs.totalRanks; ++rank)
{
tempInput.FillPattern(collArgs.dataType, collArgs.numInputElements, rank, false);
// Copy input
if (rank == collArgs.globalRank)
{
CHECK_HIP(hipMemcpy(collArgs.inputGpu.ptr, tempInput.ptr, numInputBytes, hipMemcpyHostToDevice));
}
memcpy(collArgs.expected.U1 + (numBytes * rank), tempInput.U1 + (numBytes * collArgs.globalRank), numBytes);
}
return TEST_SUCCESS;
}
}
+26
Näytä tiedosto
@@ -0,0 +1,26 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#pragma once
#include "ErrCode.hpp"
namespace RcclUnitTesting
{
class CollectiveArgs;
// Checks that enough memory has been allocated
ErrCode CheckAllocation(CollectiveArgs const& collArgs);
// Default PrepareData functions
// PrepareData functions are responsible for setting up input / expected for the given collArgs
ErrCode DefaultPrepareDataFunc(CollectiveArgs &collArgs);
ErrCode DefaultPrepData_Broadcast(CollectiveArgs &collArgs);
ErrCode DefaultPrepData_Reduce(CollectiveArgs &collArgs, bool const isAllReduce);
ErrCode DefaultPrepData_Gather(CollectiveArgs &collArgs, bool const isAllGather);
ErrCode DefaultPrepData_ReduceScatter(CollectiveArgs &collArgs);
ErrCode DefaultPrepData_Scatter(CollectiveArgs &collArgs);
ErrCode DefaultPrepData_AllToAll(CollectiveArgs &collArgs);
ErrCode DefaultPrepData_SendRecv(CollectiveArgs &collArgs);
}
+354
Näytä tiedosto
@@ -0,0 +1,354 @@
#include "PtrUnion.hpp"
namespace RcclUnitTesting
{
size_t DataTypeToBytes(ncclDataType_t const dataType)
{
switch (dataType)
{
case ncclInt8: return 1;
case ncclUint8: return 1;
case ncclInt32: return 4;
case ncclUint32: return 4;
case ncclInt64: return 8;
case ncclUint64: return 8;
case ncclFloat16: return 2;
case ncclFloat32: return 4;
case ncclFloat64: return 8;
case ncclBfloat16: return 2;
default:
ERROR("Unsupported datatype (%d)\n", dataType);
exit(0);
}
}
ErrCode PtrUnion::Attach(void *ptr)
{
this->ptr = ptr;
return TEST_SUCCESS;
}
ErrCode PtrUnion::Attach(PtrUnion ptrUnion)
{
this->ptr = ptrUnion.ptr;
return TEST_SUCCESS;
}
ErrCode PtrUnion::AllocateGpuMem(size_t const numBytes, bool const useManagedMem)
{
if (numBytes)
{
if (useManagedMem)
{
if (hipMallocManaged(&I1, numBytes) != hipSuccess)
{
ERROR("Unable to allocate managed memory of GPU memory (%lu bytes)\n", numBytes);
return TEST_FAIL;
}
}
else
{
if (hipMalloc(&I1, numBytes) != hipSuccess)
{
ERROR("Unable to allocate memory of GPU memory (%lu bytes)\n", numBytes);
return TEST_FAIL;
}
}
}
return TEST_SUCCESS;
}
ErrCode PtrUnion::AllocateCpuMem(size_t const numBytes)
{
if (numBytes)
{
this->ptr = calloc(numBytes, 1);
if (!ptr)
{
ERROR("Unable to allocate memory (%lu bytes)\n", numBytes);
return TEST_FAIL;
}
}
return TEST_SUCCESS;
}
ErrCode PtrUnion::FreeGpuMem()
{
if (this->ptr != nullptr)
{
hipFree(this->ptr);
this->ptr = nullptr;
}
return TEST_SUCCESS;
}
ErrCode PtrUnion::FreeCpuMem()
{
if (this->ptr != nullptr)
{
free(this->ptr);
this->ptr = nullptr;
}
return TEST_SUCCESS;
}
ErrCode PtrUnion::ClearGpuMem(size_t const numBytes)
{
if (hipMemset(this->ptr, 0, numBytes) != hipSuccess)
{
ERROR("Unable to call hipMemset\n");
return TEST_FAIL;
}
return TEST_SUCCESS;
}
ErrCode PtrUnion::ClearCpuMem(size_t const numBytes)
{
memset(this->ptr, 0, numBytes);
return TEST_SUCCESS;
}
ErrCode PtrUnion::FillPattern(ncclDataType_t const dataType,
size_t const numElements,
int const globalRank,
bool const isGpuMem)
{
PtrUnion temp;
size_t const numBytes = numElements * DataTypeToBytes(dataType);
// If this is GPU memory, create a CPU temp buffer otherwise fill CPU memory directly
if (isGpuMem)
temp.AllocateCpuMem(numBytes);
else
temp.Attach(this->ptr);
for (int i = 0; i < numElements; i++)
{
int valueI = (globalRank + i) % 256;
double valueF = 1.0L/((double)valueI+1.0L);
temp.Set(dataType, i, valueI, valueF);
}
// If this is GPU memory, copy from CPU temp buffer
if (isGpuMem)
{
if (hipMemcpy(this->ptr, temp.ptr, numBytes, hipMemcpyHostToDevice) != hipSuccess)
{
ERROR("Unable to fill input with pattern for rank %d\n", globalRank);
return TEST_FAIL;
}
temp.FreeCpuMem();
}
return TEST_SUCCESS;
}
ErrCode PtrUnion::Set(ncclDataType_t const dataType, int const idx, int valueI, double valueF)
{
switch (dataType)
{
case ncclInt8: I1[idx] = valueI; break;
case ncclUint8: U1[idx] = valueI; break;
case ncclInt32: I4[idx] = valueI; break;
case ncclUint32: U4[idx] = valueI; break;
case ncclInt64: I8[idx] = valueI; break;
case ncclUint64: U8[idx] = valueI; break;
case ncclFloat32: F4[idx] = valueF; break;
case ncclFloat64: F8[idx] = valueF; break;
case ncclBfloat16: B2[idx] = rccl_bfloat16(static_cast<float>(valueF)); break;
default:
ERROR("Unsupported datatype\n");
return TEST_FAIL;
}
return TEST_SUCCESS;
}
ErrCode PtrUnion::Get(ncclDataType_t const dataType, int const idx, int& valueI, double& valueF) const
{
switch (dataType)
{
case ncclInt8: valueI = I1[idx]; break;
case ncclUint8: valueI = I1[idx]; break;
case ncclInt32: valueI = I4[idx]; break;
case ncclUint32: valueI = U4[idx]; break;
case ncclInt64: valueI = I8[idx]; break;
case ncclUint64: valueI = U8[idx]; break;
case ncclFloat32: valueF = F4[idx]; break;
case ncclFloat64: valueF = F8[idx]; break;
case ncclBfloat16: valueF = B2[idx]; break;
default:
ERROR("Unsupported datatype\n");
return TEST_FAIL;
}
return TEST_SUCCESS;
}
// Multiplies in-place each element by scalarsPerRank[rank]
ErrCode PtrUnion::Scale(ncclDataType_t const dataType,
size_t const numElements,
PtrUnion const& scalarsPerRank,
int const rank)
{
// If no scalars are provided do nothing
if (scalarsPerRank.ptr == nullptr) return TEST_SUCCESS;
for (size_t idx = 0; idx < numElements; ++idx)
{
switch (dataType)
{
case ncclInt8: I1[idx] *= scalarsPerRank.I1[rank]; break;
case ncclUint8: U1[idx] *= scalarsPerRank.U1[rank]; break;
case ncclInt32: I4[idx] *= scalarsPerRank.I4[rank]; break;
case ncclUint32: U4[idx] *= scalarsPerRank.U4[rank]; break;
case ncclInt64: I8[idx] *= scalarsPerRank.I8[rank]; break;
case ncclUint64: U8[idx] *= scalarsPerRank.U8[rank]; break;
case ncclFloat32: F4[idx] *= scalarsPerRank.F4[rank]; break;
case ncclFloat64: F8[idx] *= scalarsPerRank.F8[rank]; break;
case ncclBfloat16: B2[idx] *= scalarsPerRank.B2[rank]; break;
default:
ERROR("Unsupported datatype\n");
return TEST_FAIL;
}
}
return TEST_SUCCESS;
}
ErrCode PtrUnion::Reduce(ncclDataType_t const dataType,
size_t const numElements,
PtrUnion const& inputCpu,
ncclRedOp_t const op)
{
if (inputCpu.ptr == nullptr)
{
ERROR("Input pointer to Reduce should not be nullptr\n");
return TEST_FAIL;
}
for (size_t idx = 0; idx < numElements; ++idx)
{
switch (dataType)
{
case ncclInt8: I1[idx] = ReduceOp(op, I1[idx], inputCpu.I1[idx]); break;
case ncclUint8: U1[idx] = ReduceOp(op, U1[idx], inputCpu.U1[idx]); break;
case ncclInt32: I4[idx] = ReduceOp(op, I4[idx], inputCpu.I4[idx]); break;
case ncclUint32: U4[idx] = ReduceOp(op, U4[idx], inputCpu.U4[idx]); break;
case ncclInt64: I8[idx] = ReduceOp(op, I8[idx], inputCpu.I8[idx]); break;
case ncclUint64: U8[idx] = ReduceOp(op, U8[idx], inputCpu.U8[idx]); break;
case ncclFloat32: F4[idx] = ReduceOp(op, F4[idx], inputCpu.F4[idx]); break;
case ncclFloat64: F8[idx] = ReduceOp(op, F8[idx], inputCpu.F8[idx]); break;
case ncclBfloat16: B2[idx] = ReduceOp(op, B2[idx], inputCpu.B2[idx]); break;
default:
ERROR("Unsupported datatype\n");
return TEST_FAIL;
}
}
return TEST_SUCCESS;
}
ErrCode PtrUnion::DivideByInt(ncclDataType_t const dataType,
size_t const numElements,
int const divisor)
{
for (size_t idx = 0; idx < numElements; ++idx)
{
switch (dataType)
{
case ncclInt8: I1[idx] /= divisor; break;
case ncclUint8: U1[idx] /= divisor; break;
case ncclInt32: I4[idx] /= divisor; break;
case ncclUint32: U4[idx] /= divisor; break;
case ncclInt64: I8[idx] /= divisor; break;
case ncclUint64: U8[idx] /= divisor; break;
case ncclFloat32: F4[idx] /= divisor; break;
case ncclFloat64: F8[idx] /= divisor; break;
case ncclBfloat16: B2[idx] = (rccl_bfloat16((float)(B2[idx]) / divisor)); break;
default:
ERROR("Unsupported datatype\n");
return TEST_FAIL;
}
}
return TEST_SUCCESS;
}
ErrCode PtrUnion::IsEqual(ncclDataType_t const dataType,
size_t const numElements,
PtrUnion const& expected,
bool const verbose,
bool& isMatch)
{
isMatch = true;
size_t idx = 0;
for (idx = 0; idx < numElements; ++idx)
{
switch (dataType)
{
case ncclInt8: isMatch = (I1[idx] == expected.I1[idx]); break;
case ncclUint8: isMatch = (U1[idx] == expected.U1[idx]); break;
case ncclInt32: isMatch = (I4[idx] == expected.I4[idx]); break;
case ncclUint32: isMatch = (U4[idx] == expected.U4[idx]); break;
case ncclInt64: isMatch = (I8[idx] == expected.I8[idx]); break;
case ncclUint64: isMatch = (U8[idx] == expected.U8[idx]); break;
case ncclFloat32: isMatch = (fabs(F4[idx] - expected.F4[idx]) < 1e-5); break;
case ncclFloat64: isMatch = (fabs(F8[idx] - expected.F8[idx]) < 1e-12); break;
case ncclBfloat16: isMatch = (fabs((float)B2[idx] - (float)expected.B2[idx]) < 9e-2); break;
default:
ERROR("Unsupported datatype\n");
return TEST_FAIL;
}
if (!isMatch) break;
}
if (verbose && !isMatch)
{
switch (dataType)
{
case ncclInt8:
ERROR("Expected output: %d. Actual output: %d at index %lu\n", expected.I1[idx], I1[idx], idx); break;
case ncclUint8:
ERROR("Expected output: %u. Actual output: %u at index %lu\n", expected.U1[idx], U1[idx], idx); break;
case ncclInt32:
ERROR("Expected output: %d. Actual output: %d at index %lu\n", expected.I4[idx], I4[idx], idx); break;
case ncclUint32:
ERROR("Expected output: %u. Actual output: %u at index %lu\n", expected.U4[idx], U4[idx], idx); break;
case ncclInt64:
ERROR("Expected output: %ld. Actual output: %ld at index %lu\n", expected.I8[idx], I8[idx], idx); break;
case ncclUint64:
ERROR("Expected output: %lu. Actual output: %lu at index %lu\n", expected.U8[idx], U8[idx], idx); break;
case ncclFloat32:
ERROR("Expected output: %f. Actual output: %f at index %lu\n", expected.F4[idx], F4[idx], idx); break;
case ncclFloat64:
ERROR("Expected output: %lf. Actual output: %lf at index %lu\n", expected.F8[idx], F8[idx], idx); break;
case ncclBfloat16:
ERROR("Expected output: %f. Actual output: %f at index %lu\n", (float)expected.B2[idx], (float)B2[idx], idx); break;
default:
break;
}
}
return TEST_SUCCESS;
}
std::string PtrUnion::ToString(ncclDataType_t const dataType,
size_t const numElements) const
{
std::stringstream ss;
for (int i = 0; i < numElements; i++)
{
if (i) ss << " ";
switch (dataType)
{
case ncclInt8: ss << I1[i]; break;
case ncclUint8: ss << U1[i]; break;
case ncclInt32: ss << I4[i]; break;
case ncclUint32: ss << U4[i]; break;
case ncclInt64: ss << I8[i]; break;
case ncclUint64: ss << U8[i]; break;
case ncclFloat32: ss << F4[i]; break;
case ncclFloat64: ss << F8[i]; break;
case ncclBfloat16: ss << (float)B2[i]; break;
default: break;
}
}
return ss.str();
}
}
+90
Näytä tiedosto
@@ -0,0 +1,90 @@
#pragma once
#include "ErrCode.hpp"
#include "rccl.h"
#include "rccl_bfloat16.h"
namespace RcclUnitTesting
{
// Performs the various basic reduction operations
template <typename T>
T ReduceOp(ncclRedOp_t const op, T const A, T const B)
{
switch (op)
{
case ncclSum: return A + B;
case ncclProd: return A * B;
case ncclMax: return std::max(A, B);
case ncclMin: return std::min(A, B);
default:
ERROR("Unsupported reduction operator (%d)\n", op);
exit(0);
}
}
size_t DataTypeToBytes(ncclDataType_t const dataType);
// PtrUnion encapsulates a pointer of all the different supported datatypes
// NOTE: Currently half-precision float tests are unsupported due to half
// being supported on GPU only and not host
union PtrUnion
{
void* ptr;
int8_t* I1; // ncclInt8
uint8_t* U1; // ncclUint8
int32_t* I4; // ncclInt32
uint32_t* U4; // ncclUint32
int64_t* I8; // ncclInt64
uint64_t* U8; // ncclUint64
float* F4; // ncclFloat32
double* F8; // ncclFloat64
rccl_bfloat16* B2; // ncclBfloat16
ErrCode Attach(void *ptr);
ErrCode Attach(PtrUnion ptrUnion);
ErrCode AllocateGpuMem(size_t const numBytes, bool const useManagedMem = false);
ErrCode AllocateCpuMem(size_t const numBytes);
ErrCode FreeGpuMem();
ErrCode FreeCpuMem();
ErrCode ClearGpuMem(size_t const numBytes);
ErrCode ClearCpuMem(size_t const numBytes);
ErrCode FillPattern(ncclDataType_t const dataType,
size_t const numElements,
int const globalRank,
bool const isGpuMem);
ErrCode Set(ncclDataType_t const dataType, int const idx, int valueI, double valueF);
ErrCode Get(ncclDataType_t const dataType, int const idx, int& valueI, double& valueF) const;
// Multiplies in-place each element by scalarsPerRank[rank]
ErrCode Scale(ncclDataType_t const dataType,
size_t const numElements,
PtrUnion const& scalarsPerRank,
int const rank);
// Reduces input into this PtrUnion
ErrCode Reduce(ncclDataType_t const dataType,
size_t const numElements,
PtrUnion const& inputCpu,
ncclRedOp_t const op);
// Divide each element by a integer value
ErrCode DivideByInt(ncclDataType_t const dataType,
size_t const numElements,
int const divisor);
// Compares for equality (fuzzy comparision for floating point types)
ErrCode IsEqual(ncclDataType_t const dataType,
size_t const numElements,
PtrUnion const& expected,
bool const verbose,
bool& isMatch);
// Output to string (for debug)
std::string ToString(ncclDataType_t const dataType,
size_t const numElements) const;
};
}
+485
Näytä tiedosto
@@ -0,0 +1,485 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <unistd.h>
#include "TestBed.hpp"
#include <rccl.h>
#define PIPE_WRITE(childId, val) \
ASSERT_EQ(write(childList[childId]->parentWriteFd, &val, sizeof(val)), sizeof(val))
#define PIPE_CHECK(childId) \
{ \
int response = 0; \
ASSERT_EQ(read(childList[childId]->parentReadFd, &response, sizeof(int)), sizeof(int)); \
ASSERT_EQ(response, TEST_SUCCESS); \
}
namespace RcclUnitTesting
{
TestBed::TestBed() :
numDevicesAvailable(0),
numActiveChildren(0),
numActiveRanks(0)
{
// Set NCCL_COMM_ID to use a local port to avoid passing ncclCommId
// Calling ncclGetUniqueId would initialize HIP, which should not be done prior to fork
std::string localPort = "55513";
if (!getenv("NCCL_COMM_ID"))
{
char hostname[HOST_NAME_MAX+1];
gethostname(hostname, HOST_NAME_MAX+1);
std::string hostnameString(hostname);
hostnameString.append(":55513");
setenv("NCCL_COMM_ID", hostnameString.c_str(), 0);
if (ev.verbose) INFO("NCCL_COMM_ID set to %s\n", hostnameString.c_str());
}
// Collect the number of GPUs
this->numDevicesAvailable = ev.maxGpus;
if (ev.verbose) INFO("Detected %d GPUs\n", this->numDevicesAvailable);
// Create the maximum number of possible child processes (1 per GPU)
// Parent and child communicate via pipes
childList.resize(this->numDevicesAvailable);
for (int childId = 0; childId < this->numDevicesAvailable; ++childId)
{
childList[childId] = new TestBedChild(childId, ev.verbose, ev.printValues);
if (childList[childId]->InitPipes() != TEST_SUCCESS)
{
ERROR("Unable to create pipes to child process\n");
return;
}
pid_t pid = fork();
if (pid == 0)
{
// Child process enters execution loop
childList[childId]->StartExecutionLoop();
return;
}
else
{
// Parent records child process ID and closes unused ends of pipe
childList[childId]->pid = pid;
close(childList[childId]->childWriteFd);
close(childList[childId]->childReadFd);
}
}
}
void TestBed::InitComms(std::vector<std::vector<int>> const& deviceIdsPerProcess,
int const numCollectivesInGroup)
{
// Count up the total number of GPUs to use and track child/deviceId per rank
this->numActiveChildren = deviceIdsPerProcess.size();
this->numActiveRanks = 0;
this->numCollectivesInGroup = numCollectivesInGroup;
this->rankToChildMap.clear();
this->rankToDeviceMap.clear();
if (ev.verbose) INFO("Setting up %d active child processes\n", this->numActiveChildren);
for (int childId = 0; childId < this->numActiveChildren; ++childId)
{
for (auto i = 0; i < deviceIdsPerProcess[childId].size(); ++i)
{
this->rankToChildMap.push_back(childId);
this->rankToDeviceMap.push_back(deviceIdsPerProcess[childId][i]);
++this->numActiveRanks;
}
}
// Send InitComms command to each active child process
int const cmd = TestBedChild::CHILD_INIT_COMMS;
int rankOffset = 0;
for (int childId = 0; childId < this->numActiveChildren; ++childId)
{
PIPE_WRITE(childId, cmd);
// Send total number of ranks to child process
PIPE_WRITE(childId, this->numActiveRanks);
// Send the rank offset for this child process
PIPE_WRITE(childId, rankOffset);
// Send the number of collectives to be run per group call
PIPE_WRITE(childId, numCollectivesInGroup);
// Send the GPUs this child uses
int const numGpus = deviceIdsPerProcess[childId].size();
PIPE_WRITE(childId, numGpus);
for (int i = 0; i < numGpus; i++)
PIPE_WRITE(childId, deviceIdsPerProcess[childId][i]);
rankOffset += numGpus;
}
// Wait for child acknowledgement
// This is done after previous loop to avoid deadlock as every rank needs to enter ncclInitCommRank
for (int childId = 0; childId < this->numActiveChildren; ++childId)
{
PIPE_CHECK(childId);
}
}
void TestBed::InitComms(int const numGpus, int const numCollectivesInGroup)
{
InitComms(TestBed::GetDeviceIdsList(1, numGpus), numCollectivesInGroup);
}
void TestBed::SetCollectiveArgs(ncclFunc_t const funcType,
ncclDataType_t const dataType,
ncclRedOp_t const redOp,
int const root,
size_t const numInputElements,
size_t const numOutputElements,
int const collId,
int const rank,
PtrUnion const scalarsPerRank,
int const scalarMode)
{
// Build list of ranks this applies to (-1 for rank means to set for all)
std::vector<int> rankList;
for (int i = 0; i < this->numActiveRanks; ++i)
if (rank == -1 || rank == i) rankList.push_back(i);
ScalarTransport scalarTransport;
if (scalarMode >= 0)
{
ASSERT_TRUE(scalarsPerRank.ptr != NULL);
// Capture scalars per rank in format to share with child processes
int const numBytes = this->numActiveRanks * DataTypeToBytes(dataType);
memcpy(scalarTransport.ptr, scalarsPerRank.ptr, numBytes);
}
// Loop over all ranks and send CollectiveArgs to appropriate child process
int const cmd = TestBedChild::CHILD_SET_COLL_ARGS;
for (auto currRank : rankList)
{
int const childId = rankToChildMap[currRank];
PIPE_WRITE(childId, cmd);
PIPE_WRITE(childId, currRank);
PIPE_WRITE(childId, collId);
PIPE_WRITE(childId, funcType);
PIPE_WRITE(childId, dataType);
PIPE_WRITE(childId, redOp);
PIPE_WRITE(childId, root);
PIPE_WRITE(childId, numInputElements);
PIPE_WRITE(childId, numOutputElements);
PIPE_WRITE(childId, scalarMode);
PIPE_WRITE(childId, scalarTransport);
PIPE_CHECK(childId);
}
}
void TestBed::AllocateMem(bool const inPlace,
bool const useManagedMem,
int const collId,
int const rank)
{
// Build list of ranks this applies to (-1 for rank means to set for all)
std::vector<int> rankList;
for (int i = 0; i < this->numActiveRanks; ++i)
if (rank == -1 || rank == i) rankList.push_back(i);
// Loop over all ranks and send allocation command to appropriate child process
int const cmd = TestBedChild::CHILD_ALLOCATE_MEM;
for (auto currRank : rankList)
{
int const childId = rankToChildMap[currRank];
PIPE_WRITE(childId, cmd);
PIPE_WRITE(childId, currRank);
PIPE_WRITE(childId, collId);
PIPE_WRITE(childId, inPlace);
PIPE_WRITE(childId, useManagedMem);
PIPE_CHECK(childId);
}
}
void TestBed::PrepareData(int const collId,
int const rank,
CollFuncPtr const prepDataFunc)
{
// Build list of ranks this applies to (-1 for rank means to set for all)
std::vector<int> rankList;
for (int i = 0; i < this->numActiveRanks; ++i)
if (rank == -1 || rank == i) rankList.push_back(i);
// Loop over all ranks and send prepare data command to appropriate child process
int const cmd = TestBedChild::CHILD_PREPARE_DATA;
for (auto currRank : rankList)
{
int const childId = rankToChildMap[currRank];
PIPE_WRITE(childId, cmd);
PIPE_WRITE(childId, currRank);
PIPE_WRITE(childId, collId);
PIPE_WRITE(childId, prepDataFunc);
PIPE_CHECK(childId);
}
}
void TestBed::ExecuteCollectives()
{
int const cmd = TestBedChild::CHILD_EXECUTE_COLL;
++TestBed::NumTestsRun();
// Send ExecuteColl command to each active child process
for (int childId = 0; childId < this->numActiveChildren; ++childId)
{
PIPE_WRITE(childId, cmd);
}
// Wait for child acknowledgement
for (int childId = 0; childId < this->numActiveChildren; ++childId)
{
PIPE_CHECK(childId);
}
}
void TestBed::ValidateResults(bool& isCorrect, int const collId, int const rank)
{
// Build list of ranks this applies to (-1 for rank means to set for all)
std::vector<int> rankList;
for (int i = 0; i < this->numActiveRanks; ++i)
if (rank == -1 || rank == i) rankList.push_back(i);
int const cmd = TestBedChild::CHILD_VALIDATE_RESULTS;
isCorrect = true;
// Send ValidateResults command to each active child process
for (auto currRank : rankList)
{
int const childId = rankToChildMap[currRank];
PIPE_WRITE(childId, cmd);
PIPE_WRITE(childId, currRank);
PIPE_WRITE(childId, collId);
int response = 0;
ASSERT_EQ(read(childList[childId]->parentReadFd, &response, sizeof(int)), sizeof(int));
isCorrect &= (response == TEST_SUCCESS);
}
ASSERT_EQ(isCorrect, true) << "Output does not match expected";
}
void TestBed::DeallocateMem(int const collId, int const rank)
{
// Build list of ranks this applies to (-1 for rank means to set for all)
std::vector<int> rankList;
for (int i = 0; i < this->numActiveRanks; ++i)
if (rank == -1 || rank == i) rankList.push_back(i);
int const cmd = TestBedChild::CHILD_DEALLOCATE_MEM;
for (auto currRank : rankList)
{
int const childId = rankToChildMap[currRank];
PIPE_WRITE(childId, cmd);
PIPE_WRITE(childId, currRank);
PIPE_WRITE(childId, collId);
PIPE_CHECK(childId);
}
}
void TestBed::DestroyComms()
{
int const cmd = TestBedChild::CHILD_DESTROY_COMMS;
for (int childId = 0; childId < this->numActiveChildren; ++childId)
{
// Send DestroyComms command to each active child process
PIPE_WRITE(childId, cmd);
// Wait for child acknowledgement
PIPE_CHECK(childId);
}
// Reset bookkeeping
this->numActiveChildren = 0;
this->numActiveRanks = 0;
this->numCollectivesInGroup = 0;
}
void TestBed::Finalize()
{
// Send Stop to all child processes
int const cmd = TestBedChild::CHILD_STOP;
for (int childId = 0; childId < this->numDevicesAvailable; ++childId)
{
PIPE_WRITE(childId, cmd);
// Close pipes to child process
close(childList[childId]->parentWriteFd);
close(childList[childId]->parentReadFd);
}
this->numDevicesAvailable = 0;
}
TestBed::~TestBed()
{
Finalize();
}
std::vector<ncclRedOp_t> const& TestBed::GetAllSupportedRedOps()
{
return ev.GetAllSupportedRedOps();
}
std::vector<ncclDataType_t> const& TestBed::GetAllSupportedDataTypes()
{
return ev.GetAllSupportedDataTypes();
}
std::vector<std::vector<int>> TestBed::GetDeviceIdsList(int const numProcesses,
int const numGpus)
{
std::vector<std::vector<int>> result(numProcesses);
for (int i = 0; i < numGpus; i++)
result[i % numProcesses].push_back(i);
return result;
}
std::string TestBed::GetTestCaseName(int const totalRanks,
bool const isMultiProcess,
ncclFunc_t const funcType,
ncclDataType_t const dataType,
ncclRedOp_t const redOp,
int const root,
bool const inPlace,
bool const managedMem)
{
std::stringstream ss;
ss << (isMultiProcess ? "MP" : "SP") << " ";
ss << totalRanks << " ranks ";
ss << ncclFuncNames[funcType] << " ";
ss << "(" << (inPlace ? "IP" : "OP") << "," << (managedMem ? "MM" : "GM") << ") ";
ss << ncclDataTypeNames[dataType] << " ";
if (CollectiveArgs::UsesReduce(funcType)) ss << ncclRedOpNames[redOp] << " ";
if (CollectiveArgs::UsesRoot(funcType)) ss << "Root " << root << " ";
return ss.str();
}
void TestBed::RunSimpleSweep(std::vector<ncclFunc_t> const& funcTypes,
std::vector<ncclDataType_t> const& tmpDataTypes,
std::vector<ncclRedOp_t> const& tmpRedOps,
std::vector<int> const& roots,
std::vector<int> const& numElements,
std::vector<bool> const& inPlaceList,
std::vector<bool> const& managedMemList)
{
// Sort numElements in descending order to cut down on # of allocations
std::vector<int> sortedN = numElements;
std::sort(sortedN.rbegin(), sortedN.rend());
// Filter out any unsupported datatypes, in case only subset has been compiled for
std::vector<ncclDataType_t> const& supportedDataTypes = this->GetAllSupportedDataTypes();
std::vector<ncclDataType_t> dataTypes;
for (auto dt : tmpDataTypes)
{
for (int i = 0; i < supportedDataTypes.size(); ++i)
{
if (supportedDataTypes[i] == dt)
{
dataTypes.push_back(dt);
break;
}
}
}
// Filter out any unsupported reduction ops, in case only subset has been compiled for
std::vector<ncclRedOp_t> const& supportedOps = this->GetAllSupportedRedOps();
std::vector<ncclRedOp_t> redOps;
for (auto redop : tmpRedOps)
{
for (int i = 0; i < supportedOps.size(); ++i)
{
if (supportedOps[i] == redop)
{
redOps.push_back(redop);
break;
}
}
}
bool isCorrect = true;
// Sweep over the number of ranks
for (int totalRanks = ev.minGpus; totalRanks <= ev.maxGpus && isCorrect; ++totalRanks)
for (int isMultiProcess = 0; isMultiProcess <= 1 && isCorrect; ++isMultiProcess)
{
if (!(ev.processMask & (1 << isMultiProcess))) continue;
// Test either single process all GPUs, or 1 process per GPU
int const numProcesses = isMultiProcess ? totalRanks : 1;
this->InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks));
for (int ftIdx = 0; ftIdx < funcTypes.size() && isCorrect; ++ftIdx)
for (int dtIdx = 0; dtIdx < dataTypes.size() && isCorrect; ++dtIdx)
for (int rdIdx = 0; rdIdx < redOps.size() && isCorrect; ++rdIdx)
for (int rtIdx = 0; rtIdx < roots.size() && isCorrect; ++rtIdx)
for (int ipIdx = 0; ipIdx < inPlaceList.size() && isCorrect; ++ipIdx)
for (int mmIdx = 0; mmIdx < managedMemList.size() && isCorrect; ++mmIdx)
{
if (ev.showNames)
{
std::string name = this->GetTestCaseName(totalRanks, isMultiProcess,
funcTypes[ftIdx], dataTypes[dtIdx],
redOps[rdIdx], roots[rtIdx],
inPlaceList[ipIdx], managedMemList[mmIdx]);
INFO("%s\n", name.c_str());
}
for (int neIdx = 0; neIdx < numElements.size() && isCorrect; ++neIdx)
{
int numInputElements, numOutputElements;
CollectiveArgs::GetNumElementsForFuncType(funcTypes[ftIdx],
sortedN[neIdx],
totalRanks,
&numInputElements,
&numOutputElements);
this->SetCollectiveArgs(funcTypes[ftIdx],
dataTypes[dtIdx],
redOps[rdIdx],
roots[rtIdx],
numInputElements,
numOutputElements);
// Only allocate once for largest size
if (neIdx == 0) this->AllocateMem(inPlaceList[ipIdx], managedMemList[mmIdx]);
// There are some cases when data does not need to be re-prepared
// e.g. AllReduce subarray expected results are still valid
bool canSkip = (neIdx != 0 && !inPlaceList[ipIdx] &&
(funcTypes[ftIdx] == ncclCollBroadcast ||
funcTypes[ftIdx] == ncclCollReduce ||
funcTypes[ftIdx] == ncclCollAllReduce));
if (!canSkip) this->PrepareData();
this->ExecuteCollectives();
this->ValidateResults(isCorrect);
if (!isCorrect)
{
std::string name = this->GetTestCaseName(totalRanks, isMultiProcess,
funcTypes[ftIdx], dataTypes[dtIdx],
redOps[rdIdx], roots[rtIdx],
inPlaceList[ipIdx], managedMemList[mmIdx]);
ERROR("Incorrect output for %s\n", name.c_str());
}
}
this->DeallocateMem();
}
this->DestroyComms();
}
}
int& TestBed::NumTestsRun()
{
static int numTestsRun = 0;
return numTestsRun;
}
}
#undef PIPE_WRITE
#undef PIPE_CHECK
+129
Näytä tiedosto
@@ -0,0 +1,129 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#pragma once
#include <map>
#include "CollectiveArgs.hpp"
#include "TestBedChild.hpp"
#include "EnvVars.hpp"
#include <gtest/gtest.h>
namespace RcclUnitTesting
{
// This class facilitates testing RCCL collectives across various process / device configurations
//
class TestBed
{
public:
int numDevicesAvailable; // # of devices detected on node
std::vector<TestBedChild*> childList; // List of child processes
std::vector<int> rankToChildMap; // Tracks which child process each rank is assigned to
std::vector<int> rankToDeviceMap; // Tracks which device each rank is assigned to
int numActiveChildren; // List of active children (with usable RCCL comms)
int numActiveRanks; // Current # of ranks in use
int numCollectivesInGroup; // # of collectives to execute per group call
EnvVars ev; // Environment variables
// Constructor - Creates one child process per detected GPU device that waits for further commands
TestBed();
// Prepare TestBed for use with GPUs across multiple child processes
void InitComms(std::vector<std::vector<int>> const& deviceIdsPerChild, int const numCollectivesInGroup = 1);
// Prepare TestBed for use with GPUs on a single child process
void InitComms(int const numGpus, int const numCollectivesInGroup = 1);
// Set collectives arguments for specified collective / rank
// Setting scalarsPerRank to non-null will create custom reduction operator
// Using collId = -1 (default) applies settings to all collectives in group
// Using rank = -1 (default) applies settings to all ranks
void SetCollectiveArgs(ncclFunc_t const funcType,
ncclDataType_t const dataType,
ncclRedOp_t const redOp,
int const root,
size_t const numInputElements,
size_t const numOutputElements,
int const collId = -1,
int const rank = -1,
PtrUnion const scalarsPerRank = {nullptr},
int const scalarMode = -1);
// Allocate memory for specified collective / rank
// - Requires SetCollectiveArgs to have been called already
// Using collId = -1 (default) applies settings to all collectives in group
// Using rank = -1 (default) applies settings to all ranks
void AllocateMem(bool const inPlace = false,
bool const useManagedMem = false,
int const collId = -1,
int const rank = -1);
// Initialize input and compute expected results
// - requires that SetCollectiveArgs and AllocateMemory have already been called
// Setting collId to -1 applies settings to all collectives in group
// Setting rank to -1 applies settings to all ranks
// Setting prepDataFunc to nullptr uses the default fill pattern routine
void PrepareData(int const collId = -1,
int const rank = -1,
CollFuncPtr const prepDataFunc = nullptr);
// Execute all collectives on all test children
// Blocks until collective is completed
void ExecuteCollectives();
// Perform results validation - compare output to expected
void ValidateResults(bool& isCorrect, int collId = -1, int const rank = -1);
// Release allocated memory
void DeallocateMem(int collId = -1, int const rank = -1);
// Release the RCCL comms
void DestroyComms();
// Explicit TestBed destructor that releases all child processes
// No further calls to TestBed should be performed after this call
void Finalize();
// Destructor - Calls Finalize() to release all child processes
~TestBed();
// Returns all the supported reduction operations based on build settings
std::vector<ncclRedOp_t> const& GetAllSupportedRedOps();
// Return all the supported data types based on build settings
std::vector<ncclDataType_t> const& GetAllSupportedDataTypes();
// Helper function that splits up GPUs to the given number of processes
static std::vector<std::vector<int>> GetDeviceIdsList(int const numProcesses,
int const numGpus);
// Generate a test case name
static std::string GetTestCaseName(int const totalRanks,
bool const isMultiProcess,
ncclFunc_t const funcType,
ncclDataType_t const dataType,
ncclRedOp_t const redOp,
int const root,
bool const inPlace,
bool const managedMem);
// Run a simple sweep
void RunSimpleSweep(std::vector<ncclFunc_t> const& funcTypes,
std::vector<ncclDataType_t> const& dataTypes,
std::vector<ncclRedOp_t> const& redOps,
std::vector<int> const& roots,
std::vector<int> const& numElements,
std::vector<bool> const& inPlaceList,
std::vector<bool> const& managedMemList);
// Used to track total number of calls to ExecuteCollectives()
static int& NumTestsRun();
protected:
// Ends the specified child process
void StopChild(int const childId);
};
}
+589
Näytä tiedosto
@@ -0,0 +1,589 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "TestBedChild.hpp"
#include <thread>
#define CHILD_NCCL_CALL(cmd, msg) \
{ \
if (this->verbose) printf("[ NCCL CALL] " #cmd "\n"); \
ncclResult_t status = cmd; \
if (status != ncclSuccess) \
{ \
ERROR("Child process %d fails NCCL call %s with code %d\n", this->childId, msg, status); \
return TEST_FAIL; \
} \
}
#define PIPE_READ(val) \
if (read(childReadFd, &val, sizeof(val)) != sizeof(val)) return TEST_FAIL;
namespace RcclUnitTesting
{
TestBedChild::TestBedChild(int const childId, bool const verbose, int const printValues)
{
this->childId = childId;
this->verbose = verbose;
this->printValues = printValues;
}
int TestBedChild::InitPipes()
{
// Prepare parent->child pipe
int pipefd[2];
if (pipe(pipefd) == -1)
{
ERROR("Unable to create parent->child pipe for child %d\n", this->childId);
return TEST_FAIL;
}
this->childReadFd = pipefd[0];
this->parentWriteFd = pipefd[1];
// Prepare child->parent pipe
this->parentReadFd = -1;
if (pipe(pipefd) == -1)
{
ERROR("Unable to create parent->child pipe for child %d\n", this->childId);
return TEST_FAIL;
}
this->parentReadFd = pipefd[0];
this->childWriteFd = pipefd[1];
return TEST_SUCCESS;
}
void TestBedChild::StartExecutionLoop()
{
// Close unused ends of pipes
close(this->parentWriteFd);
close(this->parentReadFd);
// Wait for commands from parent process
if (verbose) INFO("Child %d enters execution loop\n", this->childId);
int command;
while (read(childReadFd, &command, sizeof(command)) > 0)
{
if (verbose) INFO("Child %d received command [%s]:\n", this->childId, ChildCommandNames[command]);;
ErrCode status = TEST_SUCCESS;
switch(command)
{
case CHILD_INIT_COMMS : status = InitComms(); break;
case CHILD_SET_COLL_ARGS : status = SetCollectiveArgs(); break;
case CHILD_ALLOCATE_MEM : status = AllocateMem(); break;
case CHILD_PREPARE_DATA : status = PrepareData(); break;
case CHILD_EXECUTE_COLL : status = ExecuteCollectives(); break;
case CHILD_VALIDATE_RESULTS: status = ValidateResults(); break;
case CHILD_DEALLOCATE_MEM : status = DeallocateMem(); break;
case CHILD_DESTROY_COMMS : status = DestroyComms(); break;
case CHILD_STOP : status = Stop(); break;
default: exit(0);
}
// Send back acknowledgement to parent
if (status == TEST_FAIL)
ERROR("Child %d failed on command [%s]:\n", this->childId, ChildCommandNames[command]);
write(childWriteFd, &status, sizeof(status));
}
// Close child ends of pipe
close(this->childReadFd);
close(this->childWriteFd);
exit(0);
}
ErrCode TestBedChild::InitComms()
{
if (this->verbose) INFO("Child %d begins InitComms()\n", this->childId);
// Read values sent by parent [see TestBed::InitComms()]
PIPE_READ(this->totalRanks);
PIPE_READ(this->rankOffset);
PIPE_READ(this->numCollectivesInGroup);
// Read the GPUs this child uses and prepare storage for collective args / datasets
int numGpus;
PIPE_READ(numGpus);
this->deviceIds.resize(numGpus);
this->streams.resize(numGpus);
this->collArgs.resize(numGpus);
for (int i = 0; i < numGpus; i++)
{
PIPE_READ(this->deviceIds[i]);
this->collArgs[i].clear();
this->collArgs[i].resize(numCollectivesInGroup);
}
// Collect uniqueId (specified by NCCL_COMM_ID env var)
ncclUniqueId id;
CHILD_NCCL_CALL(ncclGetUniqueId(&id), "ncclGetUniqueId");
// Initialize communicators
comms.clear();
comms.resize(numGpus);
// Initialize within a group call to avoid deadlock when using multiple ranks per child
ErrCode status = TEST_SUCCESS;
CHILD_NCCL_CALL(ncclGroupStart(), "ncclGroupStart");
for (int localRank = 0; localRank < numGpus; ++localRank)
{
int const globalRank = this->rankOffset + localRank;
int const currGpu = this->deviceIds[localRank];
if (hipSetDevice(currGpu) != hipSuccess)
{
ERROR("Rank %d on child %d unable to switch to GPU %d\n", globalRank, this->childId, currGpu);
status = TEST_FAIL;
break;
}
if (hipStreamCreate(&this->streams[localRank]) != hipSuccess)
{
ERROR("Rank %d on child %d unable to create stream for GPU %d\n", globalRank, this->childId, currGpu);
status = TEST_FAIL;
break;
}
if (ncclCommInitRank(&this->comms[localRank], this->totalRanks, id, globalRank) != ncclSuccess)
{
ERROR("Rank %d on child %d unable to call ncclCommInitRank\n", globalRank, this->childId);
status = TEST_FAIL;
break;
}
}
if (status == TEST_SUCCESS)
{
CHILD_NCCL_CALL(ncclGroupEnd(), "ncclGroupStart");
}
if (this->verbose) INFO("Child %d finishes InitComms() [%s]\n",
this->childId, status == TEST_SUCCESS ? "SUCCESS" : "FAIL");
return status;
}
ErrCode TestBedChild::SetCollectiveArgs()
{
if (this->verbose) INFO("Child %d begins SetCollectiveArgs()\n", this->childId);
// Read values sent by parent [see TestBed::SetCollectiveArgs()]
int globalRank;
int collId;
ncclFunc_t funcType;
ncclDataType_t dataType;
ncclRedOp_t redOp;
int root;
size_t numInputElements;
size_t numOutputElements;
ScalarTransport scalarTransport;
int scalarMode;
PIPE_READ(globalRank);
PIPE_READ(collId);
PIPE_READ(funcType);
PIPE_READ(dataType);
PIPE_READ(redOp);
PIPE_READ(root);
PIPE_READ(numInputElements);
PIPE_READ(numOutputElements);
PIPE_READ(scalarMode);
PIPE_READ(scalarTransport);
for (int i = 0; i < this->totalRanks; i++)
{
PtrUnion scalarsPerRank;
scalarsPerRank.Attach(scalarTransport.ptr);
}
if (globalRank < this->rankOffset || (this->rankOffset + comms.size() <= globalRank))
{
ERROR("Child %d does not contain rank %d\n", this->childId, globalRank);
return TEST_FAIL;
}
int const localRank = globalRank - rankOffset;
CHECK_HIP(hipSetDevice(this->deviceIds[localRank]));
for (int collIdx = 0; collIdx < collArgs[localRank].size(); ++collIdx)
{
if (collId == -1 || collId == collIdx)
{
CollectiveArgs& collArg = this->collArgs[localRank][collIdx];
CHECK_CALL(collArg.SetArgs(globalRank, this->totalRanks,
this->deviceIds[localRank],
funcType, dataType, redOp, root,
numInputElements, numOutputElements,
scalarTransport, scalarMode));
if (this->verbose) INFO("Rank %d on child %d sets collective %d [%s]\n",
globalRank, this->childId, collIdx,
collArg.GetDescription().c_str());
// If pre-mult scalars are provided, then create a custom reduction operator
if (scalarMode >= 0)
{
CHILD_NCCL_CALL(ncclRedOpCreatePreMulSum(&collArg.redOp,
collArg.localScalar.ptr,
dataType,
(ncclScalarResidence_t)scalarMode,
this->comms[localRank]),
"ncclRedOpCreatePreMulSum");
if (verbose) INFO("Child %d created custom redop %d for collective %d\n",
this->childId, collArg.redOp, collIdx);
}
}
}
if (this->verbose) INFO("Child %d finishes SetCollectiveArgs()\n", this->childId);
return TEST_SUCCESS;
}
ErrCode TestBedChild::AllocateMem()
{
if (this->verbose) INFO("Child %d begins AllocateMem()\n", this->childId);
// Read values sent by parent [see TestBed::AllocateMem()]
int globalRank;
int collId;
bool inPlace;
bool useManagedMem;
PIPE_READ(globalRank);
PIPE_READ(collId);
PIPE_READ(inPlace);
PIPE_READ(useManagedMem);
if (globalRank < this->rankOffset || (this->rankOffset + comms.size() <= globalRank))
{
ERROR("Child %d does not contain rank %d\n", this->childId, globalRank);
return TEST_FAIL;
}
int const localRank = globalRank - rankOffset;
CHECK_HIP(hipSetDevice(this->deviceIds[localRank]));
for (int collIdx = 0; collIdx < collArgs[localRank].size(); ++collIdx)
{
if (collId == -1 || collId == collIdx)
{
CollectiveArgs& collArg = this->collArgs[localRank][collIdx];
CHECK_CALL(collArg.AllocateMem(inPlace, useManagedMem));
if (this->verbose) INFO("Rank %d on child %d allocates memory for collective %d on device %d (%s,%s) Input: %p Output %p\n",
globalRank, this->childId, collIdx, this->deviceIds[localRank],
inPlace ? "in-place" : "out-of-place",
useManagedMem ? "managed" : "unmanaged",
collArg.inputGpu.ptr,
collArg.outputGpu.ptr);
}
}
if (this->verbose) INFO("Child %d finishes AllocateMem()\n", this->childId);
return TEST_SUCCESS;
}
// Fill input memory with pre-known patterned based on rank
ErrCode TestBedChild::PrepareData()
{
if (this->verbose) INFO("Child %d begins PrepareData()\n", this->childId);
// Read values sent by parent [see TestBed::PrepareData()]
int globalRank;
int collId;
CollFuncPtr prepDataFunc;
PIPE_READ(globalRank);
PIPE_READ(collId);
PIPE_READ(prepDataFunc);
if (globalRank < this->rankOffset || (this->rankOffset + comms.size() <= globalRank))
{
ERROR("Child %d does not contain rank %d\n", this->childId, globalRank);
return TEST_FAIL;
}
int const localRank = globalRank - rankOffset;
CHECK_HIP(hipSetDevice(this->deviceIds[localRank]));
for (int collIdx = 0; collIdx < collArgs[localRank].size(); ++collIdx)
{
if (collId == -1 || collId == collIdx)
{
if (this->verbose) INFO("Rank %d on child %d prepares data for collective %d\n",
globalRank, this->childId, collIdx);
CHECK_CALL(this->collArgs[localRank][collIdx].PrepareData(prepDataFunc));
}
}
if (this->verbose) INFO("Child %d finishes PrepareData()\n", this->childId);
return TEST_SUCCESS;
}
ErrCode TestBedChild::ExecuteCollectives()
{
if (this->verbose) INFO("Child %d begins ExecuteCollectives()\n", this->childId);
// Start group call
CHILD_NCCL_CALL(ncclGroupStart(), "ncclGroupStart");
// Loop over all collectives to be executed in group call
for (int collId = 0; collId < this->numCollectivesInGroup; ++collId)
{
// Loop over all local ranks
for (int localRank = 0; localRank < this->deviceIds.size(); ++localRank)
{
CHECK_HIP(hipSetDevice(this->deviceIds[localRank]));
CollectiveArgs const& collArg = this->collArgs[localRank][collId];
if (this->printValues)
{
int const numInputElementsToPrint = (this->printValues < 0 ? collArg.numInputElements : this->printValues);
PtrUnion inputCpu;
size_t const numInputBytes = numInputElementsToPrint * DataTypeToBytes(collArg.dataType);
inputCpu.AllocateCpuMem(numInputBytes);
CHECK_HIP(hipMemcpy(inputCpu.ptr, collArg.inputGpu.ptr, numInputBytes, hipMemcpyDeviceToHost));
printf("[ DEBUG ] Rank %02d Coll %d %-10s: %s\n", collArg.globalRank, collId, "Input",
inputCpu.ToString(collArg.dataType, numInputElementsToPrint).c_str());
inputCpu.FreeCpuMem();
int const numOutputElementsToPrint = (this->printValues < 0 ? collArg.numOutputElements : this->printValues);
size_t const numOutputBytes = numOutputElementsToPrint * DataTypeToBytes(collArg.dataType);
CHECK_HIP(hipMemcpy(collArg.outputCpu.ptr, collArg.outputGpu.ptr, numOutputBytes, hipMemcpyDeviceToHost));
printf("[ DEBUG ] Rank %02d Coll %d %-10s: %s\n", collArg.globalRank, collId, "Pre-Output",
collArg.outputCpu.ToString(collArg.dataType, numOutputElementsToPrint).c_str());
}
switch (collArg.funcType)
{
case ncclCollBroadcast:
CHILD_NCCL_CALL(ncclBroadcast(collArg.inputGpu.ptr,
collArg.outputGpu.ptr,
collArg.numInputElements,
collArg.dataType,
collArg.root,
this->comms[localRank],
this->streams[localRank]),
"ncclBroadcast");
break;
case ncclCollReduce:
CHILD_NCCL_CALL(ncclReduce(collArg.inputGpu.ptr,
collArg.outputGpu.ptr,
collArg.numInputElements,
collArg.dataType,
collArg.redOp,
collArg.root,
this->comms[localRank],
this->streams[localRank]),
"ncclReduce");
break;
case ncclCollAllGather:
CHILD_NCCL_CALL(ncclAllGather(collArg.inputGpu.ptr,
collArg.outputGpu.ptr,
collArg.numInputElements,
collArg.dataType,
this->comms[localRank],
this->streams[localRank]),
"ncclAllGather");
break;
case ncclCollReduceScatter:
CHILD_NCCL_CALL(ncclReduceScatter(collArg.inputGpu.ptr,
collArg.outputGpu.ptr,
collArg.numOutputElements,
collArg.dataType,
collArg.redOp,
this->comms[localRank],
this->streams[localRank]),
"ncclReduceScatter");
break;
case ncclCollAllReduce:
CHILD_NCCL_CALL(ncclAllReduce(collArg.inputGpu.ptr,
collArg.outputGpu.ptr,
collArg.numInputElements,
collArg.dataType,
collArg.redOp,
this->comms[localRank],
this->streams[localRank]),
"ncclAllReduce");
break;
case ncclCollGather:
CHILD_NCCL_CALL(ncclGather(collArg.inputGpu.ptr,
collArg.outputGpu.ptr,
collArg.numInputElements,
collArg.dataType,
collArg.root,
this->comms[localRank],
this->streams[localRank]),
"ncclGather");
break;
case ncclCollScatter:
CHILD_NCCL_CALL(ncclScatter(collArg.inputGpu.ptr,
collArg.outputGpu.ptr,
collArg.numOutputElements,
collArg.dataType,
collArg.root,
this->comms[localRank],
this->streams[localRank]),
"ncclScatter");
break;
case ncclCollAllToAll:
CHILD_NCCL_CALL(ncclAllToAll(collArg.inputGpu.ptr,
collArg.outputGpu.ptr,
collArg.numInputElements / collArg.totalRanks,
collArg.dataType,
this->comms[localRank],
this->streams[localRank]),
"ncclAllToAll");
break;
case ncclCollSend:
CHILD_NCCL_CALL(ncclSend(collArg.inputGpu.ptr,
collArg.numInputElements,
collArg.dataType,
collArg.root,
this->comms[localRank],
this->streams[localRank]),
"ncclSend");
break;
case ncclCollRecv:
CHILD_NCCL_CALL(ncclRecv(collArg.outputGpu.ptr,
collArg.numOutputElements,
collArg.dataType,
collArg.root,
this->comms[localRank],
this->streams[localRank]),
"ncclRecv");
break;
default:
ERROR("Unknown func type %d\n", collArg.funcType);
return TEST_FAIL;
}
}
}
// End group call
CHILD_NCCL_CALL(ncclGroupEnd(), "ncclGroupEnd");
// Synchronize
if (this->verbose) INFO("Child %d submits group call. Waiting for completion\n", this->childId);
for (int localRank = 0; localRank < this->streams.size(); ++localRank)
{
CHECK_HIP(hipStreamSynchronize(this->streams[localRank]));
}
if (this->printValues)
{
for (int collId = 0; collId < this->numCollectivesInGroup; ++collId)
for (int localRank = 0; localRank < this->deviceIds.size(); ++localRank)
{
CollectiveArgs const& collArg = this->collArgs[localRank][collId];
int numOutputElementsToPrint = (this->printValues < 0 ? collArg.numOutputElements : this->printValues);
size_t const numOutputBytes = numOutputElementsToPrint * DataTypeToBytes(collArg.dataType);
CHECK_HIP(hipMemcpy(collArg.outputCpu.ptr, collArg.outputGpu.ptr, numOutputBytes, hipMemcpyDeviceToHost));
printf("[ DEBUG ] Rank %02d Coll %d %-10s: %s\n", collArg.globalRank, collId, "Output",
collArg.outputCpu.ToString(collArg.dataType, numOutputElementsToPrint).c_str());
printf("[ DEBUG ] Rank %02d Coll %d %-10s: %s\n", collArg.globalRank, collId, "Expected",
collArg.expected.ToString(collArg.dataType, numOutputElementsToPrint).c_str());
}
}
if (this->verbose) INFO("Child %d finishes ExecuteCollectives()\n", this->childId);
return TEST_SUCCESS;
}
ErrCode TestBedChild::ValidateResults()
{
// Read values sent by parent [see TestBed::ValidateResults()]
int globalRank, collId;
PIPE_READ(globalRank);
PIPE_READ(collId);
if (this->verbose) INFO("Child %d begins ValidateResults()\n", this->childId);
if (globalRank < this->rankOffset || (this->rankOffset + comms.size() <= globalRank))
{
ERROR("Child %d does not contain rank %d\n", this->childId, globalRank);
return TEST_FAIL;
}
int const localRank = globalRank - rankOffset;
CHECK_HIP(hipSetDevice(this->deviceIds[localRank]));
ErrCode status = TEST_SUCCESS;
for (int collIdx = 0; collIdx < collArgs[localRank].size(); ++collIdx)
{
if (collId == -1 || collId == collIdx)
{
if (this->verbose) INFO("Rank %d on child %d validating collective %d results\n",
globalRank, this->childId, collIdx);
if (this->collArgs[localRank][collIdx].ValidateResults() != TEST_SUCCESS)
{
ERROR("Rank %d Collective %d output does not match expected\n", globalRank, collIdx);
status = TEST_FAIL;
}
}
}
if (this->verbose) INFO("Child %d finishes ValidateResults() with status %s\n", this->childId,
status == TEST_SUCCESS ? "SUCCESS" : "FAIL");
return status;
}
ErrCode TestBedChild::DeallocateMem()
{
if (this->verbose) INFO("Child %d begins DeallocateMem\n", this->childId);
// Read values sent by parent [see TestBed::DeallocateMem()]
int globalRank, collId;
PIPE_READ(globalRank);
PIPE_READ(collId);
if (globalRank < this->rankOffset || (this->rankOffset + comms.size() <= globalRank))
{
ERROR("Child %d does not contain rank %d\n", this->childId, globalRank);
return TEST_FAIL;
}
int const localRank = globalRank - rankOffset;
CHECK_HIP(hipSetDevice(this->deviceIds[localRank]));
for (int collIdx = 0; collIdx < collArgs[localRank].size(); ++collIdx)
{
CollectiveArgs& collArg = this->collArgs[localRank][collIdx];
if (collId == -1 || collId == collIdx)
{
if (this->verbose)
{
INFO("Child %d release memory for collective %d (Input: %p Output %p\n",
this->childId, collIdx, collArg.inputGpu.ptr, collArg.outputGpu.ptr);
}
CHECK_CALL(collArg.DeallocateMem());
}
if (collArg.scalarMode != -1)
{
CHILD_NCCL_CALL(ncclRedOpDestroy(collArg.redOp, this->comms[localRank]),
"ncclRedOpDestroy");
if (verbose) INFO("Child %d destroys custom redop %d for collective %d\n",
this->childId, collArg.redOp, collIdx);
}
}
if (this->verbose) INFO("Child %d finishes DeallocateMem\n", this->childId);
return TEST_SUCCESS;
}
ErrCode TestBedChild::DestroyComms()
{
if (this->verbose) INFO("Child %d begins DestroyComms\n", this->childId);
// Release comms
for (int i = 0; i < this->comms.size(); ++i)
{
CHILD_NCCL_CALL(ncclCommDestroy(this->comms[i]), "ncclCommDestroy");
}
for (int i = 0; i < this->streams.size(); ++i)
{
CHECK_HIP(hipStreamDestroy(this->streams[i]));
}
this->comms.clear();
this->streams.clear();
if (this->verbose) INFO("Child %d finishes DestroyComms\n", this->childId);
return TEST_SUCCESS;
}
ErrCode TestBedChild::Stop()
{
return TEST_SUCCESS;
}
}
+106
Näytä tiedosto
@@ -0,0 +1,106 @@
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#pragma once
#include <vector>
#include <unistd.h>
#include "CollectiveArgs.hpp"
#include "rccl.h"
#define MAX_RANKS 32
namespace RcclUnitTesting
{
class TestBedChild
{
public:
// These are commands that can be given to the child process
enum
{
CHILD_INIT_COMMS = 0, // InitComms()
CHILD_SET_COLL_ARGS = 1, // SetCollectiveArgs()
CHILD_ALLOCATE_MEM = 2, // AllocateMem()
CHILD_PREPARE_DATA = 3, // PrepareData()
CHILD_EXECUTE_COLL = 4, // ExecuteCollectives()
CHILD_VALIDATE_RESULTS = 5, // ValidateResults()
CHILD_DEALLOCATE_MEM = 6, // DeallocateMem()
CHILD_DESTROY_COMMS = 7, // DestroyComms()
CHILD_STOP = 8, // Stop()
NUM_CHILD_COMMANDS = 9
};
char const ChildCommandNames[NUM_CHILD_COMMANDS][20] =
{
"INIT_COMMS",
"SET_COLL_ARGS",
"ALLOCATE_MEM",
"PREPARE_DATA",
"EXECUTE_COLL",
"VALIDATE_RESULTS",
"DEALLOCATE_MEM",
"DESTROY_COMMS",
"STOP"
};
// These variables remain constant for life of TestBedChild
int childId;
pid_t pid;
bool verbose;
int printValues;
// Pipes used to communicate between parent process
int parentWriteFd;
int parentReadFd;
int childWriteFd;
int childReadFd;
// These varibles may change based on commands issued by parent
int totalRanks; // Total ranks
int rankOffset; // Global rank offset for this child
int numCollectivesInGroup; // # of collectives to run per group call
std::vector<ncclComm_t> comms; // RCCL communicators for each rank
std::vector<int> deviceIds; // Device IDs for each rank
std::vector<hipStream_t> streams; // Streams for executing collectives
std::vector<std::vector<CollectiveArgs>> collArgs; // Info for each collective for each rank
// Constructor
TestBedChild(int const childId, bool const verbose, int const printValues);
// Prepare parent/child communication pipes - to be executed by parent process
int InitPipes();
// Execution
void StartExecutionLoop();
protected:
// Initialize RCCL communicators
ErrCode InitComms();
// Set CollectiveArgs
ErrCode SetCollectiveArgs();
// Allocate memory (input (GPU) / output (GPU) / expected (CPU))
ErrCode AllocateMem();
// Prepare input and expected data
ErrCode PrepareData();
// Execute a group of collectives
ErrCode ExecuteCollectives();
// Validate that output matches expected
ErrCode ValidateResults();
// Release allocated memory
ErrCode DeallocateMem();
// Destroys RCCL communicators
ErrCode DestroyComms();
// Stops this child process
ErrCode Stop();
};
}
+11
Näytä tiedosto
@@ -0,0 +1,11 @@
#include <gtest/gtest.h>
#include "EnvVars.hpp"
#include "TestBed.hpp"
int main(int argc, char **argv)
{
::testing::InitGoogleTest(&argc, argv);
RcclUnitTesting::EnvVars::ShowConfig();
int retCode = RUN_ALL_TESTS();
printf("[ INFO ] Total executed cases: %d\n", RcclUnitTesting::TestBed::NumTestsRun());
return retCode;
}
-117
Näytä tiedosto
@@ -1,117 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "test_AllGather.hpp"
namespace CorrectnessTests
{
TEST_P(AllGatherCorrectnessTest, Correctness)
{
// Adjust numElements to be multiple of numDevices
numElements = (numElements/numDevices)*numDevices;
if (numDevices > numDevicesAvailable) return;
if (numElements % numDevices != 0) return;
// Prepare input / output / expected results
Dataset dataset;
dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllGather);
FillDatasetWithPattern(dataset);
ComputeExpectedResults(dataset);
size_t const byteCount = dataset.NumBytes() / dataset.numDevices;
size_t const sendCount = dataset.numElements / dataset.numDevices;
// Launch the reduction (1 thread per GPU)
ncclGroupStart();
for (int i = 0; i < numDevices; i++)
{
ncclAllGather((int8_t *)dataset.inputs[i] + (i * byteCount),
dataset.outputs[i], sendCount,
dataType, comms[i], streams[i]);
}
ncclGroupEnd();
// Wait for reduction to complete
Synchronize();
// Check results
ValidateResults(dataset);
dataset.Release();
}
TEST_P(AllGatherCorrectnessTest, Alignment)
{
if (numDevices > numDevicesAvailable) return;
if (numElements % numDevices != 0) return;
// Allocate dataset
Dataset dataset;
dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllGather);
// Loop over several offsets (so that device pointers are not aligned)
for (int firstElement = 1; firstElement <= 11; firstElement += 2)
{
if (firstElement < numElements)
{
// Select last element so that total number of elements is multiple of numDevices
int const lastElement = firstElement + ((numElements - firstElement) / numDevices) * numDevices - 1;
if (lastElement >= numElements) break;
Dataset subDataset;
dataset.ExtractSubDataset(firstElement, lastElement, subDataset);
// Compute reference results for sub-dataset
FillDatasetWithPattern(subDataset);
ComputeExpectedResults(subDataset);
size_t const byteCount = subDataset.NumBytes() / subDataset.numDevices;
size_t const sendCount = subDataset.numElements / subDataset.numDevices;
// Launch the reduction (1 thread per GPU)
ncclGroupStart();
for (int i = 0; i < numDevices; i++)
{
ncclAllGather((int8_t *)subDataset.inputs[i] + (i * byteCount),
subDataset.outputs[i], sendCount,
dataType, comms[i], streams[i]);
}
ncclGroupEnd();
// Wait for reduction to complete
Synchronize();
// Check results
ValidateResults(subDataset);
}
}
dataset.Release();
}
INSTANTIATE_TEST_SUITE_P(AllGatherCorrectnessSweep,
AllGatherCorrectnessTest,
testing::Combine(
// Reduction operator (not used)
testing::Values(ncclSum),
// Data types
testing::Values(ncclInt8,
ncclUint8,
ncclInt32,
ncclUint32,
ncclInt64,
ncclUint64,
//ncclFloat16,
ncclFloat32,
ncclFloat64,
ncclBfloat16),
// Number of elements
testing::Values(2520, 3026520),
// Number of devices
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false, true),
testing::Values("")),
CorrectnessTest::PrintToStringParamName());
} // namespace
-34
Näytä tiedosto
@@ -1,34 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef TEST_ALLGATHER_HPP
#define TEST_ALLGATHER_HPP
#include "CorrectnessTest.hpp"
namespace CorrectnessTests
{
class AllGatherCorrectnessTest : public CorrectnessTest
{
public:
static void ComputeExpectedResults(Dataset& dataset)
{
size_t const byteCount = dataset.NumBytes() / dataset.numDevices;
int8_t* result = (int8_t *)malloc(dataset.NumBytes());
for (int i = 0; i < dataset.numDevices; i++)
HIP_CALL(hipMemcpy(result + i * byteCount, (int8_t *)dataset.inputs[i] + (i * byteCount),
byteCount, hipMemcpyDeviceToHost));
for (int i = 0; i < dataset.numDevices; i++)
memcpy(dataset.expected[i], result, dataset.NumBytes());
free(result);
}
};
}
#endif
@@ -1,60 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "test_AllGatherMultiProcess.hpp"
namespace CorrectnessTests
{
TEST_P(AllGatherMultiProcessCorrectnessTest, Correctness)
{
dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollAllGather);
std::vector<int> pids(numDevices);
int gpu = -1;
for (int i = 0; i < numDevices; i++)
{
gpu++;
int pid = fork();
if (pid == 0)
{
bool pass;
TestAllGather(gpu, *dataset, pass);
TerminateChildProcess(pass);
}
else
{
pids[gpu] = pid;
}
}
ValidateProcesses(pids);
dataset->ReleaseRootProcess();
}
INSTANTIATE_TEST_SUITE_P(AllGatherMultiProcessCorrectnessSweep,
AllGatherMultiProcessCorrectnessTest,
testing::Combine(
// Reduction operator (not used)
testing::Values(ncclSum),
// Data types
testing::Values(ncclInt8,
ncclUint8,
ncclInt32,
ncclUint32,
ncclInt64,
ncclUint64,
//ncclFloat16,
ncclFloat32,
ncclFloat64,
ncclBfloat16),
// Number of elements
testing::Values(3072, 3145728),
// Number of devices
testing::Values(2,3,4,8),
// In-place or not
testing::Values(false, true),
testing::Values("")),
CorrectnessTest::PrintToStringParamName());
} // namespace
@@ -1,81 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef TEST_ALLGATHER_MULTI_PROCESS_HPP
#define TEST_ALLGATHER_MULTI_PROCESS_HPP
#include "CorrectnessTest.hpp"
namespace CorrectnessTests
{
class AllGatherMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
{
public:
static void ComputeExpectedResults(Dataset& dataset, Barrier& barrier, int const numDevices, std::vector<int> const& ranks)
{
size_t const byteCount = dataset.NumBytes() / dataset.numDevices;
for (int i = 0; i < ranks.size(); i++)
{
int rank = ranks[i];
HIP_CALL(hipMemcpy(static_cast<char*>(dataset.expected[0]) + rank * byteCount, (int8_t *)dataset.inputs[rank] + (rank * byteCount),
byteCount, hipMemcpyDeviceToHost));
}
barrier.Wait();
// Rank 0 sends answer to other ranks
for (int i = 0; i < ranks.size(); i++)
{
int rank = ranks[i];
if (rank == 0)
{
for (int i = 0; i < dataset.numDevices; i++)
{
if (i == rank) continue;
memcpy(dataset.expected[i], dataset.expected[0], dataset.NumBytes());
}
}
}
}
void TestAllGather(int rank, Dataset& dataset, bool& pass)
{
// Prepare input / output / expected results
SetUpPerProcess(rank, ncclCollAllGather, comms[rank], streams[rank], dataset);
if (numDevices > numDevicesAvailable || numElements % numDevices != 0)
{
pass = true;
return;
}
Barrier barrier(rank, numDevices, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID"))));
// Prepare input / output / expected results
FillDatasetWithPattern(dataset, rank);
ComputeExpectedResults(dataset, barrier, numDevices, std::vector<int>(1, rank));
size_t const byteCount = dataset.NumBytes() / numDevices;
size_t const sendCount = dataset.numElements / numDevices;
// Launch the reduction (1 process per GPU)
ncclAllGather((int8_t *)dataset.inputs[rank] + (rank * byteCount),
dataset.outputs[rank], sendCount,
dataType, comms[rank], streams[rank]);
// Wait for reduction to complete
HIP_CALL(hipStreamSynchronize(streams[rank]));
// Check results
pass = ValidateResults(dataset, rank);
TearDownPerProcess(comms[rank], streams[rank]);
dataset.Release(rank);
}
};
}
#endif
-80
Näytä tiedosto
@@ -1,80 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "test_AllReduce.hpp"
namespace CorrectnessTests
{
TEST_P(AllReduceCorrectnessTest, Correctness)
{
if (numDevices > numDevicesAvailable) return;
// Prepare input / output / expected results
Dataset dataset;
dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllReduce);
FillDatasetWithPattern(dataset);
ComputeExpectedResults(dataset, op);
// Launch the reduction (1 thread per GPU)
ncclGroupStart();
for (int i = 0; i < numDevices; i++)
{
ncclAllReduce(dataset.inputs[i], dataset.outputs[i],
numElements, dataType, op, comms[i], streams[i]);
}
ncclGroupEnd();
// Wait for reduction to complete
Synchronize();
// Check results
ValidateResults(dataset);
dataset.Release();
}
#if defined(BUILD_ALLREDUCE_ONLY)
INSTANTIATE_TEST_SUITE_P(AllReduceCorrectnessSweep,
AllReduceCorrectnessTest,
testing::Combine(
// Reduction operator
testing::Values(ncclSum),
// Data types
testing::Values(ncclFloat32),
// Number of elements
testing::Values(1024, 1048576),
// Number of devices
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false, true),
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
CorrectnessTest::PrintToStringParamName());
#else
INSTANTIATE_TEST_SUITE_P(AllReduceCorrectnessSweep,
AllReduceCorrectnessTest,
testing::Combine(
// Reduction operator
testing::Values(ncclSum, ncclProd, ncclMax, ncclMin, ncclAvg),
// Data types
testing::Values(ncclInt8,
ncclUint8,
ncclInt32,
ncclUint32,
ncclInt64,
ncclUint64,
//ncclFloat16,
ncclFloat32,
ncclFloat64,
ncclBfloat16),
// Number of elements
testing::Values(1024, 1048576),
// Number of devices
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false, true),
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
CorrectnessTest::PrintToStringParamName());
#endif
} // namespace
-83
Näytä tiedosto
@@ -1,83 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef TEST_ALLREDUCE_HPP
#define TEST_ALLREDUCE_HPP
#include "CorrectnessTest.hpp"
namespace CorrectnessTests
{
class AllReduceCorrectnessTest : public CorrectnessTest
{
public:
static void ComputeExpectedResults(Dataset& dataset, ncclRedOp_t const op)
{
// Copy all inputs to expected arrays temporarily to perform reduction on host
for (int i = 0; i < dataset.numDevices; i++)
HIP_CALL(hipMemcpy(dataset.expected[i], dataset.inputs[i],
dataset.NumBytes(), hipMemcpyDeviceToHost));
// Allocate temporary host array to accumulate results
int8_t* resultI1 = (int8_t *)malloc(dataset.NumBytes());
uint8_t* resultU1 = (uint8_t *)resultI1;
int32_t* resultI4 = (int32_t *)resultI1;
uint32_t* resultU4 = (uint32_t *)resultI1;
int64_t* resultI8 = (int64_t *)resultI1;
uint64_t* resultU8 = (uint64_t *)resultI1;
float* resultF4 = (float *)resultI1;
double* resultF8 = (double *)resultI1;
rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1;
// Initialize the result with the first device's array
memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
ncclRedOp_t red_op = ((op == ncclAvg) ? ncclSum : op);
// Perform reduction on the other device arrays
for (int i = 1; i < dataset.numDevices; i++)
{
int8_t* arrayI1 = (int8_t *)dataset.expected[i];
uint8_t* arrayU1 = (uint8_t *)arrayI1;
int32_t* arrayI4 = (int32_t *)arrayI1;
uint32_t* arrayU4 = (uint32_t *)arrayI1;
int64_t* arrayI8 = (int64_t *)arrayI1;
uint64_t* arrayU8 = (uint64_t *)arrayI1;
float* arrayF4 = (float *)arrayI1;
double* arrayF8 = (double *)arrayI1;
rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1;
for (int j = 0; j < dataset.numElements; j++)
{
switch (dataset.dataType)
{
case ncclInt8: resultI1[j] = ReduceOp(red_op, resultI1[j], arrayI1[j]); break;
case ncclUint8: resultU1[j] = ReduceOp(red_op, resultU1[j], arrayU1[j]); break;
case ncclInt32: resultI4[j] = ReduceOp(red_op, resultI4[j], arrayI4[j]); break;
case ncclUint32: resultU4[j] = ReduceOp(red_op, resultU4[j], arrayU4[j]); break;
case ncclInt64: resultI8[j] = ReduceOp(red_op, resultI8[j], arrayI8[j]); break;
case ncclUint64: resultU8[j] = ReduceOp(red_op, resultU8[j], arrayU8[j]); break;
case ncclFloat32: resultF4[j] = ReduceOp(red_op, resultF4[j], arrayF4[j]); break;
case ncclFloat64: resultF8[j] = ReduceOp(red_op, resultF8[j], arrayF8[j]); break;
case ncclBfloat16: resultB2[j] = ReduceOp(red_op, resultB2[j], arrayB2[j]); break;
default:
fprintf(stderr, "[ERROR] Unsupported datatype\n");
exit(0);
}
}
}
if (op == ncclAvg)
Average(dataset, resultI1);
// Copy results into expected arrays
for (int i = 0; i < dataset.numDevices; i++)
memcpy(dataset.expected[i], resultI1, dataset.NumBytes());
free(resultI1);
}
};
}
#endif
-138
Näytä tiedosto
@@ -1,138 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "test_AllReduceAbort.hpp"
#include "../include/comm.h"
#define NUM_ITER 8
#define FAKE_OP_COUNT NUM_ITER+1
namespace CorrectnessTests
{
#define HIPCHECK(cmd) \
do { \
hipError_t error = (cmd); \
if (error != hipSuccess) { \
std::cerr << "Encountered HIP error (" << error << ") at line " \
<< __LINE__ << " in file " << __FILE__ << "\n"; \
exit(-1); \
} \
} while (0)
#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
TEST_P(AllReduceAbortTest, Correctness) {
if (numDevices > numDevicesAvailable) return;
// Prepare input / output / expected results
Dataset dataset;
dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllReduce);
FillDatasetWithPattern(dataset);
int gpu = 0; // GPU number to trigger abort
ncclComm_t comm = comms[gpu];
HIPCHECK(hipSetDevice(gpu));
hipStream_t stream;
HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
struct ncclChannel* channel = comm->channels;
uint64_t **p_dev_head = (uint64_t **)((uint8_t*)(channel->devPeers + channel->ring.next) + offsetof(struct ncclPeer, send[0].conn.head));
uint64_t *real_head, *fake_head, *fake_h;
// get original head
HIPCHECK(hipMemcpy(&real_head, p_dev_head, sizeof(uint64_t*), hipMemcpyDefault));
// allocate and install fakes
HIPCHECK(hipHostMalloc(&fake_head, sizeof(uint64_t*), hipHostMallocMapped));
HIPCHECK(hipMemcpy(p_dev_head, &fake_head, sizeof(uint64_t*), hipMemcpyDefault));
*fake_head = 0;
// read back fakes to confirm
HIPCHECK(hipMemcpy(&fake_h, p_dev_head, sizeof(uint64_t*), hipMemcpyDefault));
//std::cerr << "[ ] replaced gpu " << gpu << " real_opCount = " << real_opCount << " to fake_opCount = " << fake_o << std::endl;
//std::cerr << "[ ] replaced gpu " << gpu << " real_head = " << real_head << " to fake_head = " << fake_h << std::endl;
// Perform a number of iterations and introduce abort
for (int j = 0; j < NUM_ITER; j++) {
//std::cerr << "[ ] iter = " << j << std::endl;
// Start a group call
ncclGroupStart();
for (int i = 0; i < numDevices; i++) {
ncclAllReduce(dataset.inputs[i], dataset.outputs[i],
numElements, dataType, op, comms[i], streams[i]);
}
// Signal end of group call
ncclGroupEnd();
}
// Wait for reduction to complete
auto start = std::chrono::high_resolution_clock::now();
hipError_t hipErr;
int remaining = numDevices;
int* done = (int*)malloc(sizeof(int)*numDevices);
memset(done, 0, sizeof(int)*numDevices);
bool timeout = false, abort_called = false;
while (remaining) {
int idle = 1;
for (int i=0; i<numDevices; i++) {
if (done[i]) continue;
hipErr = hipStreamQuery(streams[i]);
if (hipErr == hipSuccess) {
done[i] = 1;
remaining--;
idle = 0;
continue;
}
#if NCCL_MAJOR >= 2
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0)
auto delta = std::chrono::high_resolution_clock::now() - start;
double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();
if (deltaSec > 10.0 && !timeout) {
std::cerr << "[ ] timeout condition, calling ncclCommAbort ... " << std::endl;
timeout = true;
}
ncclResult_t ncclAsyncErr;
ncclCommGetAsyncError(comms[i], &ncclAsyncErr);
if ((ncclAsyncErr != ncclSuccess || timeout) && !abort_called) {
// An asynchronous error happened. Stop the operation and destroy
// the communicator
std::cerr << "[ ] ncclAsyncErr = " << ncclAsyncErr << std::endl;
for (int i=0; i<numDevices; i++)
ncclCommAbort(comms[i]);
// Abort the perf test
abort_called = true;
break;
}
#endif
#endif
}
// We might want to let other threads (including NCCL threads) use the CPU.
if (idle) pthread_yield();
}
free(done);
HIPCHECK(hipHostFree(fake_head));
HIPCHECK(hipStreamDestroy(stream));
dataset.Release();
}
INSTANTIATE_TEST_SUITE_P(AllReduceAbortSweep,
AllReduceAbortTest,
testing::Combine(
// Reduction operator
testing::Values(ncclSum),
// Data types
testing::Values(ncclFloat32),
// Number of elements
testing::Values(1024, 1048576),
// Number of devices
testing::Values(2, 4),
// In-place or not
testing::Values(false),
testing::Values("")),
CorrectnessTest::PrintToStringParamName());
} // namespace
-20
Näytä tiedosto
@@ -1,20 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef TEST_ALLREDUCE_HPP
#define TEST_ALLREDUCE_HPP
#include "CorrectnessTest.hpp"
namespace CorrectnessTests
{
class AllReduceAbortTest : public CorrectnessTest
{
protected:
public:
};
}
#endif
-83
Näytä tiedosto
@@ -1,83 +0,0 @@
/*************************************************************************
* Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "test_AllReduceGroup.hpp"
namespace CorrectnessTests
{
// This tests aggregated AllReduce calls within a group
TEST_P(AllReduceGroupCorrectnessTest, Correctness)
{
if (numDevices > numDevicesAvailable) return;
// Prepare input / output / expected results
Dataset dataset1, dataset2, dataset3;
dataset1.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllReduce);
dataset2.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllReduce);
dataset3.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllReduce);
FillDatasetWithPattern(dataset1);
FillDatasetWithPattern(dataset2);
FillDatasetWithPattern(dataset3);
ComputeExpectedResults(dataset1, op);
ComputeExpectedResults(dataset2, op);
ComputeExpectedResults(dataset3, op);
// Launch the reduction (1 thread per GPU)
ncclGroupStart();
for (int i = 0; i < numDevices; i++)
{
ncclAllReduce(dataset1.inputs[i], dataset1.outputs[i], numElements, dataType, op, comms[i], streams[i]);
ncclAllReduce(dataset2.inputs[i], dataset2.outputs[i], numElements, dataType, op, comms[i], streams[i]);
ncclAllReduce(dataset3.inputs[i], dataset3.outputs[i], numElements, dataType, op, comms[i], streams[i]);
}
ncclGroupEnd();
// Wait for reduction to complete
Synchronize();
// Check results
ValidateResults(dataset1);
ValidateResults(dataset2);
ValidateResults(dataset3);
dataset1.Release();
dataset2.Release();
dataset3.Release();
}
#if defined(BUILD_ALLREDUCE_ONLY)
INSTANTIATE_TEST_SUITE_P(AllReduceGroupCorrectnessSweep,
AllReduceGroupCorrectnessTest,
testing::Combine(
// Reduction operator
testing::Values(ncclSum),
// Data types
testing::Values(ncclFloat32),
// Number of elements
testing::Values(1024, 1048576),
// Number of devices
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false, true),
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
CorrectnessTest::PrintToStringParamName());
#else
INSTANTIATE_TEST_SUITE_P(AllReduceGroupCorrectnessSweep,
AllReduceGroupCorrectnessTest,
testing::Combine(
// Reduction operator
testing::Values(ncclSum),
// Data types
testing::Values(ncclFloat32, ncclFloat64),
// Number of elements
testing::Values(1024, 1048576),
// Number of devices
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false, true),
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
CorrectnessTest::PrintToStringParamName());
#endif
} // namespace
-79
Näytä tiedosto
@@ -1,79 +0,0 @@
/*************************************************************************
* Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef TEST_ALLREDUCEGROUP_HPP
#define TEST_ALLREDUCEGROUP_HPP
#include "CorrectnessTest.hpp"
namespace CorrectnessTests
{
class AllReduceGroupCorrectnessTest : public CorrectnessTest
{
public:
static void ComputeExpectedResults(Dataset& dataset, ncclRedOp_t const op)
{
// Copy all inputs to expected arrays temporarily to perform reduction on host
for (int i = 0; i < dataset.numDevices; i++)
HIP_CALL(hipMemcpy(dataset.expected[i], dataset.inputs[i],
dataset.NumBytes(), hipMemcpyDeviceToHost));
// Allocate temporary host array to accumulate results
int8_t* resultI1 = (int8_t *)malloc(dataset.NumBytes());
uint8_t* resultU1 = (uint8_t *)resultI1;
int32_t* resultI4 = (int32_t *)resultI1;
uint32_t* resultU4 = (uint32_t *)resultI1;
int64_t* resultI8 = (int64_t *)resultI1;
uint64_t* resultU8 = (uint64_t *)resultI1;
float* resultF4 = (float *)resultI1;
double* resultF8 = (double *)resultI1;
rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1;
// Initialize the result with the first device's array
memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
// Perform reduction on the other device arrays
for (int i = 1; i < dataset.numDevices; i++)
{
int8_t* arrayI1 = (int8_t *)dataset.expected[i];
uint8_t* arrayU1 = (uint8_t *)arrayI1;
int32_t* arrayI4 = (int32_t *)arrayI1;
uint32_t* arrayU4 = (uint32_t *)arrayI1;
int64_t* arrayI8 = (int64_t *)arrayI1;
uint64_t* arrayU8 = (uint64_t *)arrayI1;
float* arrayF4 = (float *)arrayI1;
double* arrayF8 = (double *)arrayI1;
rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1;
for (int j = 0; j < dataset.numElements; j++)
{
switch (dataset.dataType)
{
case ncclInt8: resultI1[j] = ReduceOp(op, resultI1[j], arrayI1[j]); break;
case ncclUint8: resultU1[j] = ReduceOp(op, resultU1[j], arrayU1[j]); break;
case ncclInt32: resultI4[j] = ReduceOp(op, resultI4[j], arrayI4[j]); break;
case ncclUint32: resultU4[j] = ReduceOp(op, resultU4[j], arrayU4[j]); break;
case ncclInt64: resultI8[j] = ReduceOp(op, resultI8[j], arrayI8[j]); break;
case ncclUint64: resultU8[j] = ReduceOp(op, resultU8[j], arrayU8[j]); break;
case ncclFloat32: resultF4[j] = ReduceOp(op, resultF4[j], arrayF4[j]); break;
case ncclFloat64: resultF8[j] = ReduceOp(op, resultF8[j], arrayF8[j]); break;
case ncclBfloat16: resultB2[j] = ReduceOp(op, resultB2[j], arrayB2[j]); break;
default:
fprintf(stderr, "[ERROR] Unsupported datatype\n");
exit(0);
}
}
}
// Copy results into expected arrays
for (int i = 0; i < dataset.numDevices; i++)
memcpy(dataset.expected[i], resultI1, dataset.NumBytes());
free(resultI1);
}
};
}
#endif
@@ -1,82 +0,0 @@
/*************************************************************************
* Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "test_AllReduceGroupMultiProcess.hpp"
namespace CorrectnessTests
{
TEST_P(AllReduceGroupMultiProcessCorrectnessTest, Correctness)
{
// Important: Make sure the order of ncclFunc_t's here match the order of ncclFunc_ts
// as they appear in TestGroupCalls()
std::vector<ncclFunc_t> ncclFuncs;
ncclFuncs.push_back(ncclCollAllReduce);
ncclFuncs.push_back(ncclCollAllReduce);
ncclFuncs.push_back(ncclCollAllReduce);
// Create multiple datasets for combined operation
std::vector<Dataset*> datasets(ncclFuncs.size());
for (int i = 0; i < datasets.size(); i++)
{
datasets[i] = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
datasets[i]->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclFuncs[i]);
}
int const numGpusPerProcess = 2;
int const numProcesses = numDevices / numGpusPerProcess;
std::vector<int> pids(numProcesses);
int process = -1;
for (int i = 0; i < numDevices; i+= numGpusPerProcess)
{
process++;
int pid = fork();
if (pid == 0)
{
int gpuIdx = i;
int maxIdx = gpuIdx + (numGpusPerProcess - 1) >= numDevices ? numDevices : gpuIdx + numGpusPerProcess;
std::vector<int> ranks;
for (; gpuIdx < maxIdx; gpuIdx++)
{
ranks.push_back(gpuIdx);
}
bool pass;
TestGroupCalls(process, ranks, datasets, ncclFuncs, pass);
TerminateChildProcess(pass);
}
else
{
pids[process] = pid;
}
}
ValidateProcesses(pids);
for (int i = 0; i < datasets.size(); i++)
{
datasets[i]->ReleaseRootProcess();
munmap(datasets[i], sizeof(Dataset));
}
}
INSTANTIATE_TEST_SUITE_P(AllReduceGroupMultiProcessCorrectnessSweep,
AllReduceGroupMultiProcessCorrectnessTest,
testing::Combine(
// Reduction operator (not used)
testing::Values(ncclSum),
// Data types
testing::Values(ncclFloat32,
ncclFloat64),
// Number of elements
testing::Values(3072, 3145728),
// Number of devices
testing::Values(4,8),
// In-place or not
testing::Values(false, true),
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
CorrectnessTest::PrintToStringParamName());
} // namespace
@@ -1,105 +0,0 @@
/*************************************************************************
* Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef TEST_ALLREDUCEGROUP_MULTI_PROCESS_HPP
#define TEST_ALLREDUCEGROUP_MULTI_PROCESS_HPP
#include "CorrectnessTest.hpp"
#include "test_AllReduceMultiProcess.hpp"
#include <string>
namespace CorrectnessTests
{
class AllReduceGroupMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
{
public:
void TestGroupCalls(int process, std::vector<int> const& ranks, std::vector<Dataset*>& datasets, std::vector<ncclFunc_t> const& funcs, bool& pass)
{
ncclGroupStart();
for (int i = 0; i < ranks.size(); i++)
{
SetUpPerProcess(ranks[i], funcs, comms[ranks[i]], streams[ranks[i]], datasets);
if (numDevices > numDevicesAvailable)
{
break;
}
}
ncclGroupEnd();
if (numDevices > numDevicesAvailable)
{
pass = true;
return;
}
int numProcesses = numDevices / ranks.size();
Barrier barrier(process, numProcesses, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID"))));
for (int i = 0; i < ranks.size(); i++)
{
for (int j = 0; j < datasets.size(); j++)
{
FillDatasetWithPattern(*datasets[j], ranks[i]);
}
}
int const root = 0;
for (int i = 0; i < 3; i++)
{
AllReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[i], barrier, op, ranks);
}
barrier.Wait();
size_t const byteCount = datasets[0]->NumBytes() / numDevices;
size_t const elemCount = numElements / numDevices;
ncclGroupStart();
// AllReduce
for (int i = 0; i < ranks.size(); i++)
{
int rank = ranks[i];
for (int j = 0; j < 3; j++)
{
ncclAllReduce(datasets[j]->inputs[rank], datasets[j]->outputs[rank],
numElements, dataType, op, comms[rank], streams[rank]);
}
}
// Signal end of group call
ncclGroupEnd();
for (int i = 0; i < ranks.size(); i++)
{
HIP_CALL(hipSetDevice(ranks[i]));
HIP_CALL(hipStreamSynchronize(streams[ranks[i]]));
}
for (int i = 0; i < funcs.size(); i++)
{
for (int j = 0; j < ranks.size(); j++)
{
pass = ValidateResults(*datasets[i], ranks[j], root);
if (!pass)
{
break;
}
}
barrier.Wait();
for (int j = 0; j < ranks.size(); j++)
{
datasets[i]->Release(ranks[j]);
}
}
for (int i = 0; i < ranks.size(); i++)
{
TearDownPerProcess(comms[ranks[i]], streams[ranks[i]]);
}
}
};
}
#endif
@@ -1,61 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "test_AllReduceMultiProcess.hpp"
namespace CorrectnessTests
{
TEST_P(AllReduceMultiProcessCorrectnessTest, Correctness)
{
dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollAllReduce);
std::vector<int> pids(numDevices);
int gpu = -1;
for (int i = 0; i < numDevices; i++)
{
gpu++;
int pid = fork();
if (pid == 0)
{
bool pass;
TestAllReduce(gpu, *dataset, pass);
TerminateChildProcess(pass);
}
else
{
pids[gpu] = pid;
}
}
ValidateProcesses(pids);
dataset->ReleaseRootProcess();
}
INSTANTIATE_TEST_SUITE_P(AllReduceMultiProcessCorrectnessSweep,
AllReduceMultiProcessCorrectnessTest,
testing::Combine(
// Reduction operator
testing::Values(ncclSum, ncclProd, ncclMax, ncclMin),
// Data types
testing::Values(ncclInt8,
ncclUint8,
ncclInt32,
ncclUint32,
ncclInt64,
ncclUint64,
//ncclFloat16,
ncclFloat32,
ncclFloat64,
ncclBfloat16),
// Number of elements
testing::Values(1024, 1048576),
// Number of devices
testing::Values(2,3,4,8),
// In-place or not
testing::Values(false, true),
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
CorrectnessTest::PrintToStringParamName());
} // namespace
-117
Näytä tiedosto
@@ -1,117 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef TEST_ALLREDUCE_MULTI_PROCESS_HPP
#define TEST_ALLREDUCE_MULTI_PROCESS_HPP
#include "CorrectnessTest.hpp"
namespace CorrectnessTests
{
class AllReduceMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
{
public:
static void ComputeExpectedResults(Dataset& dataset, Barrier& barrier, ncclRedOp_t const op, std::vector<int> const& ranks)
{
// Copy all inputs to expected arrays temporarily to perform reduction on host
for (int i = 0; i < ranks.size(); i++)
{
int rank = ranks[i];
HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.inputs[rank],
dataset.NumBytes(), hipMemcpyDeviceToHost));
}
barrier.Wait();
// Allocate temporary host array to accumulate results
int8_t* resultI1 = (int8_t *)malloc(dataset.NumBytes());
uint8_t* resultU1 = (uint8_t *)resultI1;
int32_t* resultI4 = (int32_t *)resultI1;
uint32_t* resultU4 = (uint32_t *)resultI1;
int64_t* resultI8 = (int64_t *)resultI1;
uint64_t* resultU8 = (uint64_t *)resultI1;
float* resultF4 = (float *)resultI1;
double* resultF8 = (double *)resultI1;
rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1;
// Initialize the result with the first device's array
memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
barrier.Wait();
// Perform reduction
for (int i = 1; i < dataset.numDevices; i++)
{
int8_t* arrayI1 = (int8_t *)dataset.expected[i];
uint8_t* arrayU1 = (uint8_t *)arrayI1;
int32_t* arrayI4 = (int32_t *)arrayI1;
uint32_t* arrayU4 = (uint32_t *)arrayI1;
int64_t* arrayI8 = (int64_t *)arrayI1;
uint64_t* arrayU8 = (uint64_t *)arrayI1;
float* arrayF4 = (float *)arrayI1;
double* arrayF8 = (double *)arrayI1;
rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1;
for (int j = 0; j < dataset.numElements; j++)
{
switch (dataset.dataType)
{
case ncclInt8: resultI1[j] = ReduceOp(op, resultI1[j], arrayI1[j]); break;
case ncclUint8: resultU1[j] = ReduceOp(op, resultU1[j], arrayU1[j]); break;
case ncclInt32: resultI4[j] = ReduceOp(op, resultI4[j], arrayI4[j]); break;
case ncclUint32: resultU4[j] = ReduceOp(op, resultU4[j], arrayU4[j]); break;
case ncclInt64: resultI8[j] = ReduceOp(op, resultI8[j], arrayI8[j]); break;
case ncclUint64: resultU8[j] = ReduceOp(op, resultU8[j], arrayU8[j]); break;
case ncclFloat32: resultF4[j] = ReduceOp(op, resultF4[j], arrayF4[j]); break;
case ncclFloat64: resultF8[j] = ReduceOp(op, resultF8[j], arrayF8[j]); break;
case ncclBfloat16: resultB2[j] = ReduceOp(op, resultB2[j], arrayB2[j]); break;
default:
fprintf(stderr, "[ERROR] Unsupported datatype\n");
exit(0);
}
}
}
barrier.Wait();
// Copy results into expected array
for (int i = 0; i < ranks.size(); i++)
{
int rank = ranks[i];
memcpy(dataset.expected[rank], resultI1, dataset.NumBytes());
}
free(resultI1);
}
void TestAllReduce(int rank, Dataset& dataset, bool& pass)
{
SetUpPerProcess(rank, ncclCollAllReduce, comms[rank], streams[rank], dataset);
if (numDevices > numDevicesAvailable)
{
pass = true;
return;
}
Barrier barrier(rank, numDevices, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID"))));
// Prepare input / output / expected results
FillDatasetWithPattern(dataset, rank);
ComputeExpectedResults(dataset, barrier, op, std::vector<int>(1, rank));
// Launch the reduction
ncclAllReduce(dataset.inputs[rank], dataset.outputs[rank],
numElements, dataType, op, comms[rank], streams[rank]);
// Wait for reduction to complete
HIP_CALL(hipStreamSynchronize(streams[rank]));
// Check results
pass = ValidateResults(dataset, rank);
TearDownPerProcess(comms[rank], streams[rank]);
dataset.Release(rank);
}
};
}
#endif
-67
Näytä tiedosto
@@ -1,67 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "test_AllToAll.hpp"
namespace CorrectnessTests
{
TEST_P(AllToAllCorrectnessTest, Correctness)
{
if (numDevices > numDevicesAvailable) return;
// Allocate data
Dataset dataset;
dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllToAll);
// Prepare input / output / expected results
FillDatasetWithPattern(dataset);
ComputeExpectedResults(dataset);
// Launch the reduction (1 thread per GPU)
ncclGroupStart();
for (int i = 0; i < numDevices; i++)
{
ncclAllToAll(dataset.inputs[i],
dataset.outputs[i],
numElements, dataType,
comms[i], streams[i]);
}
ncclGroupEnd();
// Wait for reduction to complete
Synchronize();
// Check results
ValidateResults(dataset);
dataset.Release();
}
INSTANTIATE_TEST_SUITE_P(AllToAllCorrectnessSweep,
AllToAllCorrectnessTest,
testing::Combine(
// Reduction operator is not used
testing::Values(ncclSum),
// Data types
testing::Values(ncclInt8,
ncclUint8,
ncclInt32,
ncclUint32,
ncclInt64,
ncclUint64,
//ncclFloat16,
ncclFloat32,
ncclFloat64,
ncclBfloat16),
// Number of elements
testing::Values(1024, 1048576),
// Number of devices
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false),
testing::Values("")),
CorrectnessTest::PrintToStringParamName());
} // namespace
-26
Näytä tiedosto
@@ -1,26 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef TEST_ALLTOALL_HPP
#define TEST_ALLTOALL_HPP
#include "CorrectnessTest.hpp"
namespace CorrectnessTests
{
class AllToAllCorrectnessTest : public CorrectnessTest
{
public:
static void ComputeExpectedResults(Dataset& dataset)
{
for (int i = 0; i < dataset.numDevices; i++)
for (int j = 0; j < dataset.numDevices; j++)
HIP_CALL(hipMemcpy((int8_t *)dataset.expected[i]+dataset.NumBytes()*j, (int8_t *)dataset.inputs[j]+dataset.NumBytes()*i,
dataset.NumBytes(), hipMemcpyDeviceToHost));
}
};
}
#endif
-61
Näytä tiedosto
@@ -1,61 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "test_AllToAllMultiProcess.hpp"
namespace CorrectnessTests
{
TEST_P(AllToAllMultiProcessCorrectnessTest, Correctness)
{
dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollAllToAll);
std::vector<int> pids(numDevices);
int gpu = -1;
for (int i = 0; i < numDevices; i++)
{
gpu++;
int pid = fork();
if (pid == 0)
{
bool pass;
TestAllToAll(gpu, *dataset, pass);
TerminateChildProcess(pass);
}
else
{
pids[gpu] = pid;
}
}
ValidateProcesses(pids);
dataset->ReleaseRootProcess();
}
INSTANTIATE_TEST_SUITE_P(AllToAllMultiProcessCorrectnessSweep,
AllToAllMultiProcessCorrectnessTest,
testing::Combine(
// Reduction operator is not used
testing::Values(ncclSum),
// Data types
testing::Values(ncclInt8,
ncclUint8,
ncclInt32,
ncclUint32,
ncclInt64,
ncclUint64,
//ncclFloat16,
ncclFloat32,
ncclFloat64,
ncclBfloat16),
// Number of elements
testing::Values(1024, 1048576),
// Number of devices
testing::Values(2,3,4,8),
// In-place or not
testing::Values(false),
testing::Values("")),
CorrectnessTest::PrintToStringParamName());
} // namespace
-61
Näytä tiedosto
@@ -1,61 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef TEST_ALLTOALL_MULTI_PROCESS_HPP
#define TEST_ALLTOALL_MULTI_PROCESS_HPP
#include "CorrectnessTest.hpp"
namespace CorrectnessTests
{
class AllToAllMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
{
public:
static void ComputeExpectedResults(Dataset& dataset, std::vector<int> const& ranks)
{
for (int i = 0; i < ranks.size(); i++)
{
int rank = ranks[i];
for (int j = 0; j < dataset.numDevices; j++)
{
HIP_CALL(hipMemcpy((int8_t *)dataset.expected[j]+dataset.NumBytes()*rank, (int8_t *)dataset.inputs[rank]+dataset.NumBytes()*j,
dataset.NumBytes(), hipMemcpyDeviceToHost));
}
}
}
void TestAllToAll(int rank, Dataset& dataset, bool& pass)
{
SetUpPerProcess(rank, ncclCollAllToAll, comms[rank], streams[rank], dataset);
if (numDevices > numDevicesAvailable)
{
pass = true;
return;
}
// Prepare input / output / expected results
FillDatasetWithPattern(dataset, rank);
ComputeExpectedResults(dataset, std::vector<int>(1, rank));
// Launch the reduction
ncclAllToAll(dataset.inputs[rank],
dataset.outputs[rank],
numElements, dataType,
comms[rank], streams[rank]);
// Wait for reduction to complete
HIP_CALL(hipStreamSynchronize(streams[rank]));
// Check results
pass = ValidateResults(dataset, rank);
TearDownPerProcess(comms[rank], streams[rank]);
dataset.Release(rank);
}
};
}
#endif
-75
Näytä tiedosto
@@ -1,75 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "test_AllToAllv.hpp"
namespace CorrectnessTests
{
TEST_P(AllToAllvCorrectnessTest, Correctness)
{
if (numDevices > numDevicesAvailable) return;
// Allocate data
Dataset dataset;
dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllToAll);
// Prepare input / output / expected results
FillDatasetWithPattern(dataset);
ComputeExpectedResults(dataset);
size_t chunksize = numElements*2/numDevices;
#define MAX_ALLTOALLV_RANKS 16
static size_t sendcounts[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS], recvcounts[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS], sdispls[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS], rdispls[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS];
// Launch the reduction (1 thread per GPU)
ncclGroupStart();
for (int r = 0; r < numDevices; r++) {
size_t disp = 0;
for (int i = 0; i < numDevices; i++) {
size_t scount = ((i+r)%numDevices)*chunksize;
if (i+r == numDevices-1)
scount += (numElements*numDevices-chunksize*(numDevices-1)*numDevices/2);
sendcounts[i+r*MAX_ALLTOALLV_RANKS] = recvcounts[i+r*MAX_ALLTOALLV_RANKS] = scount;
sdispls[i+r*MAX_ALLTOALLV_RANKS] = rdispls[i+r*MAX_ALLTOALLV_RANKS] = disp;
disp += scount;
}
ncclAllToAllv((char*)dataset.inputs[r], sendcounts+r*MAX_ALLTOALLV_RANKS, sdispls+r*MAX_ALLTOALLV_RANKS,
(char*)dataset.outputs[r], recvcounts+r*MAX_ALLTOALLV_RANKS, rdispls+r*MAX_ALLTOALLV_RANKS, dataType, comms[r], streams[r]);
}
ncclGroupEnd();
// Wait for reduction to complete
Synchronize();
// Check results
ValidateResults(dataset);
dataset.Release();
}
INSTANTIATE_TEST_SUITE_P(AllToAllvCorrectnessSweep,
AllToAllvCorrectnessTest,
testing::Combine(
// Reduction operator is not used
testing::Values(ncclSum),
// Data types
testing::Values(ncclInt8,
ncclUint8,
ncclInt32,
ncclUint32,
ncclInt64,
ncclUint64,
//ncclFloat16,
ncclFloat32,
ncclFloat64,
ncclBfloat16),
// Number of elements
testing::Values(2520, 3026520),
// Number of devices
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false),
testing::Values("")),
CorrectnessTest::PrintToStringParamName());
} // namespace
-44
Näytä tiedosto
@@ -1,44 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef TEST_ALLTOALLV_HPP
#define TEST_ALLTOALLV_HPP
#include "CorrectnessTest.hpp"
namespace CorrectnessTests
{
class AllToAllvCorrectnessTest : public CorrectnessTest
{
public:
static void ComputeExpectedResults(Dataset& dataset)
{
for (int i = 0; i < dataset.numDevices; i++) {
size_t rdisp = 0;
size_t chunksize = dataset.numElements*2/dataset.numDevices;
for (int j = 0; j < dataset.numDevices; j++) {
size_t scount = 0, rcount = ((j+i)%dataset.numDevices)*chunksize;
if (j+i == dataset.numDevices-1)
rcount += (dataset.numElements*dataset.numDevices-chunksize*(dataset.numDevices-1)*dataset.numDevices/2);
size_t sdisp = 0;
for (int k=0; k<dataset.numDevices; k++) {
scount = ((k+j)%dataset.numDevices)*chunksize;
if (k+j == dataset.numDevices-1)
scount += (dataset.numElements*dataset.numDevices-chunksize*(dataset.numDevices-1)*dataset.numDevices/2);
if (k == i)
break;
sdisp += scount;
}
HIP_CALL(hipMemcpy((int8_t *)dataset.expected[i]+rdisp*DataTypeToBytes(dataset.dataType),
(int8_t *)dataset.inputs[j]+sdisp*DataTypeToBytes(dataset.dataType),
rcount*DataTypeToBytes(dataset.dataType), hipMemcpyDeviceToHost));
rdisp += rcount;
}
}
}
};
}
#endif
-71
Näytä tiedosto
@@ -1,71 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "test_Broadcast.hpp"
namespace CorrectnessTests
{
TEST_P(BroadcastCorrectnessTest, Correctness)
{
if (numDevices > numDevicesAvailable) return;
// Allocate data
Dataset dataset;
dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollBroadcast);
// Test each possible root
for (int root = 0; root < numDevices; root++)
{
// Prepare input / output / expected results
FillDatasetWithPattern(dataset);
ComputeExpectedResults(dataset, root);
// Launch the reduction (1 thread per GPU)
ncclGroupStart();
for (int i = 0; i < numDevices; i++)
{
ncclBroadcast(dataset.inputs[i],
dataset.outputs[i],
numElements, dataType,
root, comms[i], streams[i]);
}
ncclGroupEnd();
// Wait for reduction to complete
Synchronize();
// Check results
ValidateResults(dataset);
}
dataset.Release();
}
INSTANTIATE_TEST_SUITE_P(BroadcastCorrectnessSweep,
BroadcastCorrectnessTest,
testing::Combine(
// Reduction operator is not used
testing::Values(ncclSum),
// Data types
testing::Values(ncclInt8,
ncclUint8,
ncclInt32,
ncclUint32,
ncclInt64,
ncclUint64,
//ncclFloat16,
ncclFloat32,
ncclFloat64,
ncclBfloat16),
// Number of elements
testing::Values(1024, 1048576),
// Number of devices
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false, true),
testing::Values("")),
CorrectnessTest::PrintToStringParamName());
} // namespace
-25
Näytä tiedosto
@@ -1,25 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef TEST_BROADCAST_HPP
#define TEST_BROADCAST_HPP
#include "CorrectnessTest.hpp"
namespace CorrectnessTests
{
class BroadcastCorrectnessTest : public CorrectnessTest
{
public:
static void ComputeExpectedResults(Dataset& dataset, int const root)
{
for (int i = 0; i < dataset.numDevices; i++)
HIP_CALL(hipMemcpy(dataset.expected[i], dataset.inputs[root],
dataset.NumBytes(), hipMemcpyDeviceToHost));
}
};
}
#endif
-140
Näytä tiedosto
@@ -1,140 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "test_BroadcastAbort.hpp"
#include "../include/comm.h"
#define NUM_ITER 8
#define FAKE_OP_COUNT NUM_ITER+1
namespace CorrectnessTests
{
#define HIPCHECK(cmd) \
do { \
hipError_t error = (cmd); \
if (error != hipSuccess) { \
std::cerr << "Encountered HIP error (" << error << ") at line " \
<< __LINE__ << " in file " << __FILE__ << "\n"; \
exit(-1); \
} \
} while (0)
#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
TEST_P(BroadcastAbortTest, Correctness) {
if (numDevices > numDevicesAvailable) return;
// Prepare input / output / expected results
Dataset dataset;
dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollBroadcast);
FillDatasetWithPattern(dataset);
int root = 0;
int gpu = 0; // GPU number to trigger abort
ncclComm_t comm = comms[gpu];
HIPCHECK(hipSetDevice(gpu));
hipStream_t stream;
HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
struct ncclChannel* channel = comm->channels;
uint64_t **p_dev_head = (uint64_t **)((uint8_t*)(channel->devPeers + channel->ring.next) + offsetof(struct ncclPeer, send[0].conn.head));
uint64_t *real_head, *fake_head, *fake_h;
// get original head
HIPCHECK(hipMemcpy(&real_head, p_dev_head, sizeof(uint64_t*), hipMemcpyDefault));
// allocate and install fakes
HIPCHECK(hipHostMalloc(&fake_head, sizeof(uint64_t*), hipHostMallocMapped));
HIPCHECK(hipMemcpy(p_dev_head, &fake_head, sizeof(uint64_t*), hipMemcpyDefault));
*fake_head = 0;
// read back fakes to confirm
HIPCHECK(hipMemcpy(&fake_h, p_dev_head, sizeof(uint64_t*), hipMemcpyDefault));
//std::cerr << "[ ] replaced gpu " << gpu << " real_head = " << real_head << " to fake_head = " << fake_h << std::endl;
// Perform a number of iterations and introduce abort
for (int j = 0; j < NUM_ITER; j++) {
//std::cerr << "[ ] iter = " << j << std::endl;
// Start a group call
ncclGroupStart();
for (int i = 0; i < numDevices; i++) {
ncclBroadcast(dataset.inputs[i],
dataset.outputs[i],
numElements, dataType,
root, comms[i], streams[i]);
}
// Signal end of group call
ncclGroupEnd();
}
// Wait for reduction to complete
auto start = std::chrono::high_resolution_clock::now();
hipError_t hipErr;
int remaining = numDevices;
int* done = (int*)malloc(sizeof(int)*numDevices);
memset(done, 0, sizeof(int)*numDevices);
bool timeout = false, abort_called = false;
while (remaining) {
int idle = 1;
for (int i=0; i<numDevices; i++) {
if (done[i]) continue;
hipErr = hipStreamQuery(streams[i]);
if (hipErr == hipSuccess) {
done[i] = 1;
remaining--;
idle = 0;
continue;
}
#if NCCL_MAJOR >= 2
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0)
auto delta = std::chrono::high_resolution_clock::now() - start;
double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();
if (deltaSec > 10.0 && !timeout) {
std::cerr << "[ ] timeout condition, calling ncclCommAbort ... " << std::endl;
timeout = true;
}
ncclResult_t ncclAsyncErr;
ncclCommGetAsyncError(comms[i], &ncclAsyncErr);
if ((ncclAsyncErr != ncclSuccess || timeout) && !abort_called) {
// An asynchronous error happened. Stop the operation and destroy
// the communicator
std::cerr << "[ ] ncclAsyncErr = " << ncclAsyncErr << std::endl;
for (int i=0; i<numDevices; i++)
ncclCommAbort(comms[i]);
// Abort the perf test
abort_called = true;
break;
}
#endif
#endif
}
// We might want to let other threads (including NCCL threads) use the CPU.
if (idle) pthread_yield();
}
free(done);
HIPCHECK(hipHostFree(fake_head));
HIPCHECK(hipStreamDestroy(stream));
dataset.Release();
}
INSTANTIATE_TEST_SUITE_P(BroadcastAbortSweep,
BroadcastAbortTest,
testing::Combine(
// Reduction operator
testing::Values(ncclSum),
// Data types
testing::Values(ncclFloat32),
// Number of elements
testing::Values(1048576),
// Number of devices
testing::Values(2, 4),
// In-place or not
testing::Values(false),
testing::Values("")),
CorrectnessTest::PrintToStringParamName());
} // namespace
-20
Näytä tiedosto
@@ -1,20 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef TEST_ALLREDUCE_HPP
#define TEST_ALLREDUCE_HPP
#include "CorrectnessTest.hpp"
namespace CorrectnessTests
{
class BroadcastAbortTest : public CorrectnessTest
{
protected:
public:
};
}
#endif
@@ -1,68 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "test_BroadcastMultiProcess.hpp"
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <iostream>
namespace CorrectnessTests
{
TEST_P(BroadcastMultiProcessCorrectnessTest, Correctness)
{
dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollBroadcast);
std::vector<int> pids(numDevices);
int gpu = -1;
for (int i = 0; i < numDevices; i++)
{
gpu++;
int pid = fork();
if (pid == 0)
{
bool pass;
TestBroadcast(gpu, *dataset, pass);
TerminateChildProcess(pass);
}
else
{
pids[gpu] = pid;
}
}
ValidateProcesses(pids);
dataset->ReleaseRootProcess();
}
INSTANTIATE_TEST_SUITE_P(BroadcastMultiProcessCorrectnessSweep,
BroadcastMultiProcessCorrectnessTest,
testing::Combine(
// Reduction operator is not used
testing::Values(ncclSum),
// Data types
testing::Values(ncclInt8,
ncclUint8,
ncclInt32,
ncclUint32,
ncclInt64,
ncclUint64,
//ncclFloat16,
ncclFloat32,
ncclFloat64,
ncclBfloat16),
// Number of elements
testing::Values(1024, 1048576),
// Number of devices
testing::Values(2,3,4,8),
// In-place or not
testing::Values(false, true),
testing::Values("")),
CorrectnessTest::PrintToStringParamName());
} // namespace
@@ -1,77 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef TEST_BROADCAST_MULTI_PROCESS_HPP
#define TEST_BROADCAST_MULTI_PROCESS_HPP
#include "CorrectnessTest.hpp"
namespace CorrectnessTests
{
class BroadcastMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
{
public:
static void ComputeExpectedResults(Dataset& dataset, int const root, std::vector<int> const& ranks)
{
for (int h = 0; h < ranks.size(); h++)
{
int rank = ranks[h];
// Root has the answer; share it via host memcpy's
if (rank == root)
{
HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.inputs[rank],
dataset.NumBytes(), hipMemcpyDeviceToHost));
for (int i = 0; i < dataset.numDevices; i++)
{
if (i == rank) continue;
memcpy(dataset.expected[i], dataset.expected[root], dataset.NumBytes());
}
break;
}
}
}
void TestBroadcast(int rank, Dataset& dataset, bool& pass)
{
SetUpPerProcess(rank, ncclCollBroadcast, comms[rank], streams[rank], dataset);
if (numDevices > numDevicesAvailable)
{
pass = true;
return;
}
Barrier barrier(rank, numDevices, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID"))));
// Test each possible root
for (int root = 0; root < numDevices; root++)
{
// Prepare input / output / expected results
FillDatasetWithPattern(dataset, rank);
ComputeExpectedResults(dataset, root, std::vector<int>(1, rank));
// Launch the reduction (1 process per GPU)
ncclResult_t res = ncclBroadcast(dataset.inputs[rank],
dataset.outputs[rank],
numElements, dataType,
root, comms[rank], streams[rank]);
// Wait for reduction to complete
HIP_CALL(hipStreamSynchronize(streams[rank]));
// Check results
pass = ValidateResults(dataset, rank);
// Ensure all processes have finished current iteration before proceeding
barrier.Wait();
}
TearDownPerProcess(comms[rank], streams[rank]);
dataset.Release(rank);
}
};
}
#endif
-129
Näytä tiedosto
@@ -1,129 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "test_CombinedCalls.hpp"
#include "test_AllGather.hpp"
#include "test_AllReduce.hpp"
#include "test_Broadcast.hpp"
#include "test_Reduce.hpp"
#include "test_ReduceScatter.hpp"
#include "test_Scatter.hpp"
namespace CorrectnessTests
{
TEST_P(CombinedCallsCorrectnessTest, Correctness)
{
if (numDevices > numDevicesAvailable) return;
// Create multiple datasets for combined operation
std::vector<Dataset> datasets(5);
std::vector<ncclFunc_t> ncclFuncs(5);
ncclFuncs.push_back(ncclCollAllGather);
ncclFuncs.push_back(ncclCollAllReduce);
ncclFuncs.push_back(ncclCollBroadcast);
ncclFuncs.push_back(ncclCollReduce);
ncclFuncs.push_back(ncclCollReduceScatter);
// Adjust numElements to be multiple of numDevices
numElements = (numElements/numDevices)*numDevices;
for (int i = 0; i < datasets.size(); i++)
{
datasets[i].Initialize(numDevices, numElements, dataType, inPlace, ncclFuncs[i]);
FillDatasetWithPattern(datasets[i]);
}
Dataset scatter_dataset;
scatter_dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollScatter);
FillDatasetWithPattern(scatter_dataset);
// Compute expected results for each dataset in combined
int const root = 0;
AllGatherCorrectnessTest::ComputeExpectedResults(datasets[0]);
AllReduceCorrectnessTest::ComputeExpectedResults(datasets[1], op);
BroadcastCorrectnessTest::ComputeExpectedResults(datasets[2], root);
ReduceCorrectnessTest::ComputeExpectedResults(datasets[3], op, root);
ReduceScatterCorrectnessTest::ComputeExpectedResults(datasets[4], op);
ScatterCorrectnessTest::ComputeExpectedResults(scatter_dataset, root);
size_t const byteCount = datasets[0].NumBytes() / numDevices;
size_t const elemCount = numElements / numDevices;
for (int j = 0; j < 10; j++)
{
ncclGroupStart();
for (int i = 0; i < numDevices; i++)
{
ncclScatter(scatter_dataset.inputs[i],
scatter_dataset.outputs[i],
numElements, dataType,
root, comms[i], streams[i]);
}
ncclGroupEnd();
ncclGroupStart();
for (int i = 0; i < numDevices; i++)
{
ncclAllGather((int8_t *)datasets[0].inputs[i] + (i * byteCount),
datasets[0].outputs[i], elemCount,
dataType, comms[i], streams[i]);
ncclAllReduce(datasets[1].inputs[i], datasets[1].outputs[i],
numElements, dataType, op, comms[i], streams[i]);
ncclBroadcast(datasets[2].inputs[i],
datasets[2].outputs[i],
numElements, dataType,
root, comms[i], streams[i]);
ncclReduce(datasets[3].inputs[i],
datasets[3].outputs[i],
numElements, dataType, op,
root, comms[i], streams[i]);
ncclReduceScatter(datasets[4].inputs[i],
(int8_t *)datasets[4].outputs[i] + (i * byteCount),
elemCount, dataType, op,
comms[i], streams[i]);
}
ncclGroupEnd();
// Wait for reduction to complete
Synchronize();
// Check results for each collective in the combined
for (int i = 0; i < 5; i++)
ValidateResults(datasets[i]);
ValidateResults(scatter_dataset);
}
for (int i = 0; i < 5; i++)
datasets[i].Release();
scatter_dataset.Release();
}
INSTANTIATE_TEST_SUITE_P(CombinedCallsCorrectnessSweep,
CombinedCallsCorrectnessTest,
testing::Combine(
// Reduction operator (not used)
testing::Values(ncclSum),
// Data types
testing::Values(ncclInt8,
ncclUint8,
ncclInt32,
ncclUint32,
ncclInt64,
ncclUint64,
//ncclFloat16,
ncclFloat32,
ncclFloat64,
ncclBfloat16),
// Number of elements
testing::Values(2520, 3026520),
// Number of devices
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false),
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1", "RCCL_P2P_NET_DISABLE=0", "RCCL_P2P_NET_DISABLE=1")),
CorrectnessTest::PrintToStringParamName());
} // namespace
-17
Näytä tiedosto
@@ -1,17 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef TEST_COMBINEDCALLS_HPP
#define TEST_COMBINEDCALLS_HPP
#include "CorrectnessTest.hpp"
namespace CorrectnessTests
{
class CombinedCallsCorrectnessTest : public CorrectnessTest {};
}
#endif
@@ -1,81 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "test_CombinedCallsMultiProcess.hpp"
namespace CorrectnessTests
{
TEST_P(CombinedCallsMultiProcessCorrectnessTest, Correctness)
{
// Important: Make sure the order of ncclFunc_t's here match the order of ncclFunc_ts
// as they appear in TestCombinedCalls()
std::vector<ncclFunc_t> ncclFuncs;
ncclFuncs.push_back(ncclCollAllGather);
ncclFuncs.push_back(ncclCollAllReduce);
ncclFuncs.push_back(ncclCollBroadcast);
ncclFuncs.push_back(ncclCollReduce);
ncclFuncs.push_back(ncclCollReduceScatter);
// Create multiple datasets for combined operation
std::vector<Dataset*> datasets(ncclFuncs.size());
for (int i = 0; i < datasets.size(); i++)
{
datasets[i] = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
datasets[i]->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclFuncs[i]);
}
std::vector<int> pids(numDevices);
int gpu = -1;
for (int i = 0; i < numDevices; i++)
{
gpu++;
int pid = fork();
if (pid == 0)
{
bool pass;
TestCombinedCalls(gpu, datasets, ncclFuncs, pass);
TerminateChildProcess(pass);
}
else
{
pids[gpu] = pid;
}
}
ValidateProcesses(pids);
for (int i = 0; i < datasets.size(); i++)
{
datasets[i]->ReleaseRootProcess();
munmap(datasets[i], sizeof(Dataset));
}
}
INSTANTIATE_TEST_SUITE_P(CombinedCallsMultiProcessCorrectnessSweep,
CombinedCallsMultiProcessCorrectnessTest,
testing::Combine(
// Reduction operator (not used)
testing::Values(ncclSum),
// Data types
testing::Values(ncclInt8,
ncclUint8,
ncclInt32,
ncclUint32,
ncclInt64,
ncclUint64,
//ncclFloat16,
ncclFloat32,
ncclFloat64,
ncclBfloat16),
// Number of elements
testing::Values(3072, 3145728),
// Number of devices
testing::Values(2,3,4,8),
// In-place or not
testing::Values(false, true),
testing::Values("")),
CorrectnessTest::PrintToStringParamName());
} // namespace
@@ -1,97 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef TEST_COMBINEDCALLS_MULTI_PROCESS_HPP
#define TEST_COMBINEDCALLS_MULTI_PROCESS_HPP
#include "CorrectnessTest.hpp"
#include "test_AllGatherMultiProcess.hpp"
#include "test_AllReduceMultiProcess.hpp"
#include "test_BroadcastMultiProcess.hpp"
#include "test_ReduceMultiProcess.hpp"
#include "test_ReduceScatterMultiProcess.hpp"
namespace CorrectnessTests
{
class CombinedCallsMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
{
public:
void TestCombinedCalls(int rank, std::vector<Dataset*>& datasets, std::vector<ncclFunc_t> const& funcs, bool& pass)
{
SetUpPerProcess(rank, funcs, comms[rank], streams[rank], datasets);
if (numDevices > numDevicesAvailable)
{
pass = true;
return;
}
Barrier barrier(rank, numDevices, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID"))));
// Compute expected results for each dataset in combined
int const root = 0;
std::vector<int> ranks(1, rank);
AllGatherMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[0], barrier, numDevices, ranks);
AllReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[1], barrier, op, ranks);
BroadcastMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[2], root, ranks);
ReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[3], barrier, op, root, ranks);
ReduceScatterMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[4], barrier, op, ranks);
size_t const byteCount = datasets[0]->NumBytes() / numDevices;
size_t const elemCount = numElements / numDevices;
ncclAllGather((int8_t *)datasets[0]->inputs[rank] + (rank * byteCount),
datasets[0]->outputs[rank], elemCount,
dataType, comms[rank], streams[rank]);
ncclAllReduce(datasets[1]->inputs[rank], datasets[1]->outputs[rank],
numElements, dataType, op, comms[rank], streams[rank]);
ncclBroadcast(datasets[2]->inputs[rank],
datasets[2]->outputs[rank],
numElements, dataType,
root, comms[rank], streams[rank]);
ncclReduce(datasets[3]->inputs[rank],
datasets[3]->outputs[rank],
numElements, dataType, op,
root, comms[rank], streams[rank]);
ncclReduceScatter(datasets[4]->inputs[rank],
(int8_t *)datasets[4]->outputs[rank] + (rank * byteCount),
elemCount, dataType, op,
comms[rank], streams[rank]);
// Wait for reduction to complete
HIP_CALL(hipStreamSynchronize(streams[rank]));
for (int i = 0; i < funcs.size(); i++)
{
for (int j = 0; j < ranks.size(); j++)
{
pass = ValidateResults(*datasets[i], ranks[j], root);
if (!pass)
{
break;
}
}
barrier.Wait();
for (int j = 0; j < ranks.size(); j++)
{
datasets[i]->Release(ranks[j]);
}
}
for (int i = 0; i < ranks.size(); i++)
{
TearDownPerProcess(comms[ranks[i]], streams[ranks[i]]);
}
}
};
}
#endif
-71
Näytä tiedosto
@@ -1,71 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "test_Gather.hpp"
namespace CorrectnessTests
{
TEST_P(GatherCorrectnessTest, Correctness)
{
if (numDevices > numDevicesAvailable) return;
// Allocate data
Dataset dataset;
dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollGather);
// Test each possible root
for (int root = 0; root < numDevices; root++)
{
// Prepare input / output / expected results
FillDatasetWithPattern(dataset);
ComputeExpectedResults(dataset, root);
// Launch the reduction (1 thread per GPU)
ncclGroupStart();
for (int i = 0; i < numDevices; i++)
{
ncclGather(dataset.inputs[i],
dataset.outputs[i],
numElements, dataType,
root, comms[i], streams[i]);
}
ncclGroupEnd();
// Wait for reduction to complete
Synchronize();
// Check results
ValidateResults(dataset, root);
}
dataset.Release();
}
INSTANTIATE_TEST_SUITE_P(GatherCorrectnessSweep,
GatherCorrectnessTest,
testing::Combine(
// Reduction operator is not used
testing::Values(ncclSum),
// Data types
testing::Values(ncclInt8,
ncclUint8,
ncclInt32,
ncclUint32,
ncclInt64,
ncclUint64,
//ncclFloat16,
ncclFloat32,
ncclFloat64,
ncclBfloat16),
// Number of elements
testing::Values(1024, 1048576),
// Number of devices
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false),
testing::Values("")),
CorrectnessTest::PrintToStringParamName());
} // namespace
-25
Näytä tiedosto
@@ -1,25 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef TEST_GATHER_HPP
#define TEST_GATHER_HPP
#include "CorrectnessTest.hpp"
namespace CorrectnessTests
{
class GatherCorrectnessTest : public CorrectnessTest
{
public:
static void ComputeExpectedResults(Dataset& dataset, int const root)
{
for (int i = 0; i < dataset.numDevices; i++)
HIP_CALL(hipMemcpy((int8_t *)dataset.expected[root]+dataset.NumBytes()*i, dataset.inputs[i],
dataset.NumBytes(), hipMemcpyDeviceToHost));
}
};
}
#endif
-61
Näytä tiedosto
@@ -1,61 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "test_GatherMultiProcess.hpp"
namespace CorrectnessTests
{
TEST_P(GatherMultiProcessCorrectnessTest, Correctness)
{
dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollGather);
std::vector<int> pids(numDevices);
int gpu = -1;
for (int i = 0; i < numDevices; i++)
{
gpu++;
int pid = fork();
if (pid == 0)
{
bool pass;
TestGather(gpu, *dataset, pass);
TerminateChildProcess(pass);
}
else
{
pids[gpu] = pid;
}
}
ValidateProcesses(pids);
dataset->ReleaseRootProcess();
}
INSTANTIATE_TEST_SUITE_P(GatherMultiProcessCorrectnessSweep,
GatherMultiProcessCorrectnessTest,
testing::Combine(
// Reduction operator is not used
testing::Values(ncclSum),
// Data types
testing::Values(ncclInt8,
ncclUint8,
ncclInt32,
ncclUint32,
ncclInt64,
ncclUint64,
//ncclFloat16,
ncclFloat32,
ncclFloat64,
ncclBfloat16),
// Number of elements
testing::Values(1024, 1048576),
// Number of devices
testing::Values(2,3,4,8),
// In-place or not
testing::Values(false),
testing::Values("")),
CorrectnessTest::PrintToStringParamName());
} // namespace
-63
Näytä tiedosto
@@ -1,63 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef TEST_GATHER_MULTI_PROCESS_HPP
#define TEST_GATHER_MULTI_PROCESS_HPP
#include "CorrectnessTest.hpp"
namespace CorrectnessTests
{
class GatherMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
{
public:
static void ComputeExpectedResults(Dataset& dataset, int const root, int const rank)
{
HIP_CALL(hipMemcpy((int8_t *)dataset.expected[root]+dataset.NumBytes()*rank, dataset.inputs[rank],
dataset.NumBytes(), hipMemcpyDeviceToHost));
}
void TestGather(int rank, Dataset& dataset, bool& pass)
{
SetUpPerProcess(rank, ncclCollGather, comms[rank], streams[rank], dataset);
if (numDevices > numDevicesAvailable)
{
pass = true;
return;
}
Barrier barrier(rank, numDevices, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID"))));
// Test each possible root
for (int root = 0; root < numDevices; root++)
{
// Prepare input / output / expected results
FillDatasetWithPattern(dataset, rank);
ComputeExpectedResults(dataset, root, rank);
// Launch the reduction (1 process per GPU)
ncclGather(dataset.inputs[rank],
dataset.outputs[rank],
numElements, dataType,
root, comms[rank], streams[rank]);
// Wait for reduction to complete
HIP_CALL(hipStreamSynchronize(streams[rank]));
// Check results
pass = ValidateResults(dataset, rank, root);
// Ensure all processes have finished current iteration before proceeding
barrier.Wait();
}
TearDownPerProcess(comms[rank], streams[rank]);
dataset.Release(rank);
}
};
}
#endif
-130
Näytä tiedosto
@@ -1,130 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "test_GroupCalls.hpp"
#include "test_AllGather.hpp"
#include "test_AllReduce.hpp"
#include "test_Broadcast.hpp"
#include "test_Reduce.hpp"
#include "test_ReduceScatter.hpp"
namespace CorrectnessTests
{
TEST_P(GroupCallsCorrectnessTest, Correctness)
{
if (numDevices > numDevicesAvailable) return;
// Create multiple datasets for group operation
std::vector<Dataset> datasets(5);
std::vector<ncclFunc_t> ncclFuncs(5);
ncclFuncs.push_back(ncclCollAllGather);
ncclFuncs.push_back(ncclCollAllReduce);
ncclFuncs.push_back(ncclCollBroadcast);
ncclFuncs.push_back(ncclCollReduce);
ncclFuncs.push_back(ncclCollReduceScatter);
// Adjust numElements to be multiple of numDevices
numElements = (numElements/numDevices)*numDevices;
for (int i = 0; i < datasets.size(); i++)
{
datasets[i].Initialize(numDevices, numElements, dataType, inPlace, ncclFuncs[i]);
FillDatasetWithPattern(datasets[i]);
}
// Compute expected results for each dataset in group
int const root = 0;
AllGatherCorrectnessTest::ComputeExpectedResults(datasets[0]);
AllReduceCorrectnessTest::ComputeExpectedResults(datasets[1], op);
BroadcastCorrectnessTest::ComputeExpectedResults(datasets[2], root);
ReduceCorrectnessTest::ComputeExpectedResults(datasets[3], op, root);
ReduceScatterCorrectnessTest::ComputeExpectedResults(datasets[4], op);
// Start a group call
ncclGroupStart();
// AllGather
size_t const byteCount = datasets[0].NumBytes() / numDevices;
size_t const elemCount = numElements / numDevices;
for (int i = 0; i < numDevices; i++)
{
ncclAllGather((int8_t *)datasets[0].inputs[i] + (i * byteCount),
datasets[0].outputs[i], elemCount,
dataType, comms[i], streams[i]);
}
// AllReduce
for (int i = 0; i < numDevices; i++)
{
ncclAllReduce(datasets[1].inputs[i], datasets[1].outputs[i],
numElements, dataType, op, comms[i], streams[i]);
}
// Broadcast
for (int i = 0; i < numDevices; i++)
{
ncclBroadcast(datasets[2].inputs[i],
datasets[2].outputs[i],
numElements, dataType,
root, comms[i], streams[i]);
}
// Reduce
for (int i = 0; i < numDevices; i++)
{
ncclReduce(datasets[3].inputs[i],
datasets[3].outputs[i],
numElements, dataType, op,
root, comms[i], streams[i]);
}
// ReduceScatter
for (int i = 0; i < numDevices; i++)
{
ncclReduceScatter(datasets[4].inputs[i],
(int8_t *)datasets[4].outputs[i] + (i * byteCount),
elemCount, dataType, op,
comms[i], streams[i]);
}
// Signal end of group call
ncclGroupEnd();
// Wait for reduction to complete
Synchronize();
// Check results for each collective in the group
for (int i = 0; i < 5; i++)
{
ValidateResults(datasets[i]);
datasets[i].Release();
}
}
INSTANTIATE_TEST_SUITE_P(GroupCallsCorrectnessSweep,
GroupCallsCorrectnessTest,
testing::Combine(
// Reduction operator (not used)
testing::Values(ncclSum),
// Data types
testing::Values(ncclInt8,
ncclUint8,
ncclInt32,
ncclUint32,
ncclInt64,
ncclUint64,
//ncclFloat16,
ncclFloat32,
ncclFloat64,
ncclBfloat16),
// Number of elements
testing::Values(2520, 3026520),
// Number of devices
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false, true),
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
CorrectnessTest::PrintToStringParamName());
} // namespace
-17
Näytä tiedosto
@@ -1,17 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef TEST_GROUPCALLS_HPP
#define TEST_GROUPCALLS_HPP
#include "CorrectnessTest.hpp"
namespace CorrectnessTests
{
class GroupCallsCorrectnessTest : public CorrectnessTest {};
}
#endif
@@ -1,92 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "test_GroupCallsMultiProcess.hpp"
namespace CorrectnessTests
{
TEST_P(GroupCallsMultiProcessCorrectnessTest, Correctness)
{
// Important: Make sure the order of ncclFunc_t's here match the order of ncclFunc_ts
// as they appear in TestGroupCalls()
std::vector<ncclFunc_t> ncclFuncs;
ncclFuncs.push_back(ncclCollAllGather);
ncclFuncs.push_back(ncclCollAllReduce);
ncclFuncs.push_back(ncclCollBroadcast);
ncclFuncs.push_back(ncclCollReduce);
ncclFuncs.push_back(ncclCollReduceScatter);
// Create multiple datasets for combined operation
std::vector<Dataset*> datasets(ncclFuncs.size());
for (int i = 0; i < datasets.size(); i++)
{
datasets[i] = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
datasets[i]->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclFuncs[i]);
}
int const numGpusPerProcess = 2;
int const numProcesses = numDevices / numGpusPerProcess;
std::vector<int> pids(numProcesses);
int process = -1;
for (int i = 0; i < numDevices; i+= numGpusPerProcess)
{
process++;
int pid = fork();
if (pid == 0)
{
int gpuIdx = i;
int maxIdx = gpuIdx + (numGpusPerProcess - 1) >= numDevices ? numDevices : gpuIdx + numGpusPerProcess;
std::vector<int> ranks;
for (; gpuIdx < maxIdx; gpuIdx++)
{
ranks.push_back(gpuIdx);
}
bool pass;
TestGroupCalls(process, ranks, datasets, ncclFuncs, pass);
TerminateChildProcess(pass);
}
else
{
pids[process] = pid;
}
}
ValidateProcesses(pids);
for (int i = 0; i < datasets.size(); i++)
{
datasets[i]->ReleaseRootProcess();
munmap(datasets[i], sizeof(Dataset));
}
}
INSTANTIATE_TEST_SUITE_P(GroupCallsMultiProcessCorrectnessSweep,
GroupCallsMultiProcessCorrectnessTest,
testing::Combine(
// Reduction operator (not used)
testing::Values(ncclSum),
// Data types
testing::Values(ncclInt8,
ncclUint8,
ncclInt32,
ncclUint32,
ncclInt64,
ncclUint64,
//ncclFloat16,
ncclFloat32,
ncclFloat64,
ncclBfloat16),
// Number of elements
testing::Values(3072, 3145728),
// Number of devices
testing::Values(4,8),
// In-place or not
testing::Values(false, true),
testing::Values("")),
CorrectnessTest::PrintToStringParamName());
} // namespace
-148
Näytä tiedosto
@@ -1,148 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef TEST_GROUPCALLS_MULTI_PROCESS_HPP
#define TEST_GROUPCALLS_MULTI_PROCESS_HPP
#include "CorrectnessTest.hpp"
#include "test_AllGatherMultiProcess.hpp"
#include "test_AllReduceMultiProcess.hpp"
#include "test_BroadcastMultiProcess.hpp"
#include "test_ReduceMultiProcess.hpp"
#include "test_ReduceScatterMultiProcess.hpp"
#include <string>
namespace CorrectnessTests
{
class GroupCallsMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
{
public:
void TestGroupCalls(int process, std::vector<int> const& ranks, std::vector<Dataset*>& datasets, std::vector<ncclFunc_t> const& funcs, bool& pass)
{
ncclGroupStart();
for (int i = 0; i < ranks.size(); i++)
{
SetUpPerProcess(ranks[i], funcs, comms[ranks[i]], streams[ranks[i]], datasets);
if (numDevices > numDevicesAvailable)
{
break;
}
}
ncclGroupEnd();
if (numDevices > numDevicesAvailable)
{
pass = true;
return;
}
int numProcesses = numDevices / ranks.size();
Barrier barrier(process, numProcesses, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID"))));
for (int i = 0; i < ranks.size(); i++)
{
for (int j = 0; j < datasets.size(); j++)
{
FillDatasetWithPattern(*datasets[j], ranks[i]);
}
}
int const root = 0;
AllGatherMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[0], barrier, numDevices, ranks);
AllReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[1], barrier, op, ranks);
BroadcastMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[2], root, ranks);
ReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[3], barrier, op, root, ranks);
ReduceScatterMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[4], barrier, op, ranks);
barrier.Wait();
size_t const byteCount = datasets[0]->NumBytes() / numDevices;
size_t const elemCount = numElements / numDevices;
ncclGroupStart();
// AllGather
for (int i = 0; i < ranks.size(); i++)
{
int rank = ranks[i];
ncclAllGather((int8_t *)datasets[0]->inputs[rank] + (rank * byteCount),
datasets[0]->outputs[rank], elemCount,
dataType, comms[rank], streams[rank]);
}
// AllReduce
for (int i = 0; i < ranks.size(); i++)
{
int rank = ranks[i];
ncclAllReduce(datasets[1]->inputs[rank], datasets[1]->outputs[rank],
numElements, dataType, op, comms[rank], streams[rank]);
}
// Broadcast
for (int i = 0; i < ranks.size(); i++)
{
int rank = ranks[i];
ncclBroadcast(datasets[2]->inputs[rank],
datasets[2]->outputs[rank],
numElements, dataType,
root, comms[rank], streams[rank]);
}
// Reduce
for (int i = 0; i < ranks.size(); i++)
{
int rank = ranks[i];
ncclReduce(datasets[3]->inputs[rank],
datasets[3]->outputs[rank],
numElements, dataType, op,
root, comms[rank], streams[rank]);
}
// ReduceScatter
for (int i = 0; i < ranks.size(); i++)
{
int rank = ranks[i];
ncclReduceScatter(datasets[4]->inputs[rank],
(int8_t *)datasets[4]->outputs[rank] + (rank * byteCount),
elemCount, dataType, op,
comms[rank], streams[rank]);
}
// Signal end of group call
ncclGroupEnd();
for (int i = 0; i < ranks.size(); i++)
{
HIP_CALL(hipSetDevice(ranks[i]));
HIP_CALL(hipStreamSynchronize(streams[ranks[i]]));
}
for (int i = 0; i < funcs.size(); i++)
{
for (int j = 0; j < ranks.size(); j++)
{
pass = ValidateResults(*datasets[i], ranks[j], root);
if (!pass)
{
break;
}
}
barrier.Wait();
for (int j = 0; j < ranks.size(); j++)
{
datasets[i]->Release(ranks[j]);
}
}
for (int i = 0; i < ranks.size(); i++)
{
TearDownPerProcess(comms[ranks[i]], streams[ranks[i]]);
}
}
};
}
#endif
-71
Näytä tiedosto
@@ -1,71 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "test_Reduce.hpp"
namespace CorrectnessTests
{
TEST_P(ReduceCorrectnessTest, Correctness)
{
if (numDevices > numDevicesAvailable) return;
// Allocate data
Dataset dataset;
dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollReduce);
// Test each possible root
for (int root = 0; root < numDevices; root++)
{
// Prepare input / output / expected results
FillDatasetWithPattern(dataset);
ComputeExpectedResults(dataset, op, root);
// Launch the reduction (1 thread per GPU)
ncclGroupStart();
for (int i = 0; i < numDevices; i++)
{
ncclReduce(dataset.inputs[i],
dataset.outputs[i],
numElements, dataType, op,
root, comms[i], streams[i]);
}
ncclGroupEnd();
// Wait for reduction to complete
Synchronize();
// Check results
ValidateResults(dataset);
}
dataset.Release();
}
INSTANTIATE_TEST_SUITE_P(ReduceCorrectnessSweep,
ReduceCorrectnessTest,
testing::Combine(
// Reduction operator
testing::Values(ncclSum, ncclProd, ncclMax, ncclMin, ncclAvg),
// Data types
testing::Values(ncclInt8,
ncclUint8,
ncclInt32,
ncclUint32,
ncclInt64,
ncclUint64,
//ncclFloat16,
ncclFloat32,
ncclFloat64,
ncclBfloat16),
// Number of elements
testing::Values(1024, 1048576),
// Number of devices
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false, true),
testing::Values("")),
CorrectnessTest::PrintToStringParamName());
} // namespace
-87
Näytä tiedosto
@@ -1,87 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef TEST_REDUCE_HPP
#define TEST_REDUCE_HPP
#include "CorrectnessTest.hpp"
namespace CorrectnessTests
{
class ReduceCorrectnessTest : public CorrectnessTest
{
public:
static void ComputeExpectedResults(Dataset& dataset, ncclRedOp_t const op, int const root)
{
// Copy all inputs to expected arrays temporarily to perform reduction on host
for (int i = 0; i < dataset.numDevices; i++)
HIP_CALL(hipMemcpy(dataset.expected[i], dataset.inputs[i],
dataset.NumBytes(), hipMemcpyDeviceToHost));
// Allocate temporary host array to accumulate results
int8_t* resultI1 = (int8_t *)malloc(dataset.NumBytes());
uint8_t* resultU1 = (uint8_t *)resultI1;
int32_t* resultI4 = (int32_t *)resultI1;
uint32_t* resultU4 = (uint32_t *)resultI1;
int64_t* resultI8 = (int64_t *)resultI1;
uint64_t* resultU8 = (uint64_t *)resultI1;
float* resultF4 = (float *)resultI1;
double* resultF8 = (double *)resultI1;
rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1;
// Initialize the result with the first device's array
memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
ncclRedOp_t red_op = ((op == ncclAvg) ? ncclSum : op);
// Perform reduction on the other device arrays
for (int i = 1; i < dataset.numDevices; i++)
{
int8_t* arrayI1 = (int8_t *)dataset.expected[i];
uint8_t* arrayU1 = (uint8_t *)arrayI1;
int32_t* arrayI4 = (int32_t *)arrayI1;
uint32_t* arrayU4 = (uint32_t *)arrayI1;
int64_t* arrayI8 = (int64_t *)arrayI1;
uint64_t* arrayU8 = (uint64_t *)arrayI1;
float* arrayF4 = (float *)arrayI1;
double* arrayF8 = (double *)arrayI1;
rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1;
for (int j = 0; j < dataset.numElements; j++)
{
switch (dataset.dataType)
{
case ncclInt8: resultI1[j] = ReduceOp(red_op, resultI1[j], arrayI1[j]); break;
case ncclUint8: resultU1[j] = ReduceOp(red_op, resultU1[j], arrayU1[j]); break;
case ncclInt32: resultI4[j] = ReduceOp(red_op, resultI4[j], arrayI4[j]); break;
case ncclUint32: resultU4[j] = ReduceOp(red_op, resultU4[j], arrayU4[j]); break;
case ncclInt64: resultI8[j] = ReduceOp(red_op, resultI8[j], arrayI8[j]); break;
case ncclUint64: resultU8[j] = ReduceOp(red_op, resultU8[j], arrayU8[j]); break;
case ncclFloat32: resultF4[j] = ReduceOp(red_op, resultF4[j], arrayF4[j]); break;
case ncclFloat64: resultF8[j] = ReduceOp(red_op, resultF8[j], arrayF8[j]); break;
case ncclBfloat16: resultB2[j] = ReduceOp(red_op, resultB2[j], arrayB2[j]); break;
default:
fprintf(stderr, "[ERROR] Unsupported datatype\n");
exit(0);
}
}
}
if (op == ncclAvg)
Average(dataset, resultI1);
// Copy results into expected arrays
for (int i = 0; i < dataset.numDevices; i++)
{
if (i == root)
memcpy(dataset.expected[root], resultI1, dataset.NumBytes());
else
HIP_CALL(hipMemcpy(dataset.expected[i], dataset.outputs[i], dataset.NumBytes(), hipMemcpyDeviceToHost));
}
free(resultI1);
}
};
}
#endif
-61
Näytä tiedosto
@@ -1,61 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "test_ReduceMultiProcess.hpp"
namespace CorrectnessTests
{
TEST_P(ReduceMultiProcessCorrectnessTest, Correctness)
{
dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollReduce);
std::vector<int> pids(numDevices);
int gpu = -1;
for (int i = 0; i < numDevices; i++)
{
gpu++;
int pid = fork();
if (pid == 0)
{
bool pass;
TestReduce(gpu, *dataset, pass);
TerminateChildProcess(pass);
}
else
{
pids[gpu] = pid;
}
}
ValidateProcesses(pids);
dataset->ReleaseRootProcess();
}
INSTANTIATE_TEST_SUITE_P(ReduceMultiProcessCorrectnessSweep,
ReduceMultiProcessCorrectnessTest,
testing::Combine(
// Reduction operator
testing::Values(ncclSum, ncclProd, ncclMax, ncclMin),
// Data types
testing::Values(ncclInt8,
ncclUint8,
ncclInt32,
ncclUint32,
ncclInt64,
ncclUint64,
//ncclFloat16,
ncclFloat32,
ncclFloat64,
ncclBfloat16),
// Number of elements
testing::Values(1024, 1048576),
// Number of devices
testing::Values(2,3,4,8),
// In-place or not
testing::Values(false, true),
testing::Values("")),
CorrectnessTest::PrintToStringParamName());
} // namespace
-131
Näytä tiedosto
@@ -1,131 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef TEST_REDUCE_MULTI_PROCESS_HPP
#define TEST_REDUCE_MULTI_PROCESS_HPP
#include "CorrectnessTest.hpp"
namespace CorrectnessTests
{
class ReduceMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
{
public:
static void ComputeExpectedResults(Dataset& dataset, Barrier& barrier, ncclRedOp_t const op, int const root, std::vector<int> const& ranks)
{
// Copy all inputs to expected arrays temporarily to perform reduction on host
for (int i = 0; i < ranks.size(); i++)
{
int rank = ranks[i];
HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.inputs[rank],
dataset.NumBytes(), hipMemcpyDeviceToHost));
}
barrier.Wait();
for (int h = 0; h < ranks.size(); h++)
{
int rank = ranks[h];
if (rank == root)
{
// Allocate temporary host array to accumulate results
int8_t* resultI1 = (int8_t *)malloc(dataset.NumBytes());
uint8_t* resultU1 = (uint8_t *)resultI1;
int32_t* resultI4 = (int32_t *)resultI1;
uint32_t* resultU4 = (uint32_t *)resultI1;
int64_t* resultI8 = (int64_t *)resultI1;
uint64_t* resultU8 = (uint64_t *)resultI1;
float* resultF4 = (float *)resultI1;
double* resultF8 = (double *)resultI1;
rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1;
// Initialize the result with the first device's array
memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
// Perform reduction on the other device arrays
for (int i = 1; i < dataset.numDevices; i++)
{
int8_t* arrayI1 = (int8_t *)dataset.expected[i];
uint8_t* arrayU1 = (uint8_t *)arrayI1;
int32_t* arrayI4 = (int32_t *)arrayI1;
uint32_t* arrayU4 = (uint32_t *)arrayI1;
int64_t* arrayI8 = (int64_t *)arrayI1;
uint64_t* arrayU8 = (uint64_t *)arrayI1;
float* arrayF4 = (float *)arrayI1;
double* arrayF8 = (double *)arrayI1;
rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1;
for (int j = 0; j < dataset.numElements; j++)
{
switch (dataset.dataType)
{
case ncclInt8: resultI1[j] = ReduceOp(op, resultI1[j], arrayI1[j]); break;
case ncclUint8: resultU1[j] = ReduceOp(op, resultU1[j], arrayU1[j]); break;
case ncclInt32: resultI4[j] = ReduceOp(op, resultI4[j], arrayI4[j]); break;
case ncclUint32: resultU4[j] = ReduceOp(op, resultU4[j], arrayU4[j]); break;
case ncclInt64: resultI8[j] = ReduceOp(op, resultI8[j], arrayI8[j]); break;
case ncclUint64: resultU8[j] = ReduceOp(op, resultU8[j], arrayU8[j]); break;
case ncclFloat32: resultF4[j] = ReduceOp(op, resultF4[j], arrayF4[j]); break;
case ncclFloat64: resultF8[j] = ReduceOp(op, resultF8[j], arrayF8[j]); break;
case ncclBfloat16: resultB2[j] = ReduceOp(op, resultB2[j], arrayB2[j]); break;
default:
fprintf(stderr, "[ERROR] Unsupported datatype\n");
exit(0);
}
}
}
memcpy(dataset.expected[root], resultI1, dataset.NumBytes());
free(resultI1);
}
}
barrier.Wait();
for (int i = 0; i < ranks.size(); i++)
{
int rank = ranks[i];
if (rank != root)
{
HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.outputs[rank], dataset.NumBytes(), hipMemcpyDeviceToHost));
}
}
}
void TestReduce(int rank, Dataset& dataset, bool& pass)
{
SetUpPerProcess(rank, ncclCollReduce, comms[rank], streams[rank], dataset);
if (numDevices > numDevicesAvailable)
{
pass = true;
return;
}
Barrier barrier(rank, numDevices, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID"))));
// Test each possible root
for (int root = 0; root < numDevices; root++)
{
// Prepare input / output / expected results
FillDatasetWithPattern(dataset, rank);
ComputeExpectedResults(dataset, barrier, op, root, std::vector<int>(1, rank));
// Launch the reduction (1 process per GPU)
ncclResult_t res = ncclReduce(dataset.inputs[rank],
dataset.outputs[rank],
numElements, dataType, op,
root, comms[rank], streams[rank]);
// Wait for reduction to complete
HIP_CALL(hipStreamSynchronize(streams[rank]));
// Check results
pass = ValidateResults(dataset, rank);
// Ensure all processes have finished current iteration before proceeding
barrier.Wait();
}
TearDownPerProcess(comms[rank], streams[rank]);
dataset.Release(rank);
}
};
}
#endif
-71
Näytä tiedosto
@@ -1,71 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "test_ReduceScatter.hpp"
namespace CorrectnessTests
{
TEST_P(ReduceScatterCorrectnessTest, Correctness)
{
// Adjust numElements to be multiple of numDevices
numElements = (numElements/numDevices)*numDevices;
if (numDevices > numDevicesAvailable) return;
if (numElements % numDevices != 0) return;
// Prepare input / output / expected results
Dataset dataset;
dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollReduceScatter);
FillDatasetWithPattern(dataset);
ComputeExpectedResults(dataset, op);
size_t const byteCount = dataset.NumBytes() / dataset.numDevices;
size_t const recvCount = dataset.numElements / dataset.numDevices;
// Launch the reduction (1 thread per GPU)
ncclGroupStart();
for (int i = 0; i < numDevices; i++)
{
ncclReduceScatter(dataset.inputs[i],
(int8_t *)dataset.outputs[i] + (i * byteCount),
recvCount, dataType, op,
comms[i], streams[i]);
}
ncclGroupEnd();
// Wait for reduction to complete
Synchronize();
// Check results
ValidateResults(dataset);
dataset.Release();
}
INSTANTIATE_TEST_SUITE_P(ReduceScatterCorrectnessSweep,
ReduceScatterCorrectnessTest,
testing::Combine(
// Reduction operator
testing::Values(ncclSum, ncclProd, ncclMax, ncclMin, ncclAvg),
// Data types
testing::Values(ncclInt8,
ncclUint8,
ncclInt32,
ncclUint32,
ncclInt64,
ncclUint64,
//ncclFloat16,
ncclFloat32,
ncclFloat64,
ncclBfloat16),
// Number of elements
testing::Values(2520, 3026520),
// Number of devices
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false, true),
testing::Values("")),
CorrectnessTest::PrintToStringParamName());
} // namespace
-90
Näytä tiedosto
@@ -1,90 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef TEST_REDUCE_SCATTER_HPP
#define TEST_REDUCE_SCATTER_HPP
#include "CorrectnessTest.hpp"
namespace CorrectnessTests
{
class ReduceScatterCorrectnessTest : public CorrectnessTest
{
public:
static void ComputeExpectedResults(Dataset& dataset, ncclRedOp_t const op)
{
// Copy all inputs to expected arrays temporarily to perform reduction on host
for (int i = 0; i < dataset.numDevices; i++)
HIP_CALL(hipMemcpy(dataset.expected[i], dataset.inputs[i],
dataset.NumBytes(), hipMemcpyDeviceToHost));
// Allocate temporary host array to accumulate results
int8_t* resultI1 = (int8_t *)malloc(dataset.NumBytes());
uint8_t* resultU1 = (uint8_t *)resultI1;
int32_t* resultI4 = (int32_t *)resultI1;
uint32_t* resultU4 = (uint32_t *)resultI1;
int64_t* resultI8 = (int64_t *)resultI1;
uint64_t* resultU8 = (uint64_t *)resultI1;
float* resultF4 = (float *)resultI1;
double* resultF8 = (double *)resultI1;
rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1;
// Initialize the result with the first device's array
memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
ncclRedOp_t red_op = ((op == ncclAvg) ? ncclSum : op);
// Perform reduction on the other device arrays
for (int i = 1; i < dataset.numDevices; i++)
{
int8_t* arrayI1 = (int8_t *)dataset.expected[i];
uint8_t* arrayU1 = (uint8_t *)arrayI1;
int32_t* arrayI4 = (int32_t *)arrayI1;
uint32_t* arrayU4 = (uint32_t *)arrayI1;
int64_t* arrayI8 = (int64_t *)arrayI1;
uint64_t* arrayU8 = (uint64_t *)arrayI1;
float* arrayF4 = (float *)arrayI1;
double* arrayF8 = (double *)arrayI1;
rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1;
for (int j = 0; j < dataset.numElements; j++)
{
switch (dataset.dataType)
{
case ncclInt8: resultI1[j] = ReduceOp(red_op, resultI1[j], arrayI1[j]); break;
case ncclUint8: resultU1[j] = ReduceOp(red_op, resultU1[j], arrayU1[j]); break;
case ncclInt32: resultI4[j] = ReduceOp(red_op, resultI4[j], arrayI4[j]); break;
case ncclUint32: resultU4[j] = ReduceOp(red_op, resultU4[j], arrayU4[j]); break;
case ncclInt64: resultI8[j] = ReduceOp(red_op, resultI8[j], arrayI8[j]); break;
case ncclUint64: resultU8[j] = ReduceOp(red_op, resultU8[j], arrayU8[j]); break;
case ncclFloat32: resultF4[j] = ReduceOp(red_op, resultF4[j], arrayF4[j]); break;
case ncclFloat64: resultF8[j] = ReduceOp(red_op, resultF8[j], arrayF8[j]); break;
case ncclBfloat16: resultB2[j] = ReduceOp(red_op, resultB2[j], arrayB2[j]); break;
default:
fprintf(stderr, "[ERROR] Unsupported datatype\n");
exit(0);
}
}
}
if (op == ncclAvg)
Average(dataset, resultI1);
// Copy results into expected arrays
size_t const byteCount = dataset.NumBytes() / dataset.numDevices;
for (int i = 0; i < dataset.numDevices; i++)
HIP_CALL(hipMemcpy(dataset.expected[i], dataset.outputs[i],
dataset.NumBytes(), hipMemcpyDeviceToHost));
for (int i = 0; i < dataset.numDevices; i++)
memcpy((int8_t *)dataset.expected[i] + (i * byteCount),
resultI1 + (i * byteCount), byteCount);
free(resultI1);
}
};
}
#endif
@@ -1,61 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "test_ReduceScatterMultiProcess.hpp"
namespace CorrectnessTests
{
TEST_P(ReduceScatterMultiProcessCorrectnessTest, Correctness)
{
dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollReduceScatter);
std::vector<int> pids(numDevices);
int gpu = -1;
for (int i = 0; i < numDevices; i++)
{
gpu++;
int pid = fork();
if (pid == 0)
{
bool pass;
TestReduceScatter(gpu, *dataset, pass);
TerminateChildProcess(pass);
}
else
{
pids[gpu] = pid;
}
}
ValidateProcesses(pids);
dataset->ReleaseRootProcess();
}
INSTANTIATE_TEST_SUITE_P(ReduceScatterMultiProcessCorrectnessSweep,
ReduceScatterMultiProcessCorrectnessTest,
testing::Combine(
// Reduction operator
testing::Values(ncclSum, ncclProd, ncclMax, ncclMin),
// Data types
testing::Values(ncclInt8,
ncclUint8,
ncclInt32,
ncclUint32,
ncclInt64,
ncclUint64,
//ncclFloat16,
ncclFloat32,
ncclFloat64,
ncclBfloat16),
// Number of elements
testing::Values(3072, 3145728),
// Number of devices
testing::Values(2,3,4,8),
// In-place or not
testing::Values(false, true),
testing::Values("")),
CorrectnessTest::PrintToStringParamName());
} // namespace
@@ -1,146 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef TEST_REDUCE_SCATTER_MULTI_PROCESS_HPP
#define TEST_REDUCE_SCATTER_MULTI_PROCESS_HPP
#include "CorrectnessTest.hpp"
namespace CorrectnessTests
{
class ReduceScatterMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
{
public:
static void ComputeExpectedResults(Dataset& dataset, Barrier& barrier, ncclRedOp_t const op, std::vector<int> const& ranks)
{
// Copy all inputs to expected arrays temporarily to perform reduction on host
for (int i = 0; i < ranks.size(); i++)
{
int rank = ranks[i];
HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.inputs[rank],
dataset.NumBytes(), hipMemcpyDeviceToHost));
}
barrier.Wait();
// Have rank 0 do the expected calculation, then send results to other processes
int8_t* resultI1;
for (int h = 0; h < ranks.size(); h++)
{
int rank = ranks[h];
if (rank == 0)
{
// Allocate temporary host array to accumulate results
resultI1 = (int8_t *)malloc(dataset.NumBytes());
uint8_t* resultU1 = (uint8_t *)resultI1;
int32_t* resultI4 = (int32_t *)resultI1;
uint32_t* resultU4 = (uint32_t *)resultI1;
int64_t* resultI8 = (int64_t *)resultI1;
uint64_t* resultU8 = (uint64_t *)resultI1;
float* resultF4 = (float *)resultI1;
double* resultF8 = (double *)resultI1;
rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1;
// Initialize the result with the first device's array
memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
// Perform reduction on the other device arrays
for (int i = 1; i < dataset.numDevices; i++)
{
int8_t* arrayI1 = (int8_t *)dataset.expected[i];
uint8_t* arrayU1 = (uint8_t *)arrayI1;
int32_t* arrayI4 = (int32_t *)arrayI1;
uint32_t* arrayU4 = (uint32_t *)arrayI1;
int64_t* arrayI8 = (int64_t *)arrayI1;
uint64_t* arrayU8 = (uint64_t *)arrayI1;
float* arrayF4 = (float *)arrayI1;
double* arrayF8 = (double *)arrayI1;
rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1;
for (int j = 0; j < dataset.numElements; j++)
{
switch (dataset.dataType)
{
case ncclInt8: resultI1[j] = ReduceOp(op, resultI1[j], arrayI1[j]); break;
case ncclUint8: resultU1[j] = ReduceOp(op, resultU1[j], arrayU1[j]); break;
case ncclInt32: resultI4[j] = ReduceOp(op, resultI4[j], arrayI4[j]); break;
case ncclUint32: resultU4[j] = ReduceOp(op, resultU4[j], arrayU4[j]); break;
case ncclInt64: resultI8[j] = ReduceOp(op, resultI8[j], arrayI8[j]); break;
case ncclUint64: resultU8[j] = ReduceOp(op, resultU8[j], arrayU8[j]); break;
case ncclFloat32: resultF4[j] = ReduceOp(op, resultF4[j], arrayF4[j]); break;
case ncclFloat64: resultF8[j] = ReduceOp(op, resultF8[j], arrayF8[j]); break;
case ncclBfloat16: resultB2[j] = ReduceOp(op, resultB2[j], arrayB2[j]); break;
default:
fprintf(stderr, "[ERROR] Unsupported datatype\n");
exit(0);
}
}
}
}
}
barrier.Wait();
// Copy results into expected arrays
size_t const byteCount = dataset.NumBytes() / dataset.numDevices;
for (int i = 0; i < ranks.size(); i++)
{
int rank = ranks[i];
HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.outputs[rank],
dataset.NumBytes(), hipMemcpyDeviceToHost));
}
barrier.Wait();
for (int h = 0; h < ranks.size(); h++)
{
int rank = ranks[h];
if (rank == 0)
{
for (int i = 0; i < dataset.numDevices; i++)
memcpy((int8_t *)dataset.expected[i] + (i * byteCount),
resultI1 + (i * byteCount), byteCount);
free(resultI1);
}
}
}
void TestReduceScatter(int rank, Dataset& dataset, bool& pass)
{
// Prepare input / output / expected results
SetUpPerProcess(rank, ncclCollAllGather, comms[rank], streams[rank], dataset);
if (numDevices > numDevicesAvailable || numElements % numDevices != 0)
{
pass = true;
return;
}
Barrier barrier(rank, numDevices, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID"))));
// Prepare input / output / expected results
FillDatasetWithPattern(dataset, rank);
ComputeExpectedResults(dataset, barrier, op, std::vector<int>(1, rank));
size_t const byteCount = dataset.NumBytes() / numDevices;
size_t const recvCount = dataset.numElements / numDevices;
// Launch the reduction (1 process per GPU)
ncclReduceScatter(dataset.inputs[rank],
(int8_t *)dataset.outputs[rank] + (rank * byteCount),
recvCount, dataType, op,
comms[rank], streams[rank]);
// Wait for reduction to complete
HIP_CALL(hipStreamSynchronize(streams[rank]));
// Check results
pass = ValidateResults(dataset, rank);
TearDownPerProcess(comms[rank], streams[rank]);
dataset.Release(rank);
}
};
}
#endif
-71
Näytä tiedosto
@@ -1,71 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "test_Scatter.hpp"
namespace CorrectnessTests
{
TEST_P(ScatterCorrectnessTest, Correctness)
{
if (numDevices > numDevicesAvailable) return;
// Allocate data
Dataset dataset;
dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollScatter);
// Test each possible root
for (int root = 0; root < numDevices; root++)
{
// Prepare input / output / expected results
FillDatasetWithPattern(dataset);
ComputeExpectedResults(dataset, root);
// Launch the reduction (1 thread per GPU)
ncclGroupStart();
for (int i = 0; i < numDevices; i++)
{
ncclScatter(dataset.inputs[i],
dataset.outputs[i],
numElements, dataType,
root, comms[i], streams[i]);
}
ncclGroupEnd();
// Wait for reduction to complete
Synchronize();
// Check results
ValidateResults(dataset);
}
dataset.Release();
}
INSTANTIATE_TEST_SUITE_P(ScatterCorrectnessSweep,
ScatterCorrectnessTest,
testing::Combine(
// Reduction operator is not used
testing::Values(ncclSum),
// Data types
testing::Values(ncclInt8,
ncclUint8,
ncclInt32,
ncclUint32,
ncclInt64,
ncclUint64,
//ncclFloat16,
ncclFloat32,
ncclFloat64,
ncclBfloat16),
// Number of elements
testing::Values(1024, 1048576),
// Number of devices
testing::Range(2,(GTESTS_NUM_GPUS+1)),
// In-place or not
testing::Values(false),
testing::Values("")),
CorrectnessTest::PrintToStringParamName());
} // namespace
-25
Näytä tiedosto
@@ -1,25 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef TEST_SCATTER_HPP
#define TEST_SCATTER_HPP
#include "CorrectnessTest.hpp"
namespace CorrectnessTests
{
class ScatterCorrectnessTest : public CorrectnessTest
{
public:
static void ComputeExpectedResults(Dataset& dataset, int const root)
{
for (int i = 0; i < dataset.numDevices; i++)
HIP_CALL(hipMemcpy(dataset.expected[i], (int8_t *)dataset.inputs[root]+dataset.NumBytes()*i,
dataset.NumBytes(), hipMemcpyDeviceToHost));
}
};
}
#endif
-61
Näytä tiedosto
@@ -1,61 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "test_ScatterMultiProcess.hpp"
namespace CorrectnessTests
{
TEST_P(ScatterMultiProcessCorrectnessTest, Correctness)
{
dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollScatter);
std::vector<int> pids(numDevices);
int gpu = -1;
for (int i = 0; i < numDevices; i++)
{
gpu++;
int pid = fork();
if (pid == 0)
{
bool pass;
TestScatter(gpu, *dataset, pass);
TerminateChildProcess(pass);
}
else
{
pids[gpu] = pid;
}
}
ValidateProcesses(pids);
dataset->ReleaseRootProcess();
}
INSTANTIATE_TEST_SUITE_P(ScatterMultiProcessCorrectnessSweep,
ScatterMultiProcessCorrectnessTest,
testing::Combine(
// Reduction operator is not used
testing::Values(ncclSum),
// Data types
testing::Values(ncclInt8,
ncclUint8,
ncclInt32,
ncclUint32,
ncclInt64,
ncclUint64,
//ncclFloat16,
ncclFloat32,
ncclFloat64,
ncclBfloat16),
// Number of elements
testing::Values(1024, 1048576),
// Number of devices
testing::Values(2,3,4,8),
// In-place or not
testing::Values(false),
testing::Values("")),
CorrectnessTest::PrintToStringParamName());
} // namespace
-68
Näytä tiedosto
@@ -1,68 +0,0 @@
/*************************************************************************
* Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef TEST_SCATTER_MULTI_PROCESS_HPP
#define TEST_SCATTER_MULTI_PROCESS_HPP
#include "CorrectnessTest.hpp"
namespace CorrectnessTests
{
class ScatterMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
{
public:
static void ComputeExpectedResults(Dataset& dataset, int const root, int const rank)
{
if (rank == root)
{
for (int i = 0; i < dataset.numDevices; i++)
HIP_CALL(hipMemcpy(dataset.expected[i], (int8_t *)dataset.inputs[root]+dataset.NumBytes()*i,
dataset.NumBytes(), hipMemcpyDeviceToHost));
}
}
void TestScatter(int rank, Dataset& dataset, bool& pass)
{
// Prepare input / output / expected results
SetUpPerProcess(rank, ncclCollScatter, comms[rank], streams[rank], dataset);
if (numDevices > numDevicesAvailable)
{
pass = true;
return;
}
Barrier barrier(rank, numDevices, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID"))));
// Test each possible root
for (int root = 0; root < numDevices; root++)
{
// Prepare input / output / expected results
FillDatasetWithPattern(dataset, rank);
ComputeExpectedResults(dataset, root, rank);
// Launch the reduction (1 process per GPU)
ncclScatter(dataset.inputs[rank],
dataset.outputs[rank],
numElements, dataType,
root, comms[rank], streams[rank]);
// Wait for reduction to complete
HIP_CALL(hipStreamSynchronize(streams[rank]));
// Check results
pass = ValidateResults(dataset, rank);
barrier.Wait();
}
TearDownPerProcess(comms[rank], streams[rank]);
dataset.Release(rank);
}
};
}
#endif