From 29ad0f5fbe42114ac218be92900b2a11afe47bca Mon Sep 17 00:00:00 2001 From: gilbertlee-amd <44450918+gilbertlee-amd@users.noreply.github.com> Date: Fri, 25 Feb 2022 08:59:07 -0700 Subject: [PATCH] Unit test refactor (#500) Refactoring and consolidating single-process / multi-process unit testing --- .jenkins/common.groovy | 2 +- .jenkins/precheckin.groovy | 20 +- install.sh | 2 +- src/clique/CliqueManager.cc | 16 +- src/clique/CliqueManager.h | 14 +- test/AllGather_InPlace.cpp | 26 + test/AllGather_ManagedMem.cpp | 26 + test/AllGather_OutOfPlace.cpp | 26 + test/AllReduce_Clique.cpp | 31 + test/AllReduce_GroupCall.cpp | 63 ++ test/AllReduce_InPlace.cpp | 26 + test/AllReduce_ManagedMem.cpp | 26 + test/AllReduce_OutOfPlace.cpp | 26 + test/AllReduce_PreMultScalar.cpp | 74 ++ test/AllToAll_ManagedMem.cpp | 26 + test/AllToAll_OutOfPlace.cpp | 26 + test/Broadcast_InPlace.cpp | 26 + test/Broadcast_ManagedMem.cpp | 26 + test/Broadcast_OutOfPlace.cpp | 26 + test/CMakeLists.txt | 138 +-- test/CorrectnessTest.hpp | 1200 ---------------------- test/Gather_InPlace.cpp | 26 + test/Gather_ManagedMem.cpp | 26 + test/Gather_OutOfPlace.cpp | 26 + test/ReduceScatter_InPlace.cpp | 26 + test/ReduceScatter_ManagedMem.cpp | 26 + test/ReduceScatter_OutOfPlace.cpp | 26 + test/Reduce_InPlace.cpp | 26 + test/Reduce_ManagedMem.cpp | 26 + test/Reduce_OutOfPlace.cpp | 26 + test/Scatter_InPlace.cpp | 26 + test/Scatter_ManagedMem.cpp | 26 + test/Scatter_OutOfPlace.cpp | 26 + test/TestChecks.hpp | 63 -- test/common/CollectiveArgs.cpp | 282 +++++ test/common/CollectiveArgs.hpp | 151 +++ test/common/EnvVars.cpp | 161 +++ test/common/EnvVars.hpp | 44 + test/common/ErrCode.hpp | 38 + test/common/PrepDataFuncs.cpp | 342 ++++++ test/common/PrepDataFuncs.hpp | 26 + test/common/PtrUnion.cpp | 354 +++++++ test/common/PtrUnion.hpp | 90 ++ test/common/TestBed.cpp | 485 +++++++++ test/common/TestBed.hpp | 129 +++ test/common/TestBedChild.cpp | 589 +++++++++++ test/common/TestBedChild.hpp | 106 ++ 
test/common/main.cpp | 11 + test/test_AllGather.cpp | 117 --- test/test_AllGather.hpp | 34 - test/test_AllGatherMultiProcess.cpp | 60 -- test/test_AllGatherMultiProcess.hpp | 81 -- test/test_AllReduce.cpp | 80 -- test/test_AllReduce.hpp | 83 -- test/test_AllReduceAbort.cpp | 138 --- test/test_AllReduceAbort.hpp | 20 - test/test_AllReduceGroup.cpp | 83 -- test/test_AllReduceGroup.hpp | 79 -- test/test_AllReduceGroupMultiProcess.cpp | 82 -- test/test_AllReduceGroupMultiProcess.hpp | 105 -- test/test_AllReduceMultiProcess.cpp | 61 -- test/test_AllReduceMultiProcess.hpp | 117 --- test/test_AllToAll.cpp | 67 -- test/test_AllToAll.hpp | 26 - test/test_AllToAllMultiProcess.cpp | 61 -- test/test_AllToAllMultiProcess.hpp | 61 -- test/test_AllToAllv.cpp | 75 -- test/test_AllToAllv.hpp | 44 - test/test_Broadcast.cpp | 71 -- test/test_Broadcast.hpp | 25 - test/test_BroadcastAbort.cpp | 140 --- test/test_BroadcastAbort.hpp | 20 - test/test_BroadcastMultiProcess.cpp | 68 -- test/test_BroadcastMultiProcess.hpp | 77 -- test/test_CombinedCalls.cpp | 129 --- test/test_CombinedCalls.hpp | 17 - test/test_CombinedCallsMultiProcess.cpp | 81 -- test/test_CombinedCallsMultiProcess.hpp | 97 -- test/test_Gather.cpp | 71 -- test/test_Gather.hpp | 25 - test/test_GatherMultiProcess.cpp | 61 -- test/test_GatherMultiProcess.hpp | 63 -- test/test_GroupCalls.cpp | 130 --- test/test_GroupCalls.hpp | 17 - test/test_GroupCallsMultiProcess.cpp | 92 -- test/test_GroupCallsMultiProcess.hpp | 148 --- test/test_Reduce.cpp | 71 -- test/test_Reduce.hpp | 87 -- test/test_ReduceMultiProcess.cpp | 61 -- test/test_ReduceMultiProcess.hpp | 131 --- test/test_ReduceScatter.cpp | 71 -- test/test_ReduceScatter.hpp | 90 -- test/test_ReduceScatterMultiProcess.cpp | 61 -- test/test_ReduceScatterMultiProcess.hpp | 146 --- test/test_Scatter.cpp | 71 -- test/test_Scatter.hpp | 25 - test/test_ScatterMultiProcess.cpp | 61 -- test/test_ScatterMultiProcess.hpp | 68 -- 98 files changed, 3684 insertions(+), 5094 deletions(-) 
create mode 100644 test/AllGather_InPlace.cpp create mode 100644 test/AllGather_ManagedMem.cpp create mode 100644 test/AllGather_OutOfPlace.cpp create mode 100644 test/AllReduce_Clique.cpp create mode 100644 test/AllReduce_GroupCall.cpp create mode 100644 test/AllReduce_InPlace.cpp create mode 100644 test/AllReduce_ManagedMem.cpp create mode 100644 test/AllReduce_OutOfPlace.cpp create mode 100644 test/AllReduce_PreMultScalar.cpp create mode 100644 test/AllToAll_ManagedMem.cpp create mode 100644 test/AllToAll_OutOfPlace.cpp create mode 100644 test/Broadcast_InPlace.cpp create mode 100644 test/Broadcast_ManagedMem.cpp create mode 100644 test/Broadcast_OutOfPlace.cpp delete mode 100644 test/CorrectnessTest.hpp create mode 100644 test/Gather_InPlace.cpp create mode 100644 test/Gather_ManagedMem.cpp create mode 100644 test/Gather_OutOfPlace.cpp create mode 100644 test/ReduceScatter_InPlace.cpp create mode 100644 test/ReduceScatter_ManagedMem.cpp create mode 100644 test/ReduceScatter_OutOfPlace.cpp create mode 100644 test/Reduce_InPlace.cpp create mode 100644 test/Reduce_ManagedMem.cpp create mode 100644 test/Reduce_OutOfPlace.cpp create mode 100644 test/Scatter_InPlace.cpp create mode 100644 test/Scatter_ManagedMem.cpp create mode 100644 test/Scatter_OutOfPlace.cpp delete mode 100644 test/TestChecks.hpp create mode 100644 test/common/CollectiveArgs.cpp create mode 100644 test/common/CollectiveArgs.hpp create mode 100644 test/common/EnvVars.cpp create mode 100644 test/common/EnvVars.hpp create mode 100644 test/common/ErrCode.hpp create mode 100644 test/common/PrepDataFuncs.cpp create mode 100644 test/common/PrepDataFuncs.hpp create mode 100644 test/common/PtrUnion.cpp create mode 100644 test/common/PtrUnion.hpp create mode 100644 test/common/TestBed.cpp create mode 100644 test/common/TestBed.hpp create mode 100644 test/common/TestBedChild.cpp create mode 100644 test/common/TestBedChild.hpp create mode 100644 test/common/main.cpp delete mode 100644 test/test_AllGather.cpp 
delete mode 100644 test/test_AllGather.hpp delete mode 100644 test/test_AllGatherMultiProcess.cpp delete mode 100644 test/test_AllGatherMultiProcess.hpp delete mode 100644 test/test_AllReduce.cpp delete mode 100644 test/test_AllReduce.hpp delete mode 100644 test/test_AllReduceAbort.cpp delete mode 100644 test/test_AllReduceAbort.hpp delete mode 100644 test/test_AllReduceGroup.cpp delete mode 100644 test/test_AllReduceGroup.hpp delete mode 100644 test/test_AllReduceGroupMultiProcess.cpp delete mode 100644 test/test_AllReduceGroupMultiProcess.hpp delete mode 100644 test/test_AllReduceMultiProcess.cpp delete mode 100644 test/test_AllReduceMultiProcess.hpp delete mode 100644 test/test_AllToAll.cpp delete mode 100644 test/test_AllToAll.hpp delete mode 100644 test/test_AllToAllMultiProcess.cpp delete mode 100644 test/test_AllToAllMultiProcess.hpp delete mode 100644 test/test_AllToAllv.cpp delete mode 100644 test/test_AllToAllv.hpp delete mode 100644 test/test_Broadcast.cpp delete mode 100644 test/test_Broadcast.hpp delete mode 100644 test/test_BroadcastAbort.cpp delete mode 100644 test/test_BroadcastAbort.hpp delete mode 100644 test/test_BroadcastMultiProcess.cpp delete mode 100644 test/test_BroadcastMultiProcess.hpp delete mode 100644 test/test_CombinedCalls.cpp delete mode 100644 test/test_CombinedCalls.hpp delete mode 100644 test/test_CombinedCallsMultiProcess.cpp delete mode 100644 test/test_CombinedCallsMultiProcess.hpp delete mode 100644 test/test_Gather.cpp delete mode 100644 test/test_Gather.hpp delete mode 100644 test/test_GatherMultiProcess.cpp delete mode 100644 test/test_GatherMultiProcess.hpp delete mode 100644 test/test_GroupCalls.cpp delete mode 100644 test/test_GroupCalls.hpp delete mode 100644 test/test_GroupCallsMultiProcess.cpp delete mode 100644 test/test_GroupCallsMultiProcess.hpp delete mode 100644 test/test_Reduce.cpp delete mode 100644 test/test_Reduce.hpp delete mode 100644 test/test_ReduceMultiProcess.cpp delete mode 100644 
test/test_ReduceMultiProcess.hpp delete mode 100644 test/test_ReduceScatter.cpp delete mode 100644 test/test_ReduceScatter.hpp delete mode 100644 test/test_ReduceScatterMultiProcess.cpp delete mode 100644 test/test_ReduceScatterMultiProcess.hpp delete mode 100644 test/test_Scatter.cpp delete mode 100644 test/test_Scatter.hpp delete mode 100644 test/test_ScatterMultiProcess.cpp delete mode 100644 test/test_ScatterMultiProcess.hpp diff --git a/.jenkins/common.groovy b/.jenkins/common.groovy index c1cbfe96a8..941f141025 100644 --- a/.jenkins/common.groovy +++ b/.jenkins/common.groovy @@ -21,7 +21,7 @@ def runTestCommand (platform, project, gfilter) def command = """#!/usr/bin/env bash set -x cd ${project.paths.project_build_prefix}/build/release/test - ${sudo} NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 ./UnitTests --gtest_filter=${gfilter} --gtest_output=xml --gtest_color=yes + ${sudo} UT_SHOW_NAMES=1 HSA_FORCE_FINE_GRAIN_PCIE=1 ./UnitTests --gtest_filter=${gfilter} --gtest_output=xml --gtest_color=yes """ platform.runCommand(this, command) diff --git a/.jenkins/precheckin.groovy b/.jenkins/precheckin.groovy index 5f5eae0ba0..0483369dac 100644 --- a/.jenkins/precheckin.groovy +++ b/.jenkins/precheckin.groovy @@ -9,12 +9,12 @@ import com.amd.project.* import com.amd.docker.* import java.nio.file.Path -def runCI = +def runCI = { nodeDetails, jobName-> def prj = new rocProject('rccl', 'PreCheckin') - + prj.timeout.test = 1440 prj.paths.build_command = './install.sh -t ' @@ -32,25 +32,25 @@ def runCI = commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" commonGroovy.runCompileCommand(platform, project, jobName) } - + def testCommand = { platform, project-> - commonGroovy.runTestCommand(platform, project, "*sum_float32*") + commonGroovy.runTestCommand(platform, project, "*") } def packageCommand = { platform, project-> - + commonGroovy.runPackageCommand(platform, project, jobName) } buildProject(prj, formatCheck, nodes.dockerArray, 
compileCommand, testCommand, packageCommand) } -ci: { +ci: { String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]] @@ -58,17 +58,17 @@ ci: { propertyList = auxiliary.appendPropertyList(propertyList) def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([sles15sp1:['4gfx906'],centos8:['8gfx908'],centos7:['8gfx906'],ubuntu18:['4gfx906', '4gfx908']])] - + jobNameList = auxiliary.appendJobNameList(jobNameList) - - propertyList.each + + propertyList.each { jobName, property-> if (urlJobName == jobName) properties(auxiliary.addCommonProperties(property)) } - jobNameList.each + jobNameList.each { jobName, nodeDetails-> if (urlJobName == jobName) diff --git a/install.sh b/install.sh index 5caf182068..6de243c401 100755 --- a/install.sh +++ b/install.sh @@ -229,7 +229,7 @@ if ($run_tests); then if ($run_tests_all); then ./test/UnitTests else - ./test/UnitTests --gtest_filter="BroadcastCorrectnessSweep*:*float32*" + ./test/UnitTests --gtest_filter="AllReduce.*" fi else echo "Unit tests have not been built yet; please re-run script with -t to build unit tests." 
diff --git a/src/clique/CliqueManager.cc b/src/clique/CliqueManager.cc index 1a6560c4d4..11b910740c 100644 --- a/src/clique/CliqueManager.cc +++ b/src/clique/CliqueManager.cc @@ -96,6 +96,7 @@ void CliqueManager::CleanUp() if (m_cliqueMode == CLIQUE_SINGLE_NODE) { // Release caches + INFO(NCCL_COLL, "Rank %d deleting IPC caches", m_rank); if (m_ipcHandleSendCache) delete m_ipcHandleSendCache; if (m_ipcHandleRecvCache) delete m_ipcHandleRecvCache; @@ -494,18 +495,24 @@ ncclResult_t CliqueManager::CheckCacheForPtr(void* devPtr, uint64_t realAddr = (uint64_t)devPtr; handlePair->second = realAddr - baseAddr; + CUDACHECK(hipIpcGetMemHandle(&handlePair->first, (void*)baseAddr)); + + /* Disabling cache until proper deallocation methods are available // IPC handles are only supported for base address pointers NcclIpcHandleSendCache::iterator it = cache->find(baseAddr); if (it == cache->end()) { + INFO(NCCL_COLL, "Rank %d searching IPC handle cache for %p (not found)", rank, devPtr); CUDACHECK(hipIpcGetMemHandle(&handlePair->first, (void*)baseAddr)); cache->insert(baseAddr, handlePair->first); } else { + INFO(NCCL_COLL, "Rank %d searching IPC handle cache for %p (found!)", rank, devPtr); handlePair->first = (it->second).first; } + */ return ncclSuccess; } @@ -513,10 +520,16 @@ ncclResult_t CliqueManager::CheckCacheForHandle(std::pairfind(handlePair.first); // Get base address pointer from cache if it exists - void* baseAddr; + if (it == cache->end()) { CUDACHECK(hipIpcOpenMemHandle(&baseAddr, handlePair.first, hipIpcMemLazyEnablePeerAccess)); @@ -526,6 +539,7 @@ ncclResult_t CliqueManager::CheckCacheForHandle(std::pairsecond).first; } + */ // Modify base address pointer with offset uint64_t realAddr = (uint64_t)baseAddr + handlePair.second; diff --git a/src/clique/CliqueManager.h b/src/clique/CliqueManager.h index 2fbef06319..caf2bdc8bd 100644 --- a/src/clique/CliqueManager.h +++ b/src/clique/CliqueManager.h @@ -79,14 +79,14 @@ public: static ncclResult_t 
BootstrapRootInit(int pid, unsigned long hash); protected: - static ncclResult_t CheckCacheForPtr(void* devPtr, - NcclIpcHandleSendCache* cache, - int rank, - std::pair* handlePair); + ncclResult_t CheckCacheForPtr(void* devPtr, + NcclIpcHandleSendCache* cache, + int rank, + std::pair* handlePair); - static ncclResult_t CheckCacheForHandle(std::pair const& handlePair, - NcclIpcHandleRecvCache* cache, - void** ptr); + ncclResult_t CheckCacheForHandle(std::pair const& handlePair, + NcclIpcHandleRecvCache* cache, + void** ptr); int m_rank; // Associated rank int m_numRanks; // Total number of ranks diff --git a/test/AllGather_InPlace.cpp b/test/AllGather_InPlace.cpp new file mode 100644 index 0000000000..773554883f --- /dev/null +++ b/test/AllGather_InPlace.cpp @@ -0,0 +1,26 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ +#include "TestBed.hpp" + +namespace RcclUnitTesting +{ + TEST(AllGather, InPlace) + { + TestBed testBed; + + // Configuration + std::vector const funcTypes = {ncclCollAllGather}; + std::vector const dataTypes = {ncclInt8, ncclInt32, ncclInt64}; + std::vector const redOps = {ncclSum}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {true}; + std::vector const managedMemList = {false}; + + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.Finalize(); + } +} diff --git a/test/AllGather_ManagedMem.cpp b/test/AllGather_ManagedMem.cpp new file mode 100644 index 0000000000..8ff753d619 --- /dev/null +++ b/test/AllGather_ManagedMem.cpp @@ -0,0 +1,26 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. 
All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ +#include "TestBed.hpp" + +namespace RcclUnitTesting +{ + TEST(AllGather, ManagedMem) + { + TestBed testBed; + + // Configuration + std::vector const funcTypes = {ncclCollAllGather}; + std::vector const dataTypes = {ncclUint8, ncclUint32, ncclUint64}; + std::vector const redOps = {ncclSum}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {false}; + std::vector const managedMemList = {true}; + + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.Finalize(); + } +} diff --git a/test/AllGather_OutOfPlace.cpp b/test/AllGather_OutOfPlace.cpp new file mode 100644 index 0000000000..2dcc178683 --- /dev/null +++ b/test/AllGather_OutOfPlace.cpp @@ -0,0 +1,26 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ +#include "TestBed.hpp" + +namespace RcclUnitTesting +{ + TEST(AllGather, OutOfPlace) + { + TestBed testBed; + + // Configuration + std::vector const funcTypes = {ncclCollAllGather}; + std::vector const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16}; + std::vector const redOps = {ncclSum}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {false}; + std::vector const managedMemList = {false}; + + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.Finalize(); + } +} diff --git a/test/AllReduce_Clique.cpp b/test/AllReduce_Clique.cpp new file mode 100644 index 0000000000..59f53867c5 --- /dev/null +++ b/test/AllReduce_Clique.cpp @@ -0,0 +1,31 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ +#include "TestBed.hpp" +#include +namespace RcclUnitTesting +{ + TEST(AllReduce, Clique) + { + // Set clique env var prior to TestBed + setenv("RCCL_ENABLE_CLIQUE", "1", 1); + + TestBed testBed; + + // Configuration + std::vector const funcTypes = {ncclCollAllReduce}; + std::vector const dataTypes = testBed.GetAllSupportedDataTypes(); + std::vector const redOps = testBed.GetAllSupportedRedOps(); + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {false, true}; + std::vector const managedMemList = {false}; + + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.Finalize(); + + unsetenv("RCCL_ENABLE_CLIQUE"); + } +} diff --git a/test/AllReduce_GroupCall.cpp b/test/AllReduce_GroupCall.cpp new file mode 100644 index 0000000000..5b2ad88030 --- /dev/null +++ b/test/AllReduce_GroupCall.cpp @@ -0,0 +1,63 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ +#include "TestBed.hpp" + +namespace RcclUnitTesting +{ + TEST(AllReduce, GroupCall) + { + TestBed testBed; + + // Configuration + ncclFunc_t const funcType = ncclCollAllReduce; + std::vector const& dataTypes = {ncclFloat}; + std::vector const& redOps = {ncclSum}; + std::vector const numElements = {1048576, 53327, 1024}; + int const root = 0; + bool const inPlace = false; + bool const useManagedMem = false; + int const numCollPerGroup = numElements.size(); + + // This tests runs 3 collectives in the same group call + bool isCorrect = true; + for (int totalRanks = testBed.ev.minGpus; totalRanks <= testBed.ev.maxGpus && isCorrect; ++totalRanks) + for (int isMultiProcess = 0; isMultiProcess <= 1 && isCorrect; ++isMultiProcess) + { + // Test either single process all GPUs, or 1 process per GPU + int const numProcesses = isMultiProcess ? totalRanks : 1; + testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup); + + for (int redOpIdx = 0; redOpIdx < redOps.size() && isCorrect; ++redOpIdx) + for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx) + { + if (testBed.ev.showNames) + INFO("%s process %2d-ranks AllReduce %d Grouped Calls (%s-%s)\n", + isMultiProcess ? 
"Multi " : "Single", + totalRanks, numCollPerGroup, + ncclRedOpNames[redOps[redOpIdx]], ncclDataTypeNames[dataTypes[dataIdx]]); + + // Run all element sizes in parallel as single group + for (int collIdx = 0; collIdx < numCollPerGroup; ++collIdx) + { + testBed.SetCollectiveArgs(funcType, + dataTypes[dataIdx], + redOps[redOpIdx], + root, + numElements[collIdx], + numElements[collIdx], + collIdx); + } + testBed.AllocateMem(inPlace, useManagedMem); + testBed.PrepareData(); + testBed.ExecuteCollectives(); + testBed.ValidateResults(isCorrect); + testBed.DeallocateMem(); + } + testBed.DestroyComms(); + } + testBed.Finalize(); + } +} diff --git a/test/AllReduce_InPlace.cpp b/test/AllReduce_InPlace.cpp new file mode 100644 index 0000000000..4a3e61f4bf --- /dev/null +++ b/test/AllReduce_InPlace.cpp @@ -0,0 +1,26 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ +#include "TestBed.hpp" + +namespace RcclUnitTesting +{ + TEST(AllReduce, InPlace) + { + TestBed testBed; + + // Configuration + std::vector const funcTypes = {ncclCollAllReduce}; + std::vector const dataTypes = {ncclInt8, ncclInt32, ncclFloat32}; + std::vector const redOps = {ncclSum, ncclProd}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {true}; + std::vector const managedMemList = {false}; + + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.Finalize(); + } +} diff --git a/test/AllReduce_ManagedMem.cpp b/test/AllReduce_ManagedMem.cpp new file mode 100644 index 0000000000..f5019df88a --- /dev/null +++ b/test/AllReduce_ManagedMem.cpp @@ -0,0 +1,26 @@ +/************************************************************************* + * Copyright (c) 2022 
Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ +#include "TestBed.hpp" + +namespace RcclUnitTesting +{ + TEST(AllReduce, ManagedMem) + { + TestBed testBed; + + // Configuration + std::vector const funcTypes = {ncclCollAllReduce}; + std::vector const dataTypes = {ncclFloat32, ncclUint8, ncclUint64}; + std::vector const redOps = {ncclSum, ncclMax}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {false}; + std::vector const managedMemList = {true}; + + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.Finalize(); + } +} diff --git a/test/AllReduce_OutOfPlace.cpp b/test/AllReduce_OutOfPlace.cpp new file mode 100644 index 0000000000..660d5eb1b8 --- /dev/null +++ b/test/AllReduce_OutOfPlace.cpp @@ -0,0 +1,26 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ +#include "TestBed.hpp" + +namespace RcclUnitTesting +{ + TEST(AllReduce, OutOfPlace) + { + TestBed testBed; + + // Configuration + std::vector const funcTypes = {ncclCollAllReduce}; + std::vector const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16}; + std::vector const redOps = {ncclSum, ncclMin}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {false}; + std::vector const managedMemList = {false}; + + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.Finalize(); + } +} diff --git a/test/AllReduce_PreMultScalar.cpp b/test/AllReduce_PreMultScalar.cpp new file mode 100644 index 0000000000..2674fa7a5e --- /dev/null +++ b/test/AllReduce_PreMultScalar.cpp @@ -0,0 +1,74 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ +#include "TestBed.hpp" + +namespace RcclUnitTesting +{ + // This tests using custom pre-mult scalars reductions + TEST(AllReduce, PreMultScalar) + { + TestBed testBed; + + // Configuration + ncclFunc_t const funcType = ncclCollAllReduce; + std::vector const& dataTypes = {ncclInt32, ncclFloat32, ncclFloat64}; + ncclRedOp_t const redOp = ncclSum; + std::vector const numElements = {1048576, 1024}; + int const root = 0; + bool const inPlace = false; + bool const useManagedMem = false; + + // Terminate the test as soon as first failure occurs + bool isCorrect = true; + for (int totalRanks = testBed.ev.minGpus; totalRanks <= testBed.ev.maxGpus && isCorrect; ++totalRanks) + for (int isMultiProcess = 0; isMultiProcess <= 1; ++isMultiProcess) + { + int const numProcesses = isMultiProcess ? totalRanks : 1; + testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks)); + + for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx) + { + ncclDataType_t const dataType = dataTypes[dataIdx]; + + // Set scalars per rank + PtrUnion scalarsPerRank; + scalarsPerRank.AllocateCpuMem(totalRanks * DataTypeToBytes(dataType)); + for (int i = 0; i < totalRanks; i++) + { + double F = i; + scalarsPerRank.Set(dataType, i, i, F); + } + + // Test various scalar residence modes + for (int scalarMode = 0; scalarMode <= 1 && isCorrect; ++scalarMode) + { + if (testBed.ev.showNames) + INFO("%s process %2d-ranks AllReduce (custom-scalar Mode %d %s)\n", + isMultiProcess ? 
"Multi " : "Single", + totalRanks, scalarMode, ncclDataTypeNames[dataType]); + + for (int i = 0; i < numElements.size() && isCorrect; ++i) + { + testBed.SetCollectiveArgs(funcType, dataType, redOp, root, + numElements[i], numElements[i], + -1, -1, scalarsPerRank, scalarMode); + // For performance, only allocate and prepare data on largest size + if (i == 0) + { + testBed.AllocateMem(inPlace, useManagedMem); + testBed.PrepareData(); + } + testBed.ExecuteCollectives(); + testBed.ValidateResults(isCorrect); + } + testBed.DeallocateMem(); + } + } + testBed.DestroyComms(); + } + testBed.Finalize(); + } +} diff --git a/test/AllToAll_ManagedMem.cpp b/test/AllToAll_ManagedMem.cpp new file mode 100644 index 0000000000..041be9cf8a --- /dev/null +++ b/test/AllToAll_ManagedMem.cpp @@ -0,0 +1,26 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ +#include "TestBed.hpp" + +namespace RcclUnitTesting +{ + TEST(AllToAll, ManagedMem) + { + TestBed testBed; + + // Configuration + std::vector const funcTypes = {ncclCollAllToAll}; + std::vector const dataTypes = {ncclUint8, ncclUint32, ncclUint64}; + std::vector const redOps = {ncclSum}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {false}; + std::vector const managedMemList = {true}; + + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.Finalize(); + } +} diff --git a/test/AllToAll_OutOfPlace.cpp b/test/AllToAll_OutOfPlace.cpp new file mode 100644 index 0000000000..3b3153afae --- /dev/null +++ b/test/AllToAll_OutOfPlace.cpp @@ -0,0 +1,26 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. 
All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ +#include "TestBed.hpp" + +namespace RcclUnitTesting +{ + TEST(AllToAll, OutOfPlace) + { + TestBed testBed; + + // Configuration + std::vector const funcTypes = {ncclCollAllToAll}; + std::vector const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16}; + std::vector const redOps = {ncclSum}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {false}; + std::vector const managedMemList = {false}; + + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.Finalize(); + } +} diff --git a/test/Broadcast_InPlace.cpp b/test/Broadcast_InPlace.cpp new file mode 100644 index 0000000000..e10dd961e4 --- /dev/null +++ b/test/Broadcast_InPlace.cpp @@ -0,0 +1,26 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ +#include "TestBed.hpp" + +namespace RcclUnitTesting +{ + TEST(Broadcast, InPlace) + { + TestBed testBed; + + // Configuration + std::vector const funcTypes = {ncclCollBroadcast}; + std::vector const dataTypes = {ncclInt8, ncclInt32, ncclInt64}; + std::vector const redOps = {ncclSum}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {true}; + std::vector const managedMemList = {false}; + + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.Finalize(); + } +} diff --git a/test/Broadcast_ManagedMem.cpp b/test/Broadcast_ManagedMem.cpp new file mode 100644 index 0000000000..1f6695c102 --- /dev/null +++ b/test/Broadcast_ManagedMem.cpp @@ -0,0 +1,26 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ +#include "TestBed.hpp" + +namespace RcclUnitTesting +{ + TEST(Broadcast, ManagedMem) + { + TestBed testBed; + + // Configuration + std::vector const funcTypes = {ncclCollBroadcast}; + std::vector const dataTypes = {ncclUint8, ncclUint32, ncclUint64}; + std::vector const redOps = {ncclSum}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {false}; + std::vector const managedMemList = {true}; + + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.Finalize(); + } +} diff --git a/test/Broadcast_OutOfPlace.cpp b/test/Broadcast_OutOfPlace.cpp new file mode 100644 index 0000000000..ffb01fff55 --- /dev/null +++ b/test/Broadcast_OutOfPlace.cpp @@ -0,0 +1,26 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ +#include "TestBed.hpp" + +namespace RcclUnitTesting +{ + TEST(Broadcast, OutOfPlace) + { + TestBed testBed; + + // Configuration + std::vector const funcTypes = {ncclCollBroadcast}; + std::vector const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16}; + std::vector const redOps = {ncclSum}; + std::vector const roots = {1}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {false}; + std::vector const managedMemList = {false}; + + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.Finalize(); + } +} diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 7da42fb9e8..06e1f63728 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -3,87 +3,103 @@ cmake_minimum_required(VERSION 2.8.12) if(BUILD_TESTS) - message("Going to build unit tests (Installed in /test/UnitTests)") + message("Building unit tests (Installed in /test/UnitTests)") find_program(CHRPATH chrpath) if(NOT CHRPATH) message(FATAL_ERROR "chrpath is required for UnitTests. Please install (e.g. 
sudo apt-get install chrpath)") endif() - include_directories(${GTEST_INCLUDE_DIRS}) + find_package(hsa-runtime64 PATHS /opt/rocm ) + if(${hsa-runtime64_FOUND}) + message("hsa-runtime64 found @ ${hsa-runtime64_DIR} ") + else() + message("find_package did NOT find hsa-runtime64, finding it the OLD Way") + message("Looking for header files in ${ROCR_INC_DIR}") + message("Looking for library files in ${ROCR_LIB_DIR}") + # Search for ROCr header file in user defined locations + find_path(ROCR_HDR hsa.h PATHS ${ROCR_INC_DIR} "/opt/rocm" PATH_SUFFIXES include/hsa REQUIRED) + INCLUDE_DIRECTORIES(${ROCR_HDR}) + + # Search for ROCr library file in user defined locations + find_library(ROCR_LIB ${CORE_RUNTIME_TARGET} PATHS ${ROCR_LIB_DIR} "/opt/rocm" PATH_SUFFIXES lib lib64 REQUIRED) + endif() + + include_directories(${GTEST_INCLUDE_DIRS} ./common) + + # Collect testing framework source files + set (COMMON_SOURCE_FILES + common/main.cpp + common/CollectiveArgs.cpp + common/EnvVars.cpp + common/PrepDataFuncs.cpp + common/PtrUnion.cpp + common/TestBed.cpp + common/TestBedChild.cpp + ) + + # Collect source files for tests if(BUILD_ALLREDUCE_ONLY) - set(TEST_SOURCES_SINGLE_PROCESS - test_AllReduce.cpp - test_AllReduceAbort.cpp - test_AllReduceGroup.cpp + set(TEST_SOURCE_FILES + AllReduce_Clique.cpp + AllReduce_GroupCall.cpp + AllReduce_InPlace.cpp + AllReduce_ManagedMem.cpp + AllReduce_OutOfPlace.cpp + AllReduce_PreMultScalar.cpp ) else() - # Collect source files for tests - set(TEST_SOURCES_SINGLE_PROCESS - test_AllGather.cpp - test_AllReduce.cpp - test_AllReduceGroup.cpp - test_Broadcast.cpp - test_Reduce.cpp - test_ReduceScatter.cpp - test_GroupCalls.cpp - test_CombinedCalls.cpp - test_AllReduceAbort.cpp - test_BroadcastAbort.cpp - test_Scatter.cpp - test_Gather.cpp - test_AllToAll.cpp - test_AllToAllv.cpp - ) + set(TEST_SOURCE_FILES + #AllReduce + AllReduce_Clique.cpp + AllReduce_GroupCall.cpp + AllReduce_InPlace.cpp + AllReduce_ManagedMem.cpp + AllReduce_OutOfPlace.cpp + 
AllReduce_PreMultScalar.cpp + #AllGather + AllGather_InPlace.cpp + AllGather_ManagedMem.cpp + AllGather_OutOfPlace.cpp + #AllToAll + AllToAll_OutOfPlace.cpp + AllToAll_ManagedMem.cpp + #Broadcast + Broadcast_InPlace.cpp + Broadcast_ManagedMem.cpp + Broadcast_OutOfPlace.cpp + #Reduce + Reduce_InPlace.cpp + Reduce_ManagedMem.cpp + Reduce_OutOfPlace.cpp + #ReduceScatter + ReduceScatter_InPlace.cpp + ReduceScatter_ManagedMem.cpp + ReduceScatter_OutOfPlace.cpp + #Scatter + Scatter_InPlace.cpp + Scatter_ManagedMem.cpp + Scatter_OutOfPlace.cpp + #Gather + Gather_InPlace.cpp + Gather_ManagedMem.cpp + Gather_OutOfPlace.cpp + ) endif() - if(BUILD_ALLREDUCE_ONLY) - set(TEST_SOURCES_MULTI_PROCESS - test_AllReduceMultiProcess.cpp - test_AllReduceGroupMultiProcess.cpp - ) - else() - set(TEST_SOURCES_MULTI_PROCESS - test_AllGatherMultiProcess.cpp - test_AllReduceMultiProcess.cpp - test_AllReduceGroupMultiProcess.cpp - test_AllToAllMultiProcess.cpp - test_BroadcastMultiProcess.cpp - test_CombinedCallsMultiProcess.cpp - test_GatherMultiProcess.cpp - test_GroupCallsMultiProcess.cpp - test_ReduceMultiProcess.cpp - test_ReduceScatterMultiProcess.cpp - test_ScatterMultiProcess.cpp - ) - endif() - add_executable(UnitTests ${TEST_SOURCES_SINGLE_PROCESS}) + + add_executable(UnitTests ${COMMON_SOURCE_FILES} ${TEST_SOURCE_FILES}) target_include_directories(UnitTests PRIVATE ${ROCM_PATH} ${GTEST_INCLUDE_DIRS}) target_link_libraries(UnitTests PRIVATE ${GTEST_BOTH_LIBRARIES}) - target_link_libraries(UnitTests PRIVATE hip::host hip::device) - - add_executable(UnitTestsMultiProcess ${TEST_SOURCES_MULTI_PROCESS}) - target_include_directories(UnitTestsMultiProcess PRIVATE ${ROCM_PATH} ${GTEST_INCLUDE_DIRS}) - target_link_libraries(UnitTestsMultiProcess PRIVATE ${GTEST_BOTH_LIBRARIES}) - target_link_libraries(UnitTestsMultiProcess PRIVATE hip::host hip::device) - - find_program( rocminfo_executable rocminfo ) - execute_process(COMMAND bash "-c" "${rocminfo_executable} | grep 'Device Type' | grep 
GPU | wc -l | tr -d '\n'" OUTPUT_VARIABLE gtest_num_gpus) - if(${gtest_num_gpus} EQUAL "0" OR ${gtest_num_gpus} EQUAL "1") - set(gtest_num_gpus "2") - endif() - target_compile_options(UnitTests PRIVATE -DGTESTS_NUM_GPUS=${gtest_num_gpus}) + target_link_libraries(UnitTests PRIVATE hip::host hip::device hsa-runtime64::hsa-runtime64) # UnitTests using static library of rccl requires passing rccl # through -l and -L instead of command line input. if(BUILD_STATIC) add_dependencies(UnitTests rccl) target_link_libraries(UnitTests PRIVATE dl rt numa -lrccl -L${CMAKE_BINARY_DIR} -lrocm_smi64 -L${ROCM_PATH}/rocm_smi/lib) - add_dependencies(UnitTestsMultiProcess rccl) - target_link_libraries(UnitTestsMultiProcess PRIVATE dl rt numa -lrccl -L${CMAKE_BINARY_DIR} -lrocm_smi64 -L${ROCM_PATH}/rocm_smi/lib) else() target_link_libraries(UnitTests PRIVATE rccl) - target_link_libraries(UnitTestsMultiProcess PRIVATE rt rccl) endif() # HIPCC adds /opt/rocm/lib as RPATH, even though the install process is supposed to # remove RPATH. It also occurs before any user-specified rpath, which effectively overrides the user rpath. 
@@ -91,10 +107,8 @@ if(BUILD_TESTS) if (CMAKE_INSTALL_PREFIX MATCHES "${ROCM_PATH}") # install_prefix/CMAKE_INSTALL_PREFIX was not explicitly specified, so look in build/release add_custom_command( TARGET UnitTests POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_BINARY_DIR}:${ROCM_PATH}/lib ${CMAKE_BINARY_DIR}/test/UnitTests) - add_custom_command( TARGET UnitTestsMultiProcess POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_BINARY_DIR}:${ROCM_PATH}/lib ${CMAKE_BINARY_DIR}/test/UnitTestsMultiProcess) else() add_custom_command( TARGET UnitTests POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_INSTALL_PREFIX}/lib:${ROCM_PATH}/lib ${CMAKE_INSTALL_PREFIX}/test/UnitTests) - add_custom_command( TARGET UnitTestsMultiProcess POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_INSTALL_PREFIX}/lib:${ROCM_PATH}/lib ${CMAKE_INSTALL_PREFIX}/test/UnitTestsMultiProcess) endif() else() message("Not building unit tests") diff --git a/test/CorrectnessTest.hpp b/test/CorrectnessTest.hpp deleted file mode 100644 index 7bd6dbe0e2..0000000000 --- a/test/CorrectnessTest.hpp +++ /dev/null @@ -1,1200 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ -#ifndef CORRECTNESSTEST_HPP -#define CORRECTNESSTEST_HPP - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "rccl.h" -#include "../include/rccl_bfloat16.h" - -#include "TestChecks.hpp" - -#define MAX_ENV_TOKENS 16 - -namespace CorrectnessTests -{ - typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollGather, ncclCollScatter, ncclCollAllToAll, ncclCollSendRecv } ncclFunc_t; - typedef enum { ncclInputBuffer, ncclOutputBuffer } ncclBufferType_t; - - // Performs the various basic reduction operations - template - T ReduceOp(ncclRedOp_t const op, T const A, T const B) - { - switch (op) - { - case ncclSum: return A + B; - case ncclProd: return A * B; - case ncclMax: return std::max(A, B); - case ncclMin: return std::min(A, B); - default: - fprintf(stderr, "[ERROR] Unsupported reduction operator (%d)\n", op); - exit(0); - } - } - - // Returns the number of bytes per element for each supported datatype - static int DataTypeToBytes(ncclDataType_t const dataType) - { - switch (dataType) - { - case ncclInt8: return 1; - case ncclUint8: return 1; - case ncclInt32: return 4; - case ncclUint32: return 4; - case ncclInt64: return 8; - case ncclUint64: return 8; - case ncclFloat16: return 2; - case ncclFloat32: return 4; - case ncclFloat64: return 8; - case ncclBfloat16: return 2; - default: - fprintf(stderr, "[ERROR] Unsupported datatype (%d)\n", dataType); - exit(0); - } - } - - // Encapsulates all the memory used per devices for collectives, as well as reference results - struct Dataset - { - int numDevices; // Number of devices participating - size_t numElements; // Number of elements per array - ncclDataType_t dataType; // Data type of each input/output pointer - 
bool inPlace; // Whether or not output pointers are same as input pointers - ncclFunc_t function; // Buffer sizes are different in case of gather, scatter and all to all - - std::vector inputs; // Input pointers (1 per device) - std::vector outputs; // Output pointers (1 per device) - // May be identical to input pointers for in-place tests - std::vector expected; // Expected output (1 per device) - - size_t NumBytes() const - { - return numElements * DataTypeToBytes(dataType); - } - - size_t NumBytes(ncclBufferType_t bufferType) const - { - if ((function == ncclCollGather && (bufferType == ncclOutputBuffer || inPlace == true)) || - (function == ncclCollScatter && bufferType == ncclInputBuffer) || - function == ncclCollAllToAll) - return numElements * DataTypeToBytes(dataType) * numDevices; - return numElements * DataTypeToBytes(dataType); - } - - // Checks if the current HIP Runtime and GPU support managed memory - bool SupportsHmm() - { - hipDeviceProp_t device_prop; - int device_id; - hipGetDevice(&device_id); - hipGetDeviceProperties(&device_prop, device_id); - if (device_prop.managedMemory == 1) return true; - - return false; - } - - // Check if user has opted-in to use managed memory - static bool UseHmm() - { - if (getenv("RCCL_USE_HMM") == nullptr) - { - return false; - } - - if (strcmp(getenv("RCCL_USE_HMM"), "1") == 0) - { - return true; - } - return false; - } - - // Helper for HMM allocations: if device supports managedMemory, and HMM is requested through - // RCCL_USE_HMM environment variable - template - hipError_t hipMallocHelper(T** devPtr, size_t size) - { - if (SupportsHmm() && UseHmm()) - { - return hipMallocManaged((void**)devPtr, size); - } - else - { - return hipMalloc((void**)devPtr, size); - } - return hipSuccess; - } - - // To be used in multi-process tests, in the parent process before forking children. 
- void InitializeRootProcess(int const numDevices_, - size_t const numElements_, - ncclDataType_t const dataType_, - bool const inPlace_, - ncclFunc_t const func_ = ncclCollBroadcast) - { - numDevices = numDevices_; - numElements = numElements_; - dataType = dataType_; - inPlace = inPlace_; - function = func_; - - inputs.resize(numDevices); - outputs.resize(numDevices); - expected.resize(numDevices); - - for (int i = 0; i < numDevices_; i++) - { - inputs[i] = (void*)mmap(NULL, sizeof(void*), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0); - } - for (int i = 0; i < numDevices_; i++) - { - outputs[i] = (void*)mmap(NULL, sizeof(void*), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0); - } - for (int i = 0; i < numDevices_; i++) - { - expected[i] = (void*)mmap(NULL, NumBytes(ncclOutputBuffer), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0); - } - } - - void Initialize(int const numDevices_, - size_t const numElements_, - ncclDataType_t const dataType_, - bool const inPlace_, - ncclFunc_t const func_ = ncclCollBroadcast, - int const multiProcessRank_ = -1) - { - numDevices = numDevices_; - numElements = numElements_; - dataType = dataType_; - inPlace = inPlace_; - function = func_; - - if (multiProcessRank_ == -1) - { - inputs.resize(numDevices); - outputs.resize(numDevices); - expected.resize(numDevices); - } - - // Allocate per-device memory - if (multiProcessRank_ > -1) - { - HIP_CALL(hipSetDevice(multiProcessRank_)); - HIP_CALL(hipMallocHelper((void **)&inputs[multiProcessRank_], NumBytes(ncclInputBuffer))); - if (inPlace) - outputs[multiProcessRank_] = inputs[multiProcessRank_]; - else - HIP_CALL(hipMallocHelper((void **)&outputs[multiProcessRank_], NumBytes(ncclOutputBuffer))); - } - else - { - for (int i = 0; i < numDevices; i++) - { - HIP_CALL(hipSetDevice(i)); - HIP_CALL(hipMallocHelper((void **)&inputs[i], NumBytes(ncclInputBuffer))); - if (inPlace) - outputs[i] = inputs[i]; - else - HIP_CALL(hipMallocHelper((void **)&outputs[i], 
NumBytes(ncclOutputBuffer))); - - expected[i] = malloc(NumBytes(ncclOutputBuffer)); - } - } - - } - - // Explicit memory release to avoid double-free from subDatasets - void Release() - { - for (int i = 0; i < numDevices; i++) - { - if (!inPlace) hipFree(outputs[i]); - hipFree(inputs[i]); - free(expected[i]); - } - - outputs.clear(); - } - - // Multi-process version of Release() where each process frees its own data - void Release(int rank) - { - if (!inPlace) hipFree(outputs[rank]); - hipFree(inputs[rank]); - } - - void ReleaseRootProcess() - { - for (int i = 0; i < numDevices; i++) - { - munmap(inputs[i], sizeof(void*)); - munmap(outputs[i], sizeof(void*)); - munmap(expected[i], NumBytes(ncclOutputBuffer)); - } - inputs.clear(); - outputs.clear(); - expected.clear(); - } - - // Creates a dataset by pointing to an existing dataset - // Primarily to allow for testing with different starting byte-alignments - void ExtractSubDataset(size_t const startElement, - size_t const lastElement, - Dataset& subDataset, - int const multiProcessRank = -1) - { - ASSERT_LE(startElement, lastElement); - ASSERT_LT(lastElement, numElements); - - subDataset.numDevices = numDevices; - subDataset.numElements = lastElement - startElement + 1; - subDataset.dataType = dataType; - subDataset.inPlace = inPlace; - subDataset.function = function; - - subDataset.inputs.resize(numDevices); - subDataset.outputs.resize(numDevices); - subDataset.expected.resize(numDevices); - - size_t const byteOffset = (startElement * DataTypeToBytes(dataType)); - if (multiProcessRank != -1) - { - subDataset.inputs[multiProcessRank] = (int8_t *)inputs[multiProcessRank] + byteOffset; - subDataset.outputs[multiProcessRank] = (int8_t *)outputs[multiProcessRank] + byteOffset; - subDataset.expected[multiProcessRank] = (int8_t *)expected[multiProcessRank] + byteOffset; - } - else - { - for (int i = 0; i < numDevices; i++) - { - subDataset.inputs[i] = (int8_t *)inputs[i] + byteOffset; - subDataset.outputs[i] = (int8_t 
*)outputs[i] + byteOffset; - subDataset.expected[i] = (int8_t *)expected[i] + byteOffset; - } - } - } - }; - - class Barrier - { - public: - Barrier(){}; - - Barrier(int rank, int numRanks, int uniqueId) - { - this->numRanks = numRanks; - std::string uniqueIdString = std::to_string(uniqueId); - mutexName = std::string("mutex").append(uniqueIdString); - turnstile1Name = std::string("turnstile1").append(uniqueIdString); - turnstile2Name = std::string("turnstile2").append(uniqueIdString); - counterName = std::string("counter").append(uniqueIdString); - tinyBarrierName = std::string("tinyBarrier").append(uniqueIdString); - - size_t smSize = sizeof(sem_t); - - if (rank == 0) - { - NCCLCHECK_BARRIER_TEST(InitSemaphore(smSize, mutexName, 1, mutex), "InitSemaphore", rank); - NCCLCHECK_BARRIER_TEST(InitSemaphore(smSize, turnstile1Name, 0, turnstile1), "InitSemaphore", rank); - NCCLCHECK_BARRIER_TEST(InitSemaphore(smSize, turnstile2Name, 0, turnstile2), "InitSemaphore", rank); - NCCLCHECK_BARRIER_TEST(OpenSharedMemoryVariable(sizeof(int), counterName, true, counter), "OpenSharedMemoryVariable", rank); - NCCLCHECK_BARRIER_TEST(OpenSharedMemoryVariable(smSize, tinyBarrierName, true, tinyBarrier), "OpenSharedMemoryVariable", rank); - } - else - { - NCCLCHECK_BARRIER_TEST(OpenSharedMemoryVariable(smSize, tinyBarrierName, false, tinyBarrier), "OpenSharedMemoryVariable", rank); - NCCLCHECK_BARRIER_TEST(OpenSemaphore(smSize, mutexName, mutex), "OpenSemaphore", rank); - NCCLCHECK_BARRIER_TEST(OpenSemaphore(smSize, turnstile1Name, turnstile1), "OpenSemaphore", rank); - NCCLCHECK_BARRIER_TEST(OpenSemaphore(smSize, turnstile2Name, turnstile2), "OpenSemaphore", rank); - NCCLCHECK_BARRIER_TEST(OpenSharedMemoryVariable(sizeof(int), counterName, false, counter), "OpenSharedMemoryVariable", rank); - } - ncclResult_t res = Wait(20); - if (res != ncclSuccess) - { - printf("Rank %d timed out during Barrier initialization.\n", rank); - } - ClearShmFiles(uniqueId); - } - - // Wait with no 
timeout - void Wait() - { - Part1(); - Part2(); - } - - // Wait with timeout option - ncclResult_t Wait(int timeoutSecs) - { - NCCLCHECK_TEST(Part1(timeoutSecs), "Part 1 of Barrier Wait"); - NCCLCHECK_TEST(Part2(timeoutSecs), "Part 2 of Barrier Wait"); - - return ncclSuccess; - } - - ~Barrier() - { - size_t smSize = sizeof(sem_t); - munmap(mutex, smSize); - munmap(turnstile1, smSize); - munmap(turnstile2, smSize); - munmap(tinyBarrier, smSize); - munmap(counter, sizeof(int)); - } - - static void ClearShmFiles(int uniqueId) - { - std::string uniqueIdString = std::to_string(uniqueId); - std::vector names; - names.push_back(std::string("mutex").append(uniqueIdString)); - names.push_back(std::string("turnstile1").append(uniqueIdString)); - names.push_back(std::string("turnstile2").append(uniqueIdString)); - names.push_back(std::string("counter").append(uniqueIdString)); - names.push_back(std::string("tinyBarrier").append(uniqueIdString)); - - std::string shmDir = "/dev/shm/"; - for (auto it = names.begin(); it != names.end(); it++) - { - struct stat fileStatus; - std::string shmFullPath = shmDir + *it; - - // Check if shm file already exists; if so, unlink it - if (stat(shmFullPath.c_str(), &fileStatus) == 0) - { - shm_unlink(it->c_str()); - } - } - } - private: - template - ncclResult_t OpenSharedMemoryVariable(size_t size, std::string name, bool create, T& val) - { - int protection = PROT_READ | PROT_WRITE; - int visibility = MAP_SHARED; - int fd; - - std::string msg_open("shm_open "); - msg_open.append(name); - if (create) - { - SYSCHECKVAL_TEST(shm_open(name.c_str(), O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), msg_open.c_str(), fd); - SYSCHECK_GOTO_TEST(ftruncate(fd, size), "ftruncate", dropback); - } - else - { - do - { - fd = shm_open(name.c_str(), O_RDWR, S_IRUSR | S_IWUSR); - } while (fd == -1 && errno == ENOENT); - if (fd == -1 && errno != ENOENT) - { - printf("Call to %s failed: %s\n", msg_open.c_str(), strerror(errno)); - return ncclSystemError; - } - } - val = 
(T)mmap(NULL, size, protection, visibility, fd, 0); - close(fd); - if (val == MAP_FAILED) - { - goto dropback; - } - - return ncclSuccess; -dropback: - std::string msg_unlink("shm_unlink "); - msg_unlink.append(name); - SYSCHECK_TEST(shm_unlink(name.c_str()), "shm_unlink"); - return ncclSystemError; - } - - ncclResult_t InitSemaphore(size_t size, std::string name, int semValue, sem_t*& semaphore) - { - ncclResult_t res = OpenSharedMemoryVariable(size, name, true, semaphore); - std::string msg_init("sem_init "); - msg_init.append(name); - SYSCHECK_TEST(sem_init(semaphore, 1, semValue), "sem_init"); - - return res; - } - - ncclResult_t OpenSemaphore(size_t size, std::string name, sem_t*& semaphore) - { - return OpenSharedMemoryVariable(size, name, false, semaphore); - } - - void Part1() - { - sem_wait(mutex); - if (++(*counter) == numRanks) - { - sem_post_batch(turnstile1, numRanks); - } - sem_post(mutex); - sem_wait(turnstile1); - } - - void Part2() - { - sem_wait(mutex); - if (--(*counter) == 0) - { - sem_post_batch(turnstile2, numRanks); - } - sem_post(mutex); - sem_wait(turnstile2); - } - - ncclResult_t Part1(int timeoutSecs) - { - struct timespec ts; - SYSCHECK_TEST(clock_gettime(CLOCK_REALTIME, &ts), "clock_gettime 1"); - ts.tv_sec += timeoutSecs; - - SYSCHECK_TEST(sem_timedwait(mutex, &ts), "sem_timedwait 1-1"); - if (++(*counter) == numRanks) - { - SYSCHECK_TEST(sem_post_batch(turnstile1, numRanks), "sem_post_batch 1"); - } - SYSCHECK_TEST(sem_post(mutex), "sem_post 1"); - SYSCHECK_TEST(sem_timedwait(turnstile1, &ts), "sem_timedwait 1-2"); - - return ncclSuccess; - } - - ncclResult_t Part2(int timeoutSecs) - { - struct timespec ts; - SYSCHECK_TEST(clock_gettime(CLOCK_REALTIME, &ts), "clock_gettime 2"); - ts.tv_sec += timeoutSecs; - - SYSCHECK_TEST(sem_timedwait(mutex, &ts), "sem_timedwait 2"); - if (--(*counter) == 0) - { - SYSCHECK_TEST(sem_post_batch(turnstile2, numRanks), "sem_post_batch 2"); - } - SYSCHECK_TEST(sem_post(mutex), "sem_post 2"); - 
SYSCHECK_TEST(sem_timedwait(turnstile2, &ts), "sem_timedwait 2-2"); - - return ncclSuccess; - } - - int sem_post_batch(sem_t*& sem, int n) - { - int ret = 0; - for (int i = 0; i < n; i++) - { - ret = sem_post(sem); - if (ret != 0) break; - } - - return ret; - } - int numRanks; - - int* counter; - - sem_t* mutex; - sem_t* turnstile1; - sem_t* turnstile2; - sem_t* tinyBarrier; - - std::string mutexName; - std::string turnstile1Name; - std::string turnstile2Name; - std::string tinyBarrierName; - std::string counterName; - }; - - typedef std::tuple TestTuple; - - // Base class for each collective test - // - Each test is instantiated with a different TestTuple - class CorrectnessTest : public testing::TestWithParam - { - public: - struct PrintToStringParamName - { - std::string operator()(const testing::TestParamInfo& info) - { - std::string name; - - name += opStrings[std::get<0>(info.param)] + "_"; - name += dataTypeStrings[std::get<1>(info.param)] + "_"; - name += std::to_string(std::get<2>(info.param)) + "elements_"; - name += std::to_string(std::get<3>(info.param)) + "devices_"; - name += std::get<4>(info.param) == true ? 
"inplace_" : "outofplace_"; - std::string envVars = std::string(std::get<5>(info.param)); - std::replace(envVars.begin(), envVars.end(), '=', '_'); - name += envVars; - - return name; - } - - std::map opStrings - { - {ncclSum, "sum"}, - {ncclProd, "prod"}, - {ncclMax, "max"}, - {ncclMin, "min"}, - {ncclAvg, "avg"} - }; - std::map dataTypeStrings - { - {ncclInt8, "int8"}, - {ncclChar, "char"}, - {ncclUint8, "uint8"}, - {ncclInt32, "int32"}, - {ncclInt, "int"}, - {ncclUint32, "uint32"}, - {ncclInt64, "int64"}, - {ncclUint64, "uint64"}, - {ncclFloat16, "float16"}, - {ncclHalf, "half"}, - {ncclFloat32, "float32"}, - {ncclFloat64, "float64"}, - {ncclDouble, "double"}, - {ncclBfloat16, "bfloat16"} - }; - }; - protected: - // This code is called per test-tuple - void SetUp() override - { - // Make the test tuple parameters accessible - std::tie(op, dataType, numElements, numDevices, inPlace, envVals) = GetParam(); - - // Collect the number of available GPUs - HIP_CALL(hipGetDeviceCount(&numDevicesAvailable)); - - // Only proceed with testing if there are enough GPUs - if (numDevices > numDevicesAvailable) - { - fprintf(stdout, "[ SKIPPED ] Test requires %d devices (only %d available)\n", - numDevices, numDevicesAvailable); - GTEST_SKIP(); - } - - bool enableClique = false; - envString = 0; - numTokens = 0; - setenv("RCCL_TEST_ENV_VARS", "ENABLE", 1); - if (strcmp(envVals, "")) { - // enable RCCL env vars testing - envString = strdup(envVals); - tokens[numTokens] = strtok(envString, "=, "); - numTokens++; - while (tokens[numTokens-1] != NULL && numTokens < MAX_ENV_TOKENS) - tokens[numTokens++] = strtok(NULL, "=, "); - for (int i = 0; i < numTokens/2; i++) { - char *val = getenv(tokens[i*2]); - if (val) - savedEnv[i] = strdup(val); - else - savedEnv[i] = 0; - setenv(tokens[i*2], tokens[i*2+1], 1); - fprintf(stdout, "[ ] setting environmental variable %s to %s\n", tokens[i*2], getenv(tokens[i*2])); - if (strcmp(tokens[i*2], "RCCL_ENABLE_CLIQUE") == 0) - { - if 
(strcmp(getenv(tokens[i*2]), "1") == 0) - { - enableClique = true; - } - } - } - } - - if (Dataset::UseHmm() && enableClique) - { - fprintf(stdout, "[ SKIPPED ] Clique mode and unified memory together not supported\n"); - GTEST_SKIP(); - } - - // Initialize communicators - comms.resize(numDevices); - NCCL_CALL(ncclCommInitAll(comms.data(), numDevices, NULL)); - - // Create streams - streams.resize(numDevices); - for (int i = 0; i < numDevices; i++) - { - HIP_CALL(hipSetDevice(i)); - HIP_CALL(hipStreamCreate(&streams[i])); - } - } - - // Clean up per TestTuple - void TearDown() override - { - if (IsSkipped()) return; - - // Release communicators and streams - for (int i = 0; i < numDevices; i++) - { - NCCL_CALL(ncclCommDestroy(comms[i])); - HIP_CALL(hipStreamDestroy(streams[i])); - } - // Restore env vars after tests - for (int i = 0; i < numTokens/2; i++) { - if (savedEnv[i]) { - setenv(tokens[i*2], savedEnv[i], 1); - fprintf(stdout, "[ ] restored environmental variable %s to %s\n", tokens[i*2], getenv(tokens[i*2])); - free(savedEnv[i]); - } - else { - unsetenv(tokens[i*2]); - fprintf(stdout, "[ ] removed environmental variable %s\n", tokens[i*2]); - } - } - // Cleanup - unsetenv("RCCL_TEST_ENV_VARS"); - free(envString); - } - - void FillDatasetWithPattern(Dataset& dataset) - { - int8_t* arrayI1 = (int8_t *)malloc(dataset.NumBytes(ncclInputBuffer)); - uint8_t* arrayU1 = (uint8_t *)arrayI1; - int32_t* arrayI4 = (int32_t *)arrayI1; - uint32_t* arrayU4 = (uint32_t *)arrayI1; - int64_t* arrayI8 = (int64_t *)arrayI1; - uint64_t* arrayU8 = (uint64_t *)arrayI1; - float* arrayF4 = (float *)arrayI1; - double* arrayF8 = (double *)arrayI1; - rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1; - - // NOTE: Currently half-precision float tests are unsupported due to half being supported - // on GPU only and not host - - // Fills input data[i][j] with (i + j) % 256 - // - Keeping range small to reduce likelihood of overflow - // - Sticking with floating points values that are 
perfectly representable - for (int i = 0; i < dataset.numDevices; i++) - { - for (int j = 0; j < dataset.NumBytes(ncclInputBuffer)/DataTypeToBytes(dataset.dataType); j++) - { - int valueI = (i + j) % 256; - double valueF = 1.0L/((double)valueI+1.0L); - - switch (dataset.dataType) - { - case ncclInt8: arrayI1[j] = valueI; break; - case ncclUint8: arrayU1[j] = valueI; break; - case ncclInt32: arrayI4[j] = valueI; break; - case ncclUint32: arrayU4[j] = valueI; break; - case ncclInt64: arrayI8[j] = valueI; break; - case ncclUint64: arrayU8[j] = valueI; break; - case ncclFloat32: arrayF4[j] = valueF; break; - case ncclFloat64: arrayF8[j] = valueF; break; - case ncclBfloat16: arrayB2[j] = rccl_bfloat16(valueF); break; - default: - fprintf(stderr, "[ERROR] Unsupported datatype\n"); - exit(0); - } - } - - HIP_CALL(hipSetDevice(i)); - HIP_CALL(hipMemcpy(dataset.inputs[i], arrayI1, dataset.NumBytes(ncclInputBuffer), hipMemcpyHostToDevice)); - - // Fills output data[i][j] with 0 (if not inplace) - if (!dataset.inPlace) - HIP_CALL(hipMemset(dataset.outputs[i], 0, dataset.NumBytes(ncclOutputBuffer))); - } - - free(arrayI1); - } - - void Synchronize() const - { - // Wait for reduction to complete - for (int i = 0; i < numDevices; i++) - { - HIP_CALL(hipSetDevice(i)); - HIP_CALL(hipStreamSynchronize(streams[i])); - } - } - - static void Average(Dataset const& dataset, int8_t* resultI1) - { - uint8_t* resultU1 = (uint8_t *)resultI1; - int32_t* resultI4 = (int32_t *)resultI1; - uint32_t* resultU4 = (uint32_t *)resultI1; - int64_t* resultI8 = (int64_t *)resultI1; - uint64_t* resultU8 = (uint64_t *)resultI1; - float* resultF4 = (float *)resultI1; - double* resultF8 = (double *)resultI1; - rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1; - for (int j = 0; j < dataset.numElements; j++) - { - switch (dataset.dataType) - { - case ncclInt8: resultI1[j] = resultI1[j]/dataset.numDevices; break; - case ncclUint8: resultU1[j] = resultU1[j]/dataset.numDevices; break; - case ncclInt32: 
resultI4[j] = resultI4[j]/dataset.numDevices; break; - case ncclUint32: resultU4[j] = resultU4[j]/dataset.numDevices; break; - case ncclInt64: resultI8[j] = resultI8[j]/dataset.numDevices; break; - case ncclUint64: resultU8[j] = resultU8[j]/dataset.numDevices; break; - case ncclFloat32: resultF4[j] = resultF4[j]/dataset.numDevices; break; - case ncclFloat64: resultF8[j] = resultF8[j]/dataset.numDevices; break; - case ncclBfloat16: resultB2[j] = rccl_bfloat16((float)(resultB2[j])/dataset.numDevices); break; - default: - fprintf(stderr, "[ERROR] Unsupported datatype\n"); - exit(0); - } - } - } - - void ValidateResults(Dataset const& dataset, int root = 0) const - { - int8_t* outputI1 = (int8_t *)malloc(dataset.NumBytes(ncclOutputBuffer)); - uint8_t* outputU1 = (uint8_t *)outputI1; - int32_t* outputI4 = (int32_t *)outputI1; - uint32_t* outputU4 = (uint32_t *)outputI1; - int64_t* outputI8 = (int64_t *)outputI1; - uint64_t* outputU8 = (uint64_t *)outputI1; - float* outputF4 = (float *)outputI1; - double* outputF8 = (double *)outputI1; - rccl_bfloat16* outputB2 = (rccl_bfloat16 *)outputI1; - - bool isMatch = true; - - // Loop over each device's output and compare it to the expected output - // (Each collective operation computes its own expected results) - for (int i = 0; i < dataset.numDevices && isMatch; i++) - { - // only output on root rank is valid for gather collective - if (dataset.function == ncclCollGather && i != root) - continue; - HIP_CALL(hipMemcpy(outputI1, dataset.outputs[i], dataset.NumBytes(ncclOutputBuffer), hipMemcpyDeviceToHost)); - - int8_t* expectedI1 = (int8_t *)dataset.expected[i]; - uint8_t* expectedU1 = (uint8_t *)expectedI1; - int32_t* expectedI4 = (int32_t *)expectedI1; - uint32_t* expectedU4 = (uint32_t *)expectedI1; - int64_t* expectedI8 = (int64_t *)expectedI1; - uint64_t* expectedU8 = (uint64_t *)expectedI1; - float* expectedF4 = (float *)expectedI1; - double* expectedF8 = (double *)expectedI1; - rccl_bfloat16* expectedB2 = (rccl_bfloat16 
*)expectedI1; - - for (int j = 0; j < dataset.numElements && isMatch; j++) - { - switch (dataset.dataType) - { - case ncclInt8: isMatch &= (outputI1[j] == expectedI1[j]); break; - case ncclUint8: isMatch &= (outputU1[j] == expectedU1[j]); break; - case ncclInt32: isMatch &= (outputI4[j] == expectedI4[j]); break; - case ncclUint32: isMatch &= (outputU4[j] == expectedU4[j]); break; - case ncclInt64: isMatch &= (outputI8[j] == expectedI8[j]); break; - case ncclUint64: isMatch &= (outputU8[j] == expectedU8[j]); break; - case ncclFloat32: isMatch &= (fabs(outputF4[j] - expectedF4[j]) < 1e-5); break; - case ncclFloat64: isMatch &= (fabs(outputF8[j] - expectedF8[j]) < 1e-12); break; - case ncclBfloat16: isMatch &= (fabs((float)outputB2[j] - (float)expectedB2[j]) < 9e-2); break; - default: - fprintf(stderr, "[ERROR] Unsupported datatype\n"); - exit(0); - } - - if (!isMatch) - { - switch (dataset.dataType) - { - case ncclInt8: - printf("Expected %d. Output %d on device %d[%d]\n", expectedI1[j], outputI1[j], i, j); - break; - case ncclUint8: - printf("Expected %u. Output %u on device %d[%d]\n", expectedU1[j], outputU1[j], i, j); break; - case ncclInt32: - printf("Expected %d. Output %d on device %d[%d]\n", expectedI4[j], outputI4[j], i, j); break; - case ncclUint32: - printf("Expected %u. Output %u on device %d[%d]\n", expectedU4[j], outputU4[j], i, j); break; - case ncclInt64: - printf("Expected %ld. Output %ld on device %d[%d]\n", expectedI8[j], outputI8[j], i, j); break; - case ncclUint64: - printf("Expected %lu. Output %lu on device %d[%d]\n", expectedU8[j], outputU8[j], i, j); break; - case ncclFloat32: - printf("Expected %f. Output %f on device %d[%d]\n", expectedF4[j], outputF4[j], i, j); break; - case ncclFloat64: - printf("Expected %lf. Output %lf on device %d[%d]\n", expectedF8[j], outputF8[j], i, j); break; - case ncclBfloat16: - printf("Expected %f. 
Output %f on device %d[%d]\n", (float)expectedB2[j], (float)outputB2[j], i, j); break; - default: - fprintf(stderr, "[ERROR] Unsupported datatype\n"); - exit(0); - } - } - } - ASSERT_EQ(isMatch, true); - } - free(outputI1); - } - - // Passed in parameters from TestTuple - ncclRedOp_t op; - ncclDataType_t dataType; - size_t numElements; - int numDevices; - bool inPlace; - const char* envVals; - - int numDevicesAvailable; - std::vector comms; - std::vector streams; - - // internal only - char* envString; - char* tokens[MAX_ENV_TOKENS]; - int numTokens; - char* savedEnv[MAX_ENV_TOKENS/2]; - }; - - class MultiProcessCorrectnessTest : public CorrectnessTest - { - protected: - // IMPORTANT: We cannot have any HIP API calls in the parent process. - // Do any HIP setup in SetupPerProcess(). - void SetUp() override - { - // Check if NCCL_COMM_ID is already set; if not, set it now - if (!getenv("NCCL_COMM_ID")) - { - char hostname[HOST_NAME_MAX+1]; - gethostname(hostname, HOST_NAME_MAX+1); - std::string hostnameString(hostname); - hostnameString.append(":55513"); - setenv("NCCL_COMM_ID", hostnameString.c_str(), 0); - } - - // Make the test tuple parameters accessible - std::tie(op, dataType, numElements, numDevices, inPlace, envVals) = GetParam(); - - envString = 0; - numTokens = 0; - bool enableClique = false; - - setenv("RCCL_TEST_ENV_VARS", "ENABLE", 1); - if (strcmp(envVals, "")) { - // enable RCCL env vars testing - envString = strdup(envVals); - tokens[numTokens] = strtok(envString, "=, "); - numTokens++; - while (tokens[numTokens-1] != NULL && numTokens < MAX_ENV_TOKENS) - tokens[numTokens++] = strtok(NULL, "=, "); - for (int i = 0; i < numTokens/2; i++) { - char *val = getenv(tokens[i*2]); - if (val) - savedEnv[i] = strdup(val); - else - savedEnv[i] = 0; - setenv(tokens[i*2], tokens[i*2+1], 1); - fprintf(stdout, "[ ] setting environmental variable %s to %s\n", tokens[i*2], getenv(tokens[i*2])); - if (strcmp(tokens[i*2], "RCCL_ENABLE_CLIQUE") == 0) - { - if 
(strcmp(getenv(tokens[i*2]), "1") == 0) - { - enableClique = true; - } - } - } - } - - if (Dataset::UseHmm() && enableClique) - { - fprintf(stdout, "[ SKIPPED ] Clique mode and unified memory together not supported\n"); - GTEST_SKIP(); - } - - comms.resize(numDevices); - streams.resize(numDevices); - dataset = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0); - Barrier::ClearShmFiles(StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID")))); - } - - void TearDown() override - { - munmap(dataset, sizeof(Dataset)); - - // Restore env vars after tests - for (int i = 0; i < numTokens/2; i++) { - if (savedEnv[i]) { - setenv(tokens[i*2], savedEnv[i], 1); - fprintf(stdout, "[ ] restored environmental variable %s to %s\n", tokens[i*2], getenv(tokens[i*2])); - free(savedEnv[i]); - } - else { - unsetenv(tokens[i*2]); - fprintf(stdout, "[ ] removed environmental variable %s\n", tokens[i*2]); - } - } - // Cleanup - unsetenv("RCCL_TEST_ENV_VARS"); - free(envString); - } - - void SetUpPerProcessHelper(int rank, ncclComm_t& comm, hipStream_t& stream) - { - // Check for NCCL_COMM_ID env variable (otherwise will not init) - if (!getenv("NCCL_COMM_ID")) - { - printf("Must set NCCL_COMM_ID prior to execution\n"); - exit(0); - } - - // Collect the number of available GPUs - HIP_CALL(hipGetDeviceCount(&numDevicesAvailable)); - - // Only proceed with testing if there are enough GPUs - if (numDevices > numDevicesAvailable) - { - if (rank == 0) - { - fprintf(stdout, "[ SKIPPED ] Test requires %d devices (only %d available)\n", - numDevices, numDevicesAvailable); - } - GTEST_SKIP(); - } - - HIP_CALL(hipSetDevice(rank)); - HIP_CALL(hipStreamCreate(&stream)); - - ncclUniqueId id; - NCCL_CALL(ncclGetUniqueId(&id)); - - ncclResult_t res; - res = ncclCommInitRank(&comm, numDevices, id, rank); // change to local comm and stream per process - - if (res != ncclSuccess) - { - printf("Test failure:%s %d '%s' numRanks:%d\n", 
__FILE__,__LINE__,ncclGetErrorString(res), numDevices); - ASSERT_EQ(res, ncclSuccess); - } - } - - // To be called by each process individually - void SetUpPerProcess(int rank, ncclFunc_t const func, ncclComm_t& comm, hipStream_t& stream, Dataset& dataset) - { - SetUpPerProcessHelper(rank, comm, stream); - if (numDevices <= numDevicesAvailable) - { - dataset.Initialize(numDevices, numElements, dataType, inPlace, func, rank); - } - } - - // To be called by each process/rank individually (see GroupCallsMultiProcess) - void SetUpPerProcess(int rank, std::vector const& func, ncclComm_t& comm, hipStream_t& stream, std::vector& datasets) - { - SetUpPerProcessHelper(rank, comm, stream); - if (numDevices <= numDevicesAvailable) - { - for (int i = 0; i < datasets.size(); i++) - { - datasets[i]->Initialize(numDevices, numElements, dataType, inPlace, func[i], rank); - } - } - } - - // Clean up per process - void TearDownPerProcess(ncclComm_t& comm, hipStream_t& stream) - { - NCCL_CALL(ncclCommDestroy(comm)); - HIP_CALL(hipStreamDestroy(stream)); - } - - void FillDatasetWithPattern(Dataset& dataset, int rank) - { - int8_t* arrayI1 = (int8_t *)malloc(dataset.NumBytes(ncclInputBuffer)); - uint8_t* arrayU1 = (uint8_t *)arrayI1; - int32_t* arrayI4 = (int32_t *)arrayI1; - uint32_t* arrayU4 = (uint32_t *)arrayI1; - int64_t* arrayI8 = (int64_t *)arrayI1; - uint64_t* arrayU8 = (uint64_t *)arrayI1; - float* arrayF4 = (float *)arrayI1; - double* arrayF8 = (double *)arrayI1; - rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1; - - // NOTE: Currently half-precision float tests are unsupported due to half being supported - // on GPU only and not host - - // Fills input data[i][j] with (i + j) % 6 - // - Keeping range small to reduce likelihood of overflow - // - Sticking with floating points values that are perfectly representable - - for (int j = 0; j < dataset.NumBytes(ncclInputBuffer)/DataTypeToBytes(dataset.dataType); j++) - { - int valueI = (rank + j) % 6; - float valueF = 
(float)valueI; - - switch (dataset.dataType) - { - case ncclInt8: arrayI1[j] = valueI; break; - case ncclUint8: arrayU1[j] = valueI; break; - case ncclInt32: arrayI4[j] = valueI; break; - case ncclUint32: arrayU4[j] = valueI; break; - case ncclInt64: arrayI8[j] = valueI; break; - case ncclUint64: arrayU8[j] = valueI; break; - case ncclFloat32: arrayF4[j] = valueF; break; - case ncclFloat64: arrayF8[j] = valueF; break; - case ncclBfloat16: arrayB2[j] = rccl_bfloat16(valueF); break; - default: - fprintf(stderr, "[ERROR] Unsupported datatype\n"); - exit(0); - } - } - - HIP_CALL(hipSetDevice(rank)); - HIP_CALL(hipMemcpy(dataset.inputs[rank], arrayI1, dataset.NumBytes(ncclInputBuffer), hipMemcpyHostToDevice)); - - // Fills output data[i][j] with 0 (if not inplace) - if (!dataset.inPlace) - HIP_CALL(hipMemset(dataset.outputs[rank], 0, dataset.NumBytes(ncclOutputBuffer))); - - free(arrayI1); - } - - bool ValidateResults(Dataset const& dataset, int rank, int root = 0) const - { - int8_t* outputI1 = (int8_t *)malloc(dataset.NumBytes(ncclOutputBuffer)); - uint8_t* outputU1 = (uint8_t *)outputI1; - int32_t* outputI4 = (int32_t *)outputI1; - uint32_t* outputU4 = (uint32_t *)outputI1; - int64_t* outputI8 = (int64_t *)outputI1; - uint64_t* outputU8 = (uint64_t *)outputI1; - float* outputF4 = (float *)outputI1; - double* outputF8 = (double *)outputI1; - rccl_bfloat16* outputB2 = (rccl_bfloat16 *)outputI1; - - bool isMatch = true; - - // Loop over each device's output and compare it to the expected output - // (Each collective operation computes its own expected results) - - // only output on root rank is valid for gather collective - if (dataset.function == ncclCollGather && rank != root) - return true; - - hipError_t err = hipMemcpy(outputI1, dataset.outputs[rank], dataset.NumBytes(ncclOutputBuffer), hipMemcpyDeviceToHost); - if (err != hipSuccess) - return false; - - int8_t* expectedI1 = (int8_t *)dataset.expected[rank]; - uint8_t* expectedU1 = (uint8_t *)expectedI1; - int32_t* 
expectedI4 = (int32_t *)expectedI1; - uint32_t* expectedU4 = (uint32_t *)expectedI1; - int64_t* expectedI8 = (int64_t *)expectedI1; - uint64_t* expectedU8 = (uint64_t *)expectedI1; - float* expectedF4 = (float *)expectedI1; - double* expectedF8 = (double *)expectedI1; - rccl_bfloat16* expectedB2 = (rccl_bfloat16 *)expectedI1; - - for (int j = 0; j < dataset.numElements && isMatch; j++) - { - switch (dataset.dataType) - { - case ncclInt8: isMatch &= (outputI1[j] == expectedI1[j]); break; - case ncclUint8: isMatch &= (outputU1[j] == expectedU1[j]); break; - case ncclInt32: isMatch &= (outputI4[j] == expectedI4[j]); break; - case ncclUint32: isMatch &= (outputU4[j] == expectedU4[j]); break; - case ncclInt64: isMatch &= (outputI8[j] == expectedI8[j]); break; - case ncclUint64: isMatch &= (outputU8[j] == expectedU8[j]); break; - case ncclFloat32: isMatch &= (outputF4[j] == expectedF4[j]); break; - case ncclFloat64: isMatch &= (outputF8[j] == expectedF8[j]); break; - case ncclBfloat16: isMatch &= (outputB2[j] == expectedB2[j]); break; - default: - fprintf(stderr, "[ERROR] Unsupported datatype\n"); - exit(0); - } - - if (!isMatch) - { - switch (dataset.dataType) - { - case ncclInt8: - printf("Output %d. Expected %d on device %d[%d]\n", outputI1[j], expectedI1[j], rank, j); break; - case ncclUint8: - printf("Output %u. Expected %u on device %d[%d]\n", outputU1[j], expectedU1[j], rank, j); break; - case ncclInt32: - printf("Output %d. Expected %d on device %d[%d]\n", outputI4[j], expectedI4[j], rank, j); break; - case ncclUint32: - printf("Output %u. Expected %u on device %d[%d]\n", outputU4[j], expectedU4[j], rank, j); break; - case ncclInt64: - printf("Output %ld. Expected %ld on device %d[%d]\n", outputI8[j], expectedI8[j], rank, j); break; - case ncclUint64: - printf("Output %lu. Expected %lu on device %d[%d]\n", outputU8[j], expectedU8[j], rank, j); break; - case ncclFloat32: - printf("Output %f. 
Expected %f on device %d[%d]\n", outputF4[j], expectedF4[j], rank, j); break; - case ncclFloat64: - printf("Output %lf. Expected %lf on device %d[%d]\n", outputF8[j], expectedF8[j], rank, j); break; - case ncclBfloat16: - printf("Output %f. Expected %f on device %d[%d]\n", (float)outputB2[j], (float)expectedB2[j], rank, j); break; - default: - fprintf(stderr, "[ERROR] Unsupported datatype\n"); - exit(0); - } - } - } - return isMatch; - } - - void ValidateProcesses(std::vector const& pids) - { - int numProcesses = pids.size(); - int status[numProcesses]; - - for (int i = 0; i < numProcesses; i++) - { - waitpid(pids[i], &status[i], 0); - - EXPECT_NE(WIFEXITED(status[i]), 0) << "[ERROR] Child process " << i << " did not exit cleanly."; - EXPECT_EQ(WEXITSTATUS(status[i]), EXIT_SUCCESS) << "[ERROR] Child process " << i << " had a test failure."; - } - } - - void TerminateChildProcess(bool const pass) - { - if (pass) - { - exit(EXIT_SUCCESS); - } - else - { - exit(EXIT_FAILURE); - } - } - - int StripPortNumberFromCommId(std::string commId) - { - size_t pos = commId.find(":"); - std::string portNumString = commId.substr(pos + 1); - return std::atoi(portNumString.c_str()); - } - - Dataset* dataset; - }; - - std::string GenerateTestNameString(testing::TestParamInfo& info); -} - -#endif diff --git a/test/Gather_InPlace.cpp b/test/Gather_InPlace.cpp new file mode 100644 index 0000000000..d9dec85ce1 --- /dev/null +++ b/test/Gather_InPlace.cpp @@ -0,0 +1,26 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ +#include "TestBed.hpp" + +namespace RcclUnitTesting +{ + TEST(Gather, InPlace) + { + TestBed testBed; + + // Configuration + std::vector const funcTypes = {ncclCollGather}; + std::vector const dataTypes = {ncclInt8, ncclInt32, ncclInt64}; + std::vector const redOps = {ncclSum}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {true}; + std::vector const managedMemList = {false}; + + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.Finalize(); + } +} diff --git a/test/Gather_ManagedMem.cpp b/test/Gather_ManagedMem.cpp new file mode 100644 index 0000000000..efb5134107 --- /dev/null +++ b/test/Gather_ManagedMem.cpp @@ -0,0 +1,26 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ +#include "TestBed.hpp" + +namespace RcclUnitTesting +{ + TEST(Gather, ManagedMem) + { + TestBed testBed; + + // Configuration + std::vector const funcTypes = {ncclCollGather}; + std::vector const dataTypes = {ncclUint8, ncclUint32, ncclUint64}; + std::vector const redOps = {ncclSum}; + std::vector const roots = {1}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {false}; + std::vector const managedMemList = {true}; + + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.Finalize(); + } +} diff --git a/test/Gather_OutOfPlace.cpp b/test/Gather_OutOfPlace.cpp new file mode 100644 index 0000000000..49a21d4b81 --- /dev/null +++ b/test/Gather_OutOfPlace.cpp @@ -0,0 +1,26 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ +#include "TestBed.hpp" + +namespace RcclUnitTesting +{ + TEST(Gather, OutOfPlace) + { + TestBed testBed; + + // Configuration + std::vector const funcTypes = {ncclCollGather}; + std::vector const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16}; + std::vector const redOps = {ncclSum}; + std::vector const roots = {1}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {false}; + std::vector const managedMemList = {false}; + + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.Finalize(); + } +} diff --git a/test/ReduceScatter_InPlace.cpp b/test/ReduceScatter_InPlace.cpp new file mode 100644 index 0000000000..96cc283f8a --- /dev/null +++ b/test/ReduceScatter_InPlace.cpp @@ -0,0 +1,26 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ +#include "TestBed.hpp" + +namespace RcclUnitTesting +{ + TEST(ReduceScatter, InPlace) + { + TestBed testBed; + + // Configuration + std::vector const funcTypes = {ncclCollReduceScatter}; + std::vector const dataTypes = {ncclInt8, ncclInt32, ncclInt64}; + std::vector const redOps = {ncclSum, ncclProd}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {true}; + std::vector const managedMemList = {false}; + + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.Finalize(); + } +} diff --git a/test/ReduceScatter_ManagedMem.cpp b/test/ReduceScatter_ManagedMem.cpp new file mode 100644 index 0000000000..1fc94d8b96 --- /dev/null +++ b/test/ReduceScatter_ManagedMem.cpp @@ -0,0 +1,26 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ +#include "TestBed.hpp" + +namespace RcclUnitTesting +{ + TEST(ReduceScatter, ManagedMem) + { + TestBed testBed; + + // Configuration + std::vector const funcTypes = {ncclCollReduceScatter}; + std::vector const dataTypes = {ncclUint8, ncclUint32, ncclUint64}; + std::vector const redOps = {ncclSum}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {false}; + std::vector const managedMemList = {true}; + + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.Finalize(); + } +} diff --git a/test/ReduceScatter_OutOfPlace.cpp b/test/ReduceScatter_OutOfPlace.cpp new file mode 100644 index 0000000000..09d5868a49 --- /dev/null +++ b/test/ReduceScatter_OutOfPlace.cpp @@ -0,0 +1,26 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ +#include "TestBed.hpp" + +namespace RcclUnitTesting +{ + TEST(ReduceScatter, OutOfPlace) + { + TestBed testBed; + + // Configuration + std::vector const funcTypes = {ncclCollReduceScatter}; + std::vector const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16}; + std::vector const redOps = {ncclMin, ncclMax, ncclAvg}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {false}; + std::vector const managedMemList = {false}; + + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.Finalize(); + } +} diff --git a/test/Reduce_InPlace.cpp b/test/Reduce_InPlace.cpp new file mode 100644 index 0000000000..8d26f43874 --- /dev/null +++ b/test/Reduce_InPlace.cpp @@ -0,0 +1,26 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ +#include "TestBed.hpp" + +namespace RcclUnitTesting +{ + TEST(Reduce, InPlace) + { + TestBed testBed; + + // Configuration + std::vector const funcTypes = {ncclCollReduce}; + std::vector const dataTypes = {ncclInt8, ncclInt32, ncclInt64}; + std::vector const redOps = {ncclSum, ncclProd}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {true}; + std::vector const managedMemList = {false}; + + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.Finalize(); + } +} diff --git a/test/Reduce_ManagedMem.cpp b/test/Reduce_ManagedMem.cpp new file mode 100644 index 0000000000..95b3547b47 --- /dev/null +++ b/test/Reduce_ManagedMem.cpp @@ -0,0 +1,26 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ +#include "TestBed.hpp" + +namespace RcclUnitTesting +{ + TEST(Reduce, ManagedMem) + { + TestBed testBed; + + // Configuration + std::vector const funcTypes = {ncclCollReduce}; + std::vector const dataTypes = {ncclUint8, ncclUint32, ncclUint64}; + std::vector const redOps = {ncclSum}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {false}; + std::vector const managedMemList = {true}; + + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.Finalize(); + } +} diff --git a/test/Reduce_OutOfPlace.cpp b/test/Reduce_OutOfPlace.cpp new file mode 100644 index 0000000000..3c8eb474ea --- /dev/null +++ b/test/Reduce_OutOfPlace.cpp @@ -0,0 +1,26 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ +#include "TestBed.hpp" + +namespace RcclUnitTesting +{ + TEST(Reduce, OutOfPlace) + { + TestBed testBed; + + // Configuration + std::vector const funcTypes = {ncclCollReduce}; + std::vector const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16}; + std::vector const redOps = {ncclMin, ncclMax, ncclAvg}; + std::vector const roots = {1}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {false}; + std::vector const managedMemList = {false}; + + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.Finalize(); + } +} diff --git a/test/Scatter_InPlace.cpp b/test/Scatter_InPlace.cpp new file mode 100644 index 0000000000..99f3f983a2 --- /dev/null +++ b/test/Scatter_InPlace.cpp @@ -0,0 +1,26 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ +#include "TestBed.hpp" + +namespace RcclUnitTesting +{ + TEST(Scatter, InPlace) + { + TestBed testBed; + + // Configuration + std::vector const funcTypes = {ncclCollScatter}; + std::vector const dataTypes = {ncclInt8, ncclInt32, ncclInt64}; + std::vector const redOps = {ncclSum}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {true}; + std::vector const managedMemList = {false}; + + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.Finalize(); + } +} diff --git a/test/Scatter_ManagedMem.cpp b/test/Scatter_ManagedMem.cpp new file mode 100644 index 0000000000..4a959fc308 --- /dev/null +++ b/test/Scatter_ManagedMem.cpp @@ -0,0 +1,26 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ +#include "TestBed.hpp" + +namespace RcclUnitTesting +{ + TEST(Scatter, ManagedMem) + { + TestBed testBed; + + // Configuration + std::vector const funcTypes = {ncclCollScatter}; + std::vector const dataTypes = {ncclUint8, ncclUint32, ncclUint64}; + std::vector const redOps = {ncclSum}; + std::vector const roots = {0}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {false}; + std::vector const managedMemList = {true}; + + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.Finalize(); + } +} diff --git a/test/Scatter_OutOfPlace.cpp b/test/Scatter_OutOfPlace.cpp new file mode 100644 index 0000000000..02bb878930 --- /dev/null +++ b/test/Scatter_OutOfPlace.cpp @@ -0,0 +1,26 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ +#include "TestBed.hpp" + +namespace RcclUnitTesting +{ + TEST(Scatter, OutOfPlace) + { + TestBed testBed; + + // Configuration + std::vector const funcTypes = {ncclCollScatter}; + std::vector const dataTypes = {ncclFloat32, ncclFloat64, ncclBfloat16}; + std::vector const redOps = {ncclSum}; + std::vector const roots = {1}; + std::vector const numElements = {1048576, 53327, 1024}; + std::vector const inPlaceList = {false}; + std::vector const managedMemList = {false}; + + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList); + testBed.Finalize(); + } +} diff --git a/test/TestChecks.hpp b/test/TestChecks.hpp deleted file mode 100644 index c2e3331d02..0000000000 --- a/test/TestChecks.hpp +++ /dev/null @@ -1,63 +0,0 @@ -#ifndef TESTCHECKS_HPP -#define TESTCHECKS_HPP - -#define HIP_CALL(x) ASSERT_EQ(x, hipSuccess) -#define NCCL_CALL(x) ASSERT_EQ(x, ncclSuccess) - -#define SYSCHECK_TEST(call, name) do { \ - int retval; \ - SYSCHECKVAL_TEST(call, name, retval); \ -} while (false) - -#define SYSCHECKVAL_TEST(call, name, retval) do { \ - SYSCHECKSYNC_TEST(call, name, retval); \ - if (retval == -1) { \ - printf("Call to %s failed : %s\n", name, strerror(errno)); \ - fflush(stdout); \ - return ncclSystemError; \ - } \ -} while (false) - -#define SYSCHECK_GOTO_TEST(call, name, label) do { \ - int retval; \ - SYSCHECKVAL_GOTO_TEST(call, name, retval, label); \ -} while (false) - -#define SYSCHECKVAL_GOTO_TEST(call, name, retval, label) do { \ - SYSCHECKSYNC_TEST(call, name, retval); \ - if (retval == -1) { \ - printf("Call to %s failed : %s\n", name, strerror(errno)); \ - fflush(stdout); \ - goto label; \ - } \ -} while (false) - -#define SYSCHECKSYNC_TEST(call, name, retval) do { \ - retval = call; \ - if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ - } else { \ - 
break; \
-  } \
-} while(true)
-
-#define NCCLCHECK_BARRIER_TEST(call, name, rank) do { \
-  ncclResult_t retval; \
-  retval = call; \
-  if (retval != ncclSuccess) { \
-    printf("Rank %d call to %s failed : %s\n", rank, name, strerror(errno)); \
-    fflush(stdout); \
-    return; \
-  } \
-} while (false)
-
-#define NCCLCHECK_TEST(call, name) do { \
-  ncclResult_t retval; \
-  retval = call; \
-  if (retval != ncclSuccess) { \
-    printf("Call to %s failed : %s\n", name, strerror(errno)); \
-    fflush(stdout); \
-    return retval; \
-  } \
-} while (false)
-
-#endif
diff --git a/test/common/CollectiveArgs.cpp b/test/common/CollectiveArgs.cpp
new file mode 100644
index 0000000000..8c908fef44
--- /dev/null
+++ b/test/common/CollectiveArgs.cpp
@@ -0,0 +1,282 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "CollectiveArgs.hpp"
+#include "gtest/gtest.h"
+
+namespace RcclUnitTesting
+{
+  ErrCode CollectiveArgs::SetArgs(int const globalRank,
+                                  int const totalRanks,
+                                  int const deviceId,
+                                  ncclFunc_t const funcType,
+                                  ncclDataType_t const dataType,
+                                  ncclRedOp_t const redOp,
+                                  int const root,
+                                  size_t const numInputElements,
+                                  size_t const numOutputElements,
+                                  ScalarTransport const scalarTransport,
+                                  int const scalarMode)
+  {
+    // Free scalar based on previous scalarMode
+    if (scalarMode != -1) // NOTE(review): this tests the NEW mode (the parameter), not this->scalarMode - confirm intent
+    {
+      if (this->localScalar.ptr != nullptr)
+      {
+        if (this->scalarMode == ncclScalarDevice) this->localScalar.FreeGpuMem(); // device scalar, allocated below via AllocateGpuMem
+        if (this->scalarMode == ncclScalarHostImmediate) CHECK_HIP(hipHostFree(this->localScalar.ptr)); // host scalar, allocated below via hipHostMalloc
+      }
+    }
+
+    this->globalRank = globalRank;
+    this->totalRanks = totalRanks;
+    this->deviceId = deviceId;
+    this->funcType = funcType;
+    this->dataType = dataType;
+    this->redOp = redOp;
+    this->root = root;
+    this->numInputElements = numInputElements;
+    this->numOutputElements = numOutputElements;
+    this->scalarTransport = scalarTransport;
+    this->scalarMode = scalarMode;
+
+    if (scalarMode != -1) // -1 means no scalar is used (the default argument)
+    {
+      size_t const numBytes = DataTypeToBytes(dataType); // scalarTransport holds one scalar of this size per rank
+      if (scalarMode == ncclScalarDevice)
+      {
+        CHECK_CALL(this->localScalar.AllocateGpuMem(numBytes));
+        CHECK_HIP(hipMemcpy(this->localScalar.ptr, scalarTransport.ptr + (globalRank * numBytes),
+                            numBytes, hipMemcpyHostToDevice));
+      }
+      else if (scalarMode == ncclScalarHostImmediate)
+      {
+        CHECK_HIP(hipHostMalloc(&this->localScalar.ptr, numBytes, 0));
+        memcpy(this->localScalar.ptr, scalarTransport.ptr + (globalRank * numBytes), numBytes);
+      }
+    }
+    return TEST_SUCCESS;
+  }
+
+  ErrCode CollectiveArgs::AllocateMem(bool const inPlace,
+                                      bool const useManagedMem)
+  {
+    this->numInputBytesAllocated = this->numInputElements * DataTypeToBytes(this->dataType);
+    this->numOutputBytesAllocated = this->numOutputElements * DataTypeToBytes(this->dataType);
+    this->numInputElementsAllocated = this->numInputElements;
+    this->numOutputElementsAllocated = this->numOutputElements;
+    this->inPlace = inPlace;
+    this->useManagedMem = useManagedMem;
+
+    if (hipSetDevice(this->deviceId) != hipSuccess)
+    {
+      ERROR("Unable to call hipSetDevice to set to GPU %d\n", this->deviceId);
+      return TEST_FAIL;
+    }
+
+    if (inPlace)
+    {
+      if (this->funcType == ncclCollScatter)
+      {
+        CHECK_CALL(this->inputGpu.AllocateGpuMem(this->numInputBytesAllocated, useManagedMem));
+        this->outputGpu.Attach(this->inputGpu.U1 + (this->globalRank * this->numOutputBytesAllocated)); // output = this rank's slice of the input buffer
+      }
+      else if (this->funcType == ncclCollGather)
+      {
+        CHECK_CALL(this->outputGpu.AllocateGpuMem(this->numOutputBytesAllocated, useManagedMem));
+        this->inputGpu.Attach(this->outputGpu.U1 + (this->globalRank * this->numInputBytesAllocated)); // input = this rank's slice of the output buffer
+      }
+      else
+      {
+        size_t const numBytes = std::max(this->numInputBytesAllocated, this->numOutputBytesAllocated); // one buffer large enough for either side
+        CHECK_CALL(this->inputGpu.AllocateGpuMem(numBytes, useManagedMem));
+        this->outputGpu.Attach(this->inputGpu.ptr);
+      }
+      CHECK_CALL(this->expected.AllocateCpuMem(this->numOutputBytesAllocated));
+    }
+    else
+    {
+      CHECK_CALL(this->inputGpu.AllocateGpuMem(this->numInputBytesAllocated, useManagedMem));
+      CHECK_CALL(this->outputGpu.AllocateGpuMem(this->numOutputBytesAllocated, useManagedMem));
+      CHECK_CALL(this->expected.AllocateCpuMem(this->numOutputBytesAllocated));
+    }
+    CHECK_CALL(this->outputCpu.AllocateCpuMem(this->numOutputBytesAllocated)); // host staging buffer that ValidateResults copies the output into
+    return TEST_SUCCESS;
+  }
+
+  ErrCode CollectiveArgs::PrepareData(CollFuncPtr const prepareDataFunc)
+  {
+    CollFuncPtr prepFunc = (prepareDataFunc == nullptr ? DefaultPrepareDataFunc : prepareDataFunc); // nullptr selects the default
+    return prepFunc(*this);
+  }
+
+  ErrCode CollectiveArgs::ValidateResults()
+  {
+    // Ignore non-root outputs for collectives with a root
+    if (CollectiveArgs::UsesRoot(this->funcType) && this->root != this->globalRank) return TEST_SUCCESS; // NOTE(review): UsesRoot() also covers Broadcast/Scatter, so their non-root outputs are never compared - confirm intended
+
+    size_t const numOutputBytes = (this->numOutputElements * DataTypeToBytes(this->dataType));
+
+    CHECK_HIP(hipMemcpy(this->outputCpu.ptr, this->outputGpu.ptr, numOutputBytes, hipMemcpyDeviceToHost));
+
+    bool isMatch = true;
+    CHECK_CALL(this->outputCpu.IsEqual(this->dataType,
+                                       this->numOutputElements,
+                                       this->expected,
+                                       true,
+                                       isMatch));
+    if (!isMatch) ERROR("Mismatch for %s\n", this->GetDescription().c_str());
+    return isMatch ?
TEST_SUCCESS : TEST_FAIL;
+  }
+
+  ErrCode CollectiveArgs::DeallocateMem()
+  {
+    // If in-place, either only inputGpu or outputGpu was allocated
+    if (this->inPlace)
+    {
+      if (this->funcType == ncclCollGather)
+        this->outputGpu.FreeGpuMem();
+      else
+        this->inputGpu.FreeGpuMem();
+    }
+    else
+    {
+      this->inputGpu.FreeGpuMem();
+      this->outputGpu.FreeGpuMem();
+    }
+
+    this->outputCpu.FreeCpuMem();
+    this->expected.FreeCpuMem();
+
+    if (this->localScalar.ptr != nullptr)
+    {
+      if (this->scalarMode == ncclScalarDevice) this->localScalar.FreeGpuMem(); // same constants SetArgs() allocates with
+      if (this->scalarMode == ncclScalarHostImmediate) CHECK_HIP(hipHostFree(this->localScalar.ptr));
+    }
+    return TEST_SUCCESS;
+  }
+
+  std::string CollectiveArgs::GetDescription() const
+  {
+    std::stringstream ss;
+
+    ss << "(Rank " << this->globalRank << ") ";
+    switch (this->funcType)
+    {
+    case ncclCollBroadcast:     ss << "ncclBroadcast"; break;
+    case ncclCollReduce:        ss << "ncclReduce"; break;
+    case ncclCollAllGather:     ss << "ncclAllGather"; break;
+    case ncclCollReduceScatter: ss << "ncclReduceScatter"; break;
+    case ncclCollAllReduce:     ss << "ncclAllReduce"; break;
+    case ncclCollGather:        ss << "ncclGather"; break;
+    case ncclCollScatter:       ss << "ncclScatter"; break;
+    case ncclCollAllToAll:      ss << "ncclAllToAll"; break;
+    case ncclCollSend:          ss << "ncclSend"; break;
+    case ncclCollRecv:          ss << "ncclRecv"; break;
+    default:                    ss << "[Unknown]"; break;
+    }
+
+    ss << " " << ncclDataTypeNames[this->dataType] << " ";
+    if (this->funcType == ncclCollReduce ||
+        this->funcType == ncclCollReduceScatter ||
+        this->funcType == ncclCollAllReduce)
+    {
+      if (this->redOp < ncclNumOps)
+      {
+        ss << ncclRedOpNames[this->redOp] << " ";
+      }
+      else
+      {
+        ss << "CustomScalar "; // redOp values >= ncclNumOps are reported with this rank's scalar
+        PtrUnion scalarsPerRank; // typed view used to index the per-rank scalars below
+        scalarsPerRank.Attach(const_cast<char*>(this->scalarTransport.ptr)); // const_cast: this method is const; assumes Attach() takes a non-const pointer - TODO confirm
+        switch (this->dataType)
+        {
+        case ncclInt8:     ss << static_cast<int>(scalarsPerRank.I1[this->globalRank]); break; // 8-bit values would otherwise stream as characters
+        case ncclUint8:    ss << static_cast<unsigned>(scalarsPerRank.U1[this->globalRank]); break;
+        case ncclInt32:    ss << scalarsPerRank.I4[this->globalRank]; break;
+        case ncclUint32:   ss << scalarsPerRank.U4[this->globalRank]; break;
+        case ncclInt64:    ss << scalarsPerRank.I8[this->globalRank]; break;
+        case ncclUint64:   ss << scalarsPerRank.U8[this->globalRank]; break;
+        case ncclFloat32:  ss << scalarsPerRank.F4[this->globalRank]; break;
+        case ncclFloat64:  ss << scalarsPerRank.F8[this->globalRank]; break;
+        case ncclBfloat16: ss << scalarsPerRank.B2[this->globalRank]; break;
+        default:           ss << "(UNKNOWN)";
+        }
+        ss << " ";
+      }
+    }
+
+    if (this->funcType == ncclCollBroadcast ||
+        this->funcType == ncclCollReduce ||
+        this->funcType == ncclCollGather ||
+        this->funcType == ncclCollScatter)
+    {
+      ss << "Root " << this->root << " ";
+    }
+
+    if (this->funcType == ncclCollSend ||
+        this->funcType == ncclCollRecv)
+    {
+      ss << "Peer " << this->root << " "; // for Send/Recv the root field carries the peer rank
+    }
+
+    ss << "#In: " << this->numInputElements;
+    ss << " #Out: " << this->numOutputElements;
+
+    return ss.str();
+  }
+
+  void CollectiveArgs::GetNumElementsForFuncType(ncclFunc_t const funcType,
+                                                 int const N,
+                                                 int const totalRanks,
+                                                 int* numInputElements,
+                                                 int* numOutputElements)
+  {
+    switch (funcType)
+    {
+    case ncclCollBroadcast:
+    case ncclCollReduce:
+    case ncclCollAllReduce:
+      *numInputElements = N;
+      *numOutputElements = N;
+      break;
+    case ncclCollGather:
+    case ncclCollAllGather:
+      *numInputElements = N;
+      *numOutputElements = totalRanks * N; // output holds N elements from every rank
+      break;
+    case ncclCollScatter:
+    case ncclCollReduceScatter:
+      *numInputElements = totalRanks * N; // input holds N elements for every rank
+      *numOutputElements = N;
+      break;
+    case ncclCollAllToAll:
+      *numInputElements = totalRanks * N;
+      *numOutputElements = totalRanks * N;
+      break;
+    default:
+      *numInputElements = N;
+      *numOutputElements = N;
+      break;
+    }
+  }
+
+  bool CollectiveArgs::UsesReduce(ncclFunc_t const funcType)
+  {
+    return (funcType == ncclCollReduce ||
+            funcType == ncclCollAllReduce ||
+            funcType == ncclCollReduceScatter);
+  }
+
+  bool CollectiveArgs::UsesRoot(ncclFunc_t const funcType)
+  {
+    return (funcType == ncclCollBroadcast ||
+            funcType == ncclCollReduce
|| + funcType == ncclCollGather || + funcType == ncclCollScatter); + } +} diff --git a/test/common/CollectiveArgs.hpp b/test/common/CollectiveArgs.hpp new file mode 100644 index 0000000000..39fef1096c --- /dev/null +++ b/test/common/CollectiveArgs.hpp @@ -0,0 +1,151 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. + * +b + * See LICENSE.txt for license information + ************************************************************************/ +#pragma once +#include "PtrUnion.hpp" +#include "PrepDataFuncs.hpp" +#include "rccl.h" + +namespace RcclUnitTesting +{ + // Enumeration of all collective functions currently supported + typedef enum + { + ncclCollBroadcast = 0, + ncclCollReduce, + ncclCollAllGather, + ncclCollReduceScatter, + ncclCollAllReduce, + ncclCollGather, + ncclCollScatter, + ncclCollAllToAll, + ncclCollSend, + ncclCollRecv, + ncclNumFuncs + } ncclFunc_t; + + char const ncclFuncNames[ncclNumFuncs][32] = + { + "Broadcast", + "Reduce", + "AllGather", + "ReduceScatter", + "AllReduce", + "Gather", + "Scatter", + "AllToAll", + "Send", + "Recv" + }; + + char const ncclDataTypeNames[ncclNumTypes][32] = + { + "ncclInt8", + "ncclUint8", + "ncclInt32", + "ncclUint32", + "ncclInt64", + "ncclUint64", + "ncclFloat16", + "ncclFloat32", + "ncclFloat64", + "ncclBfloat16" + }; + + char const ncclRedOpNames[ncclNumOps][32] = + { + "sum", + "prod", + "max", + "min", + "avg" + }; + + class CollectiveArgs; + + #define MAX_RANKS 32 + struct ScalarTransport + { + char ptr[MAX_RANKS * sizeof(double)]; + }; + + // Function pointer for functions that operate on CollectiveArgs + // e.g. 
For filling input / computing expected results + typedef ErrCode (*CollFuncPtr)(CollectiveArgs &); + + class CollectiveArgs + { + public: + // Arguments to execute + int globalRank; + int totalRanks; + int deviceId; + ncclFunc_t funcType; + ncclDataType_t dataType; + ncclRedOp_t redOp; + int root; // Used as "peer" for Send/Recv + size_t numInputElements; + size_t numOutputElements; + ScalarTransport scalarTransport; // Used for custom reduction operators + PtrUnion localScalar; + int scalarMode; // -1 if scalar not used + + // Data + PtrUnion inputGpu; + PtrUnion outputGpu; + PtrUnion outputCpu; + PtrUnion expected; + bool inPlace; + bool useManagedMem; + size_t numInputBytesAllocated; + size_t numOutputBytesAllocated; + size_t numInputElementsAllocated; + size_t numOutputElementsAllocated; + + // Set collective arguments + ErrCode SetArgs(int const globalRank, + int const totalRanks, + int const deviceId, + ncclFunc_t const funcType, + ncclDataType_t const dataType, + ncclRedOp_t const redOp, + int const root, + size_t const numInputElements, + size_t const numOutputElements, + ScalarTransport const scalarsPerRank, + int const scalarMode = -1); + + // Allocates GPU memory for input/output and CPU memory for expected + // When inPlace is true, input and output share the same memory + ErrCode AllocateMem(bool const inPlace, + bool const useManagedMem); + + // Execute the provided data preparation function to fill input and compute expected results + ErrCode PrepareData(CollFuncPtr const prepareDataFunc); + + // Compare outputs to expected values + ErrCode ValidateResults(); + + // Deallocate memory + ErrCode DeallocateMem(); + + // Provide a description for the current collective arguments + std::string GetDescription() const; + + // Returns the number of inputs/outputs based on collective function type + static void GetNumElementsForFuncType(ncclFunc_t const funcType, + int const N, + int const totalRanks, + int* numInputElements, + int* numOutputElements); + + // 
Returns true if collective function performs reduction + static bool UsesReduce(ncclFunc_t const funcType); + + // Returns true if collective function utilizes a root rank + static bool UsesRoot(ncclFunc_t const funcType); + }; +} diff --git a/test/common/EnvVars.cpp b/test/common/EnvVars.cpp new file mode 100644 index 0000000000..5f8d615d49 --- /dev/null +++ b/test/common/EnvVars.cpp @@ -0,0 +1,161 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "EnvVars.hpp" +#include "CollectiveArgs.hpp" +#include + +namespace RcclUnitTesting +{ + int const UT_SINGLE_PROCESS = (1<<0); + int const UT_MULTI_PROCESS = (1<<1); + + hsa_status_t CountGpus(hsa_agent_t agent, void* data) + { + int* currCount = (int*)data; + hsa_device_type_t device; + hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device); + if (device == HSA_DEVICE_TYPE_GPU) + *currCount = *currCount + 1; + return HSA_STATUS_SUCCESS; + } + + EnvVars::EnvVars() + { + // Collect number of GPUs available + // NOTE: Cannot use HIP call prior to launching child processes via fork so use HSA + int numDevicesAvailable = 0; + hsa_init(); + hsa_iterate_agents(CountGpus, &numDevicesAvailable); + hsa_shut_down(); + + showNames = GetEnvVar("UT_SHOW_NAMES" , 1); + minGpus = GetEnvVar("UT_MIN_GPUS" , 2); + maxGpus = GetEnvVar("UT_MAX_GPUS" , numDevicesAvailable); + processMask = GetEnvVar("UT_PROCESS_MASK", UT_SINGLE_PROCESS | UT_MULTI_PROCESS); + verbose = GetEnvVar("UT_VERBOSE" , 0); + printValues = GetEnvVar("UT_PRINT_VALUES", 0); + + // Limit number of supported reduction operators to just ncclSum if only allReduce is built +#ifdef BUILD_ALLREDUCE_ONLY + int numOps = 1; +#else + int numOps = ncclNumOps; +#endif + std::vector redOpStrings = GetEnvVarsList("UT_REDOPS"); + for (auto s : 
redOpStrings) + { + for (int i = 0; i < numOps; ++i) + { + if (!strcmp(s.c_str(), ncclRedOpNames[i])) + { + redOps.push_back((ncclRedOp_t)i); + break; + } + } + } + // Default back to all ops if no strings are found + if (redOps.empty()) + { + for (int i = 0; i < numOps; i++) + redOps.push_back((ncclRedOp_t)i); + } + + // Limit number of supported datatypes if only allReduce is built + std::vector dtStrings = GetEnvVarsList("UT_DATATYPES"); + for (auto s : dtStrings) + { + for (int i = 0; i < ncclNumTypes; ++i) + { + if (!strcmp(s.c_str(), ncclDataTypeNames[i])) + { +#ifdef BUILD_ALLREDUCE_ONLY + if (i == ncclFloat32) +#endif + { + dataTypes.push_back((ncclDataType_t)i); + } + } + } + } + + // Default option if no valid datatypes are found in env var + if (dataTypes.empty()) + { + dataTypes.push_back(ncclFloat32); + // Skip all but 32-bit floats if only AllReduce is being built +#ifndef BUILD_ALLREDUCE_ONLY + dataTypes.push_back(ncclInt8); + dataTypes.push_back(ncclUint8); + dataTypes.push_back(ncclInt32); + dataTypes.push_back(ncclUint32); + dataTypes.push_back(ncclInt64); + dataTypes.push_back(ncclUint64); + // Half-precision floats disabled due to lack of host-side support + // dataTypes.push_back(ncclFloat16); + dataTypes.push_back(ncclFloat32); + dataTypes.push_back(ncclFloat64); + dataTypes.push_back(ncclBfloat16); +#endif + } + } + + std::vector const& EnvVars::GetAllSupportedRedOps() + { + return redOps; + } + + std::vector const& EnvVars::GetAllSupportedDataTypes() + { + return dataTypes; + } + + int EnvVars::GetEnvVar(std::string const varname, int defaultValue) + { + if (getenv(varname.c_str())) + return atoi(getenv(varname.c_str())); + return defaultValue; + }; + + std::vector EnvVars::GetEnvVarsList(std::string const varname) + { + std::vector result; + if (getenv(varname.c_str())) + { + char* token = strtok(getenv(varname.c_str()), ",;"); + while (token != NULL) + { + result.push_back(token); + token = strtok(NULL, ",;"); + } + } + return result; + } 
+ + void EnvVars::ShowConfig() + { + std::vector> supported = + { + std::make_pair("UT_SHOW_NAMES" , "Show test case names"), + std::make_pair("UT_MIN_GPUS" , "Minimum number of GPUs to use"), + std::make_pair("UT_MAX_GPUS" , "Maximum number of GPUs to use"), + std::make_pair("UT_PROCESS_MASK", "Whether to run single/multi process"), + std::make_pair("UT_VERBOSE" , "Show verbose unit test output"), + std::make_pair("UT_REDOPS" , "List of reduction ops to test"), + std::make_pair("UT_DATATYPES" , "List of datatypes to test"), + std::make_pair("UT_PRINT_VALUES", "Print array values (# of values to print, < 0 for all)") + }; + + printf("================================================================================\n"); + printf(" Environment variables:\n"); + for (auto p : supported) + { + printf(" - %-20s %-40s %s\n", p.first.c_str(), p.second.c_str(), + getenv(p.first.c_str()) ? getenv(p.first.c_str()) : ""); + } + printf("================================================================================\n"); + } +} diff --git a/test/common/EnvVars.hpp b/test/common/EnvVars.hpp new file mode 100644 index 0000000000..87b9f07f07 --- /dev/null +++ b/test/common/EnvVars.hpp @@ -0,0 +1,44 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#pragma once +#include +#include +#include "rccl.h" + +namespace RcclUnitTesting +{ + // Helper function to count the number of GPUs on system + static hsa_status_t CountGpus(hsa_agent_t agent, void* data); + + // Helper class to track environment variables that affect the unit tests + class EnvVars + { + public: + bool showNames; // List test case names during run [UT_SHOW_NAMES] + int minGpus; // Set the minimum number of GPUs to use [UT_MIN_GPUS] + int maxGpus; // Set the maximum number of GPUs to use [UT_MAX_GPUS] + int processMask; // Filter single/multi process [UT_PROCESS_MASK] + bool verbose; // Show verbose TestBed output for debug [UT_VERBOSE] + int printValues; // Print out input/output/expected arrays [UT_PRINT_VALUES] + + // Constructor that parses and collects environment variables + EnvVars(); + + std::vector const& GetAllSupportedRedOps(); + std::vector const& GetAllSupportedDataTypes(); + + static void ShowConfig(); + + protected: + std::vector redOps; // Supported reduction ops [UT_REDOPS] + std::vector dataTypes; // Support datatypes [UT_DATATYPES] + + // Helper functions to parse environment variables + int GetEnvVar(std::string const varname, int defaultValue); + std::vector GetEnvVarsList(std::string const varname); + }; +} diff --git a/test/common/ErrCode.hpp b/test/common/ErrCode.hpp new file mode 100644 index 0000000000..426b2e976a --- /dev/null +++ b/test/common/ErrCode.hpp @@ -0,0 +1,38 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ +#pragma once + +namespace RcclUnitTesting +{ + typedef enum + { + TEST_SUCCESS = 0, + TEST_FAIL = 1 + } ErrCode; + +#define ERROR(...) 
printf("\033[0;31m" "[ ERROR ] " "\033[0m" __VA_ARGS__) +#define INFO(...) printf("[ INFO ] " __VA_ARGS__) + +#define CHECK_CALL(func) \ + { \ + ErrCode status = func; \ + if (status != TEST_SUCCESS) \ + { \ + ERROR("Error in call %s\n", #func); \ + return status; \ + } \ + } + +#define CHECK_HIP(func) \ + { \ + hipError_t error = (func); \ + if (error != hipSuccess) \ + { \ + fprintf(stderr, "\033[0;33" "[ ERROR ] HIP error: %s\n" "\033[m", hipGetErrorString(error)); \ + return TEST_FAIL; \ + } \ + } +} diff --git a/test/common/PrepDataFuncs.cpp b/test/common/PrepDataFuncs.cpp new file mode 100644 index 0000000000..966a640cd3 --- /dev/null +++ b/test/common/PrepDataFuncs.cpp @@ -0,0 +1,342 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "CollectiveArgs.hpp" +#include "PrepDataFuncs.hpp" +#include +#include + +namespace RcclUnitTesting +{ + ErrCode DefaultPrepareDataFunc(CollectiveArgs &collArgs) + { + switch (collArgs.funcType) + { + case ncclCollBroadcast: return DefaultPrepData_Broadcast(collArgs); + case ncclCollReduce: return DefaultPrepData_Reduce(collArgs, false); + case ncclCollAllGather: return DefaultPrepData_Gather(collArgs, true); + case ncclCollReduceScatter: return DefaultPrepData_ReduceScatter(collArgs); + case ncclCollAllReduce: return DefaultPrepData_Reduce(collArgs, true); + case ncclCollGather: return DefaultPrepData_Gather(collArgs, false); + case ncclCollScatter: return DefaultPrepData_Scatter(collArgs); + case ncclCollAllToAll: return DefaultPrepData_AllToAll(collArgs); + //case ncclCollSendRecv: return DefaultPrepData_SendRecv(collArgs); + default: + ERROR("Unknown func type %d\n", collArgs.funcType); + return TEST_FAIL; + } + } + + ErrCode CheckAllocation(CollectiveArgs const& collArgs) + { + if 
(collArgs.numInputElements > collArgs.numInputElementsAllocated) + { + ERROR("Number of input elements (%lu) exceeds the number of allocated input elements (%lu)\n", + collArgs.numInputElements, collArgs.numInputElementsAllocated); + return TEST_FAIL; + } + + if (collArgs.numOutputElements > collArgs.numOutputElementsAllocated) + { + ERROR("Number of output elements (%lu) exceeds the number of allocated output elements (%lu)\n", + collArgs.numOutputElements, collArgs.numOutputElementsAllocated); + return TEST_FAIL; + } + return TEST_SUCCESS; + } + + ErrCode DefaultPrepData_Broadcast(CollectiveArgs &collArgs) + { + CHECK_CALL(CheckAllocation(collArgs)); + if (collArgs.numInputElements != collArgs.numOutputElements) + { + ERROR("Number of input elements must match number of output elements for Broadcast\n"); + return TEST_FAIL; + } + + size_t const numBytes = collArgs.numInputElements * DataTypeToBytes(collArgs.dataType); + + // Clear output for all ranks (done before filling input in case of in-place) + CHECK_CALL(collArgs.outputGpu.ClearGpuMem(numBytes)); + + // Only root needs input pattern + if (collArgs.globalRank == collArgs.root) + CHECK_CALL(collArgs.inputGpu.FillPattern(collArgs.dataType, + collArgs.numInputElements, + collArgs.root, true)); + + // Otherwise all other ranks expected output is the same as input of root + return collArgs.expected.FillPattern(collArgs.dataType, + collArgs.numInputElements, + collArgs.root, + false); + } + + ErrCode DefaultPrepData_Reduce(CollectiveArgs &collArgs, bool const isAllReduce) + { + CHECK_CALL(CheckAllocation(collArgs)); + if (collArgs.numInputElements != collArgs.numOutputElements) + { + ERROR("Number of input elements must match number of output elements for Reduce\n"); + return TEST_FAIL; + } + + size_t const numBytes = collArgs.numInputElements * DataTypeToBytes(collArgs.dataType); + + // Clear output for all ranks (done before filling input in case of in-place) + 
CHECK_CALL(collArgs.outputGpu.ClearGpuMem(numBytes)); + + // Clear expected buffer for holding reduction + PtrUnion result; + CHECK_CALL(result.Attach(collArgs.expected)); + CHECK_CALL(result.ClearCpuMem(numBytes)); + + // If average or custom reduction operator is used, perform a summation instead + ncclRedOp_t const tempOp = (collArgs.redOp >= ncclAvg ? ncclSum : collArgs.redOp); + + // Loop over each rank and generate their input into a temp buffer, then reduce + PtrUnion scalarsPerRank; + scalarsPerRank.Attach(collArgs.scalarTransport.ptr); + + PtrUnion tempInputCpu; + CHECK_CALL(tempInputCpu.Attach(collArgs.outputCpu)); + for (int rank = 0; rank < collArgs.totalRanks; ++rank) + { + // Generate temporary input for this rank + CHECK_CALL(tempInputCpu.FillPattern(collArgs.dataType, collArgs.numInputElements, rank, false)); + + // Copy the pre-scaled input into GPU memory for the correct rank + if (rank == collArgs.globalRank) + { + CHECK_HIP(hipMemcpy(collArgs.inputGpu.ptr, tempInputCpu.ptr, numBytes, hipMemcpyHostToDevice)); + } + + // Scale the temporary input by local scalar for this rank + // (Used by custom reduction ops) + if (collArgs.scalarMode >= 0) + { + CHECK_CALL(tempInputCpu.Scale(collArgs.dataType, collArgs.numInputElements, + scalarsPerRank, rank)); + } + + // Any rank that requires output reduces the scaled-inputs + if (isAllReduce || collArgs.root == collArgs.globalRank) + { + if (rank == 0) + { + memcpy(result.ptr, tempInputCpu.ptr, numBytes); + } + else + { + CHECK_CALL(result.Reduce(collArgs.dataType, collArgs.numInputElements, + tempInputCpu, tempOp)); + } + } + } + + // Perform averaging if necessary + if (collArgs.redOp == ncclAvg && (isAllReduce || collArgs.root == collArgs.globalRank)) + { + CHECK_CALL(result.DivideByInt(collArgs.dataType, collArgs.numInputElements, collArgs.totalRanks)); + } + return TEST_SUCCESS; + } + + ErrCode DefaultPrepData_Gather(CollectiveArgs &collArgs, bool const isAllGather) + { + 
CHECK_CALL(CheckAllocation(collArgs)); + if (collArgs.totalRanks * collArgs.numInputElements != collArgs.numOutputElements) + { + ERROR("# of output elements must be total ranks * # input elements for AllGather\n"); + return TEST_FAIL; + } + + // Clear output for all ranks (done before filling input in case of in-place) + size_t const numInputBytes = collArgs.numInputElements * DataTypeToBytes(collArgs.dataType); + size_t const numOutputBytes = collArgs.numOutputElements * DataTypeToBytes(collArgs.dataType); + CHECK_CALL(collArgs.inputGpu.ClearGpuMem(numInputBytes)); + CHECK_CALL(collArgs.outputGpu.ClearGpuMem(numOutputBytes)); + + PtrUnion result; + CHECK_CALL(result.Attach(collArgs.expected.ptr)); + CHECK_CALL(result.ClearCpuMem(numOutputBytes)); + + // Use outputCpu buffer to store temporary input + PtrUnion tempInputCpu; + CHECK_CALL(tempInputCpu.Attach(collArgs.outputCpu.ptr)); + + for (int rank = 0; rank < collArgs.totalRanks; ++rank) + { + CHECK_CALL(tempInputCpu.FillPattern(collArgs.dataType, collArgs.numInputElements, rank, false)); + if (rank == collArgs.globalRank) + { + CHECK_HIP(hipMemcpy(collArgs.inputGpu.ptr, tempInputCpu.ptr, numInputBytes, hipMemcpyHostToDevice)); + } + if (isAllGather || collArgs.root == collArgs.globalRank) + { + memcpy(result.I1 + (rank * numInputBytes), tempInputCpu.ptr, numInputBytes); + } + } + return TEST_SUCCESS; + } + + ErrCode DefaultPrepData_ReduceScatter(CollectiveArgs &collArgs) + { + CHECK_CALL(CheckAllocation(collArgs)); + if (collArgs.numInputElements != collArgs.numOutputElements * collArgs.totalRanks) + { + ERROR("# of input elements must be total ranks * # output elements for ReduceScatter\n"); + return TEST_FAIL; + } + + size_t const numInputBytes = collArgs.numInputElements * DataTypeToBytes(collArgs.dataType); + size_t const numOutputBytes = collArgs.numOutputElements * DataTypeToBytes(collArgs.dataType); + + // Clear output for all ranks (done before filling input in case of in-place) + 
CHECK_CALL(collArgs.outputGpu.ClearGpuMem(numOutputBytes)); + + PtrUnion tempInputCpu; + PtrUnion tempResultCpu; + + CHECK_CALL(tempInputCpu.AllocateCpuMem(numInputBytes)); + CHECK_CALL(tempResultCpu.AllocateCpuMem(numInputBytes)); + CHECK_CALL(tempResultCpu.ClearCpuMem(numInputBytes)); + + // If average or custom reduction operator is used, perform a summation instead + ncclRedOp_t const tempOp = (collArgs.redOp >= ncclAvg ? ncclSum : collArgs.redOp); + + // Loop over each rank and generate the input / scale / reduce + PtrUnion scalarsPerRank; + scalarsPerRank.Attach(collArgs.scalarTransport.ptr); + for (int rank = 0; rank < collArgs.totalRanks; ++rank) + { + CHECK_CALL(tempInputCpu.FillPattern(collArgs.dataType, collArgs.numInputElements, rank, false)); + + if (rank == collArgs.globalRank) + { + if (hipMemcpy(collArgs.inputGpu.ptr, tempInputCpu.ptr, numInputBytes, hipMemcpyHostToDevice) != hipSuccess) + { + ERROR("hipMemcpy to input failed\n"); + CHECK_CALL(tempInputCpu.FreeCpuMem()); + CHECK_CALL(tempResultCpu.FreeCpuMem()); + return TEST_FAIL; + } + } + + // Scale the temporary input by local scalar for this rank + // (Used by custom reduction ops) + if (collArgs.scalarMode >= 0) + { + CHECK_CALL(tempInputCpu.Scale(collArgs.dataType, collArgs.numInputElements, + scalarsPerRank, rank)); + } + + if (rank == 0) + { + memcpy(tempResultCpu.ptr, tempInputCpu.ptr, numInputBytes); + } + else + { + CHECK_CALL(tempResultCpu.Reduce(collArgs.dataType, collArgs.numInputElements, + tempInputCpu, tempOp)); + } + } + + // Perform averaging if necessary + if (collArgs.redOp == ncclAvg) + { + CHECK_CALL(tempResultCpu.DivideByInt(collArgs.dataType, collArgs.numInputElements, collArgs.totalRanks)); + } + + // Copy over portion of result + memcpy(collArgs.expected.I1, + tempResultCpu.I1 + collArgs.globalRank * numOutputBytes, + numOutputBytes); + CHECK_CALL(tempInputCpu.FreeCpuMem()); + CHECK_CALL(tempResultCpu.FreeCpuMem()); + return TEST_SUCCESS; + } + + ErrCode 
DefaultPrepData_Scatter(CollectiveArgs &collArgs) + { + CHECK_CALL(CheckAllocation(collArgs)); + if (collArgs.numInputElements != collArgs.numOutputElements * collArgs.totalRanks) + { + ERROR("# of input elements must be total ranks * # output elements for Scatter\n"); + return TEST_FAIL; + } + + size_t const numInputBytes = collArgs.numInputElements * DataTypeToBytes(collArgs.dataType); + size_t const numOutputBytes = collArgs.numOutputElements * DataTypeToBytes(collArgs.dataType); + + // Clear outputs on all ranks (prior to input in case of in-place) + collArgs.outputGpu.ClearGpuMem(numOutputBytes); + + // Generate input as if on root rank - each rank will receive a portion + PtrUnion tempInput; + tempInput.AllocateCpuMem(numInputBytes); + tempInput.FillPattern(collArgs.dataType, collArgs.numInputElements, collArgs.root, false); + + // Copy input to root rank + if (collArgs.globalRank == collArgs.root) + { + if (hipMemcpy(collArgs.inputGpu.ptr, tempInput.ptr, numInputBytes, hipMemcpyHostToDevice) != hipSuccess) + { + ERROR("hipMemcpy to input failed\n"); + tempInput.FreeCpuMem(); + return TEST_FAIL; + } + } + else + { + collArgs.inputGpu.ClearGpuMem(numInputBytes); + } + + // Each rank receive a portion of the input + memcpy(collArgs.expected.U1, tempInput.U1 + (collArgs.globalRank * numOutputBytes), numOutputBytes); + + tempInput.FreeCpuMem(); + return TEST_SUCCESS; + } + + ErrCode DefaultPrepData_AllToAll(CollectiveArgs &collArgs) + { + CHECK_CALL(CheckAllocation(collArgs)); + if (collArgs.numInputElements != collArgs.numOutputElements) + { + ERROR("Number of input elements must match number of output elements for AllToAll\n"); + return TEST_FAIL; + } + if (collArgs.numInputElements % collArgs.totalRanks) + { + ERROR("Input / Output size for AllToAll must be a multiple of %d\n", collArgs.totalRanks); + return TEST_FAIL; + } + size_t const numInputBytes = collArgs.numInputElements * DataTypeToBytes(collArgs.dataType); + size_t const numOutputBytes = 
collArgs.numOutputElements * DataTypeToBytes(collArgs.dataType); + size_t const numBytes = numInputBytes / collArgs.totalRanks; + + // Clear outputs on all ranks (prior to input in case of in-place) + collArgs.outputGpu.ClearGpuMem(numOutputBytes); + + // Generate input on root rank - each rank will receive a portion + PtrUnion tempInput; + tempInput.Attach(collArgs.outputCpu); + + for (int rank = 0; rank < collArgs.totalRanks; ++rank) + { + tempInput.FillPattern(collArgs.dataType, collArgs.numInputElements, rank, false); + + // Copy input + if (rank == collArgs.globalRank) + { + CHECK_HIP(hipMemcpy(collArgs.inputGpu.ptr, tempInput.ptr, numInputBytes, hipMemcpyHostToDevice)); + } + memcpy(collArgs.expected.U1 + (numBytes * rank), tempInput.U1 + (numBytes * collArgs.globalRank), numBytes); + } + return TEST_SUCCESS; + } +} diff --git a/test/common/PrepDataFuncs.hpp b/test/common/PrepDataFuncs.hpp new file mode 100644 index 0000000000..34b17c30d4 --- /dev/null +++ b/test/common/PrepDataFuncs.hpp @@ -0,0 +1,26 @@ + /************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ +#pragma once +#include "ErrCode.hpp" + +namespace RcclUnitTesting +{ + class CollectiveArgs; + + // Checks that enough memory has been allocated + ErrCode CheckAllocation(CollectiveArgs const& collArgs); + + // Default PrepareData functions + // PrepareData functions are responsible for setting up input / expected for the given collArgs + ErrCode DefaultPrepareDataFunc(CollectiveArgs &collArgs); + ErrCode DefaultPrepData_Broadcast(CollectiveArgs &collArgs); + ErrCode DefaultPrepData_Reduce(CollectiveArgs &collArgs, bool const isAllReduce); + ErrCode DefaultPrepData_Gather(CollectiveArgs &collArgs, bool const isAllGather); + ErrCode DefaultPrepData_ReduceScatter(CollectiveArgs &collArgs); + ErrCode DefaultPrepData_Scatter(CollectiveArgs &collArgs); + ErrCode DefaultPrepData_AllToAll(CollectiveArgs &collArgs); + ErrCode DefaultPrepData_SendRecv(CollectiveArgs &collArgs); +} diff --git a/test/common/PtrUnion.cpp b/test/common/PtrUnion.cpp new file mode 100644 index 0000000000..82eec6a524 --- /dev/null +++ b/test/common/PtrUnion.cpp @@ -0,0 +1,354 @@ +#include "PtrUnion.hpp" + +namespace RcclUnitTesting +{ + size_t DataTypeToBytes(ncclDataType_t const dataType) + { + switch (dataType) + { + case ncclInt8: return 1; + case ncclUint8: return 1; + case ncclInt32: return 4; + case ncclUint32: return 4; + case ncclInt64: return 8; + case ncclUint64: return 8; + case ncclFloat16: return 2; + case ncclFloat32: return 4; + case ncclFloat64: return 8; + case ncclBfloat16: return 2; + default: + ERROR("Unsupported datatype (%d)\n", dataType); + exit(0); + } + } + + ErrCode PtrUnion::Attach(void *ptr) + { + this->ptr = ptr; + return TEST_SUCCESS; + } + + ErrCode PtrUnion::Attach(PtrUnion ptrUnion) + { + this->ptr = ptrUnion.ptr; + return TEST_SUCCESS; + } + + ErrCode PtrUnion::AllocateGpuMem(size_t const numBytes, bool const useManagedMem) + { + if 
(numBytes) + { + if (useManagedMem) + { + if (hipMallocManaged(&I1, numBytes) != hipSuccess) + { + ERROR("Unable to allocate managed memory of GPU memory (%lu bytes)\n", numBytes); + return TEST_FAIL; + } + } + else + { + if (hipMalloc(&I1, numBytes) != hipSuccess) + { + ERROR("Unable to allocate memory of GPU memory (%lu bytes)\n", numBytes); + return TEST_FAIL; + } + } + } + return TEST_SUCCESS; + } + + ErrCode PtrUnion::AllocateCpuMem(size_t const numBytes) + { + if (numBytes) + { + this->ptr = calloc(numBytes, 1); + if (!ptr) + { + ERROR("Unable to allocate memory (%lu bytes)\n", numBytes); + return TEST_FAIL; + } + } + return TEST_SUCCESS; + } + + ErrCode PtrUnion::FreeGpuMem() + { + if (this->ptr != nullptr) + { + hipFree(this->ptr); + this->ptr = nullptr; + } + return TEST_SUCCESS; + } + + ErrCode PtrUnion::FreeCpuMem() + { + if (this->ptr != nullptr) + { + free(this->ptr); + this->ptr = nullptr; + } + return TEST_SUCCESS; + } + + ErrCode PtrUnion::ClearGpuMem(size_t const numBytes) + { + if (hipMemset(this->ptr, 0, numBytes) != hipSuccess) + { + ERROR("Unable to call hipMemset\n"); + return TEST_FAIL; + } + return TEST_SUCCESS; + } + + ErrCode PtrUnion::ClearCpuMem(size_t const numBytes) + { + memset(this->ptr, 0, numBytes); + return TEST_SUCCESS; + } + + ErrCode PtrUnion::FillPattern(ncclDataType_t const dataType, + size_t const numElements, + int const globalRank, + bool const isGpuMem) + { + PtrUnion temp; + size_t const numBytes = numElements * DataTypeToBytes(dataType); + + // If this is GPU memory, create a CPU temp buffer otherwise fill CPU memory directly + if (isGpuMem) + temp.AllocateCpuMem(numBytes); + else + temp.Attach(this->ptr); + + for (int i = 0; i < numElements; i++) + { + int valueI = (globalRank + i) % 256; + double valueF = 1.0L/((double)valueI+1.0L); + temp.Set(dataType, i, valueI, valueF); + } + + // If this is GPU memory, copy from CPU temp buffer + if (isGpuMem) + { + if (hipMemcpy(this->ptr, temp.ptr, numBytes, 
hipMemcpyHostToDevice) != hipSuccess) + { + ERROR("Unable to fill input with pattern for rank %d\n", globalRank); + return TEST_FAIL; + } + temp.FreeCpuMem(); + } + + return TEST_SUCCESS; + } + + ErrCode PtrUnion::Set(ncclDataType_t const dataType, int const idx, int valueI, double valueF) + { + switch (dataType) + { + case ncclInt8: I1[idx] = valueI; break; + case ncclUint8: U1[idx] = valueI; break; + case ncclInt32: I4[idx] = valueI; break; + case ncclUint32: U4[idx] = valueI; break; + case ncclInt64: I8[idx] = valueI; break; + case ncclUint64: U8[idx] = valueI; break; + case ncclFloat32: F4[idx] = valueF; break; + case ncclFloat64: F8[idx] = valueF; break; + case ncclBfloat16: B2[idx] = rccl_bfloat16(static_cast(valueF)); break; + default: + ERROR("Unsupported datatype\n"); + return TEST_FAIL; + } + return TEST_SUCCESS; + } + + ErrCode PtrUnion::Get(ncclDataType_t const dataType, int const idx, int& valueI, double& valueF) const + { + switch (dataType) + { + case ncclInt8: valueI = I1[idx]; break; + case ncclUint8: valueI = I1[idx]; break; + case ncclInt32: valueI = I4[idx]; break; + case ncclUint32: valueI = U4[idx]; break; + case ncclInt64: valueI = I8[idx]; break; + case ncclUint64: valueI = U8[idx]; break; + case ncclFloat32: valueF = F4[idx]; break; + case ncclFloat64: valueF = F8[idx]; break; + case ncclBfloat16: valueF = B2[idx]; break; + default: + ERROR("Unsupported datatype\n"); + return TEST_FAIL; + } + return TEST_SUCCESS; + } + + // Multiplies in-place each element by scalarsPerRank[rank] + ErrCode PtrUnion::Scale(ncclDataType_t const dataType, + size_t const numElements, + PtrUnion const& scalarsPerRank, + int const rank) + { + // If no scalars are provided do nothing + if (scalarsPerRank.ptr == nullptr) return TEST_SUCCESS; + + for (size_t idx = 0; idx < numElements; ++idx) + { + switch (dataType) + { + case ncclInt8: I1[idx] *= scalarsPerRank.I1[rank]; break; + case ncclUint8: U1[idx] *= scalarsPerRank.U1[rank]; break; + case ncclInt32: I4[idx] 
*= scalarsPerRank.I4[rank]; break; + case ncclUint32: U4[idx] *= scalarsPerRank.U4[rank]; break; + case ncclInt64: I8[idx] *= scalarsPerRank.I8[rank]; break; + case ncclUint64: U8[idx] *= scalarsPerRank.U8[rank]; break; + case ncclFloat32: F4[idx] *= scalarsPerRank.F4[rank]; break; + case ncclFloat64: F8[idx] *= scalarsPerRank.F8[rank]; break; + case ncclBfloat16: B2[idx] *= scalarsPerRank.B2[rank]; break; + default: + ERROR("Unsupported datatype\n"); + return TEST_FAIL; + } + } + return TEST_SUCCESS; + } + + ErrCode PtrUnion::Reduce(ncclDataType_t const dataType, + size_t const numElements, + PtrUnion const& inputCpu, + ncclRedOp_t const op) + { + if (inputCpu.ptr == nullptr) + { + ERROR("Input pointer to Reduce should not be nullptr\n"); + return TEST_FAIL; + } + + for (size_t idx = 0; idx < numElements; ++idx) + { + switch (dataType) + { + case ncclInt8: I1[idx] = ReduceOp(op, I1[idx], inputCpu.I1[idx]); break; + case ncclUint8: U1[idx] = ReduceOp(op, U1[idx], inputCpu.U1[idx]); break; + case ncclInt32: I4[idx] = ReduceOp(op, I4[idx], inputCpu.I4[idx]); break; + case ncclUint32: U4[idx] = ReduceOp(op, U4[idx], inputCpu.U4[idx]); break; + case ncclInt64: I8[idx] = ReduceOp(op, I8[idx], inputCpu.I8[idx]); break; + case ncclUint64: U8[idx] = ReduceOp(op, U8[idx], inputCpu.U8[idx]); break; + case ncclFloat32: F4[idx] = ReduceOp(op, F4[idx], inputCpu.F4[idx]); break; + case ncclFloat64: F8[idx] = ReduceOp(op, F8[idx], inputCpu.F8[idx]); break; + case ncclBfloat16: B2[idx] = ReduceOp(op, B2[idx], inputCpu.B2[idx]); break; + default: + ERROR("Unsupported datatype\n"); + return TEST_FAIL; + } + } + return TEST_SUCCESS; + } + + + ErrCode PtrUnion::DivideByInt(ncclDataType_t const dataType, + size_t const numElements, + int const divisor) + { + for (size_t idx = 0; idx < numElements; ++idx) + { + switch (dataType) + { + case ncclInt8: I1[idx] /= divisor; break; + case ncclUint8: U1[idx] /= divisor; break; + case ncclInt32: I4[idx] /= divisor; break; + case ncclUint32: 
U4[idx] /= divisor; break; + case ncclInt64: I8[idx] /= divisor; break; + case ncclUint64: U8[idx] /= divisor; break; + case ncclFloat32: F4[idx] /= divisor; break; + case ncclFloat64: F8[idx] /= divisor; break; + case ncclBfloat16: B2[idx] = (rccl_bfloat16((float)(B2[idx]) / divisor)); break; + default: + ERROR("Unsupported datatype\n"); + return TEST_FAIL; + } + } + return TEST_SUCCESS; + } + + ErrCode PtrUnion::IsEqual(ncclDataType_t const dataType, + size_t const numElements, + PtrUnion const& expected, + bool const verbose, + bool& isMatch) + { + isMatch = true; + size_t idx = 0; + for (idx = 0; idx < numElements; ++idx) + { + switch (dataType) + { + case ncclInt8: isMatch = (I1[idx] == expected.I1[idx]); break; + case ncclUint8: isMatch = (U1[idx] == expected.U1[idx]); break; + case ncclInt32: isMatch = (I4[idx] == expected.I4[idx]); break; + case ncclUint32: isMatch = (U4[idx] == expected.U4[idx]); break; + case ncclInt64: isMatch = (I8[idx] == expected.I8[idx]); break; + case ncclUint64: isMatch = (U8[idx] == expected.U8[idx]); break; + case ncclFloat32: isMatch = (fabs(F4[idx] - expected.F4[idx]) < 1e-5); break; + case ncclFloat64: isMatch = (fabs(F8[idx] - expected.F8[idx]) < 1e-12); break; + case ncclBfloat16: isMatch = (fabs((float)B2[idx] - (float)expected.B2[idx]) < 9e-2); break; + default: + ERROR("Unsupported datatype\n"); + return TEST_FAIL; + } + if (!isMatch) break; + } + + if (verbose && !isMatch) + { + switch (dataType) + { + case ncclInt8: + ERROR("Expected output: %d. Actual output: %d at index %lu\n", expected.I1[idx], I1[idx], idx); break; + case ncclUint8: + ERROR("Expected output: %u. Actual output: %u at index %lu\n", expected.U1[idx], U1[idx], idx); break; + case ncclInt32: + ERROR("Expected output: %d. Actual output: %d at index %lu\n", expected.I4[idx], I4[idx], idx); break; + case ncclUint32: + ERROR("Expected output: %u. 
Actual output: %u at index %lu\n", expected.U4[idx], U4[idx], idx); break; + case ncclInt64: + ERROR("Expected output: %ld. Actual output: %ld at index %lu\n", expected.I8[idx], I8[idx], idx); break; + case ncclUint64: + ERROR("Expected output: %lu. Actual output: %lu at index %lu\n", expected.U8[idx], U8[idx], idx); break; + case ncclFloat32: + ERROR("Expected output: %f. Actual output: %f at index %lu\n", expected.F4[idx], F4[idx], idx); break; + case ncclFloat64: + ERROR("Expected output: %lf. Actual output: %lf at index %lu\n", expected.F8[idx], F8[idx], idx); break; + case ncclBfloat16: + ERROR("Expected output: %f. Actual output: %f at index %lu\n", (float)expected.B2[idx], (float)B2[idx], idx); break; + default: + break; + } + } + return TEST_SUCCESS; + } + + std::string PtrUnion::ToString(ncclDataType_t const dataType, + size_t const numElements) const + { + std::stringstream ss; + for (int i = 0; i < numElements; i++) + { + if (i) ss << " "; + switch (dataType) + { + case ncclInt8: ss << I1[i]; break; + case ncclUint8: ss << U1[i]; break; + case ncclInt32: ss << I4[i]; break; + case ncclUint32: ss << U4[i]; break; + case ncclInt64: ss << I8[i]; break; + case ncclUint64: ss << U8[i]; break; + case ncclFloat32: ss << F4[i]; break; + case ncclFloat64: ss << F8[i]; break; + case ncclBfloat16: ss << (float)B2[i]; break; + default: break; + } + } + return ss.str(); + } +} diff --git a/test/common/PtrUnion.hpp b/test/common/PtrUnion.hpp new file mode 100644 index 0000000000..36e5d6ec69 --- /dev/null +++ b/test/common/PtrUnion.hpp @@ -0,0 +1,90 @@ +#pragma once +#include "ErrCode.hpp" +#include "rccl.h" +#include "rccl_bfloat16.h" + +namespace RcclUnitTesting +{ + // Performs the various basic reduction operations + template + T ReduceOp(ncclRedOp_t const op, T const A, T const B) + { + switch (op) + { + case ncclSum: return A + B; + case ncclProd: return A * B; + case ncclMax: return std::max(A, B); + case ncclMin: return std::min(A, B); + default: + 
ERROR("Unsupported reduction operator (%d)\n", op); + exit(0); + } + } + + size_t DataTypeToBytes(ncclDataType_t const dataType); + + // PtrUnion encapsulates a pointer of all the different supported datatypes + // NOTE: Currently half-precision float tests are unsupported due to half + // being supported on GPU only and not host + union PtrUnion + { + void* ptr; + int8_t* I1; // ncclInt8 + uint8_t* U1; // ncclUint8 + int32_t* I4; // ncclInt32 + uint32_t* U4; // ncclUint32 + int64_t* I8; // ncclInt64 + uint64_t* U8; // ncclUint64 + float* F4; // ncclFloat32 + double* F8; // ncclFloat64 + rccl_bfloat16* B2; // ncclBfloat16 + + ErrCode Attach(void *ptr); + ErrCode Attach(PtrUnion ptrUnion); + + ErrCode AllocateGpuMem(size_t const numBytes, bool const useManagedMem = false); + ErrCode AllocateCpuMem(size_t const numBytes); + + ErrCode FreeGpuMem(); + ErrCode FreeCpuMem(); + + ErrCode ClearGpuMem(size_t const numBytes); + ErrCode ClearCpuMem(size_t const numBytes); + + ErrCode FillPattern(ncclDataType_t const dataType, + size_t const numElements, + int const globalRank, + bool const isGpuMem); + + ErrCode Set(ncclDataType_t const dataType, int const idx, int valueI, double valueF); + ErrCode Get(ncclDataType_t const dataType, int const idx, int& valueI, double& valueF) const; + + // Multiplies in-place each element by scalarsPerRank[rank] + ErrCode Scale(ncclDataType_t const dataType, + size_t const numElements, + PtrUnion const& scalarsPerRank, + int const rank); + + // Reduces input into this PtrUnion + ErrCode Reduce(ncclDataType_t const dataType, + size_t const numElements, + PtrUnion const& inputCpu, + ncclRedOp_t const op); + + // Divide each element by a integer value + ErrCode DivideByInt(ncclDataType_t const dataType, + size_t const numElements, + int const divisor); + + // Compares for equality (fuzzy comparision for floating point types) + ErrCode IsEqual(ncclDataType_t const dataType, + size_t const numElements, + PtrUnion const& expected, + bool const 
verbose, + bool& isMatch); + + // Output to string (for debug) + std::string ToString(ncclDataType_t const dataType, + size_t const numElements) const; + }; +} diff --git a/test/common/TestBed.cpp b/test/common/TestBed.cpp new file mode 100644 index 0000000000..53539cb251 --- /dev/null +++ b/test/common/TestBed.cpp @@ -0,0 +1,485 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ +#include +#include "TestBed.hpp" +#include + +#define PIPE_WRITE(childId, val) \ + ASSERT_EQ(write(childList[childId]->parentWriteFd, &val, sizeof(val)), sizeof(val)) + +#define PIPE_CHECK(childId) \ + { \ + int response = 0; \ + ASSERT_EQ(read(childList[childId]->parentReadFd, &response, sizeof(int)), sizeof(int)); \ + ASSERT_EQ(response, TEST_SUCCESS); \ + } + +namespace RcclUnitTesting +{ + TestBed::TestBed() : + numDevicesAvailable(0), + numActiveChildren(0), + numActiveRanks(0) + { + // Set NCCL_COMM_ID to use a local port to avoid passing ncclCommId + // Calling ncclGetUniqueId would initialize HIP, which should not be done prior to fork + std::string localPort = "55513"; + if (!getenv("NCCL_COMM_ID")) + { + char hostname[HOST_NAME_MAX+1]; + gethostname(hostname, HOST_NAME_MAX+1); + std::string hostnameString(hostname); + hostnameString.append(":55513"); + setenv("NCCL_COMM_ID", hostnameString.c_str(), 0); + if (ev.verbose) INFO("NCCL_COMM_ID set to %s\n", hostnameString.c_str()); + } + + // Collect the number of GPUs + this->numDevicesAvailable = ev.maxGpus; + if (ev.verbose) INFO("Detected %d GPUs\n", this->numDevicesAvailable); + + // Create the maximum number of possible child processes (1 per GPU) + // Parent and child communicate via pipes + childList.resize(this->numDevicesAvailable); + for (int childId = 0; childId < this->numDevicesAvailable; ++childId) 
+ { + childList[childId] = new TestBedChild(childId, ev.verbose, ev.printValues); + if (childList[childId]->InitPipes() != TEST_SUCCESS) + { + ERROR("Unable to create pipes to child process\n"); + return; + } + + pid_t pid = fork(); + if (pid == 0) + { + // Child process enters execution loop + childList[childId]->StartExecutionLoop(); + return; + } + else + { + // Parent records child process ID and closes unused ends of pipe + childList[childId]->pid = pid; + close(childList[childId]->childWriteFd); + close(childList[childId]->childReadFd); + } + } + } + + void TestBed::InitComms(std::vector> const& deviceIdsPerProcess, + int const numCollectivesInGroup) + { + // Count up the total number of GPUs to use and track child/deviceId per rank + this->numActiveChildren = deviceIdsPerProcess.size(); + this->numActiveRanks = 0; + this->numCollectivesInGroup = numCollectivesInGroup; + this->rankToChildMap.clear(); + this->rankToDeviceMap.clear(); + if (ev.verbose) INFO("Setting up %d active child processes\n", this->numActiveChildren); + for (int childId = 0; childId < this->numActiveChildren; ++childId) + { + for (auto i = 0; i < deviceIdsPerProcess[childId].size(); ++i) + { + this->rankToChildMap.push_back(childId); + this->rankToDeviceMap.push_back(deviceIdsPerProcess[childId][i]); + ++this->numActiveRanks; + } + } + + // Send InitComms command to each active child process + int const cmd = TestBedChild::CHILD_INIT_COMMS; + int rankOffset = 0; + for (int childId = 0; childId < this->numActiveChildren; ++childId) + { + PIPE_WRITE(childId, cmd); + + // Send total number of ranks to child process + PIPE_WRITE(childId, this->numActiveRanks); + + // Send the rank offset for this child process + PIPE_WRITE(childId, rankOffset); + + // Send the number of collectives to be run per group call + PIPE_WRITE(childId, numCollectivesInGroup); + + // Send the GPUs this child uses + int const numGpus = deviceIdsPerProcess[childId].size(); + PIPE_WRITE(childId, numGpus); + for (int i = 
0; i < numGpus; i++) + PIPE_WRITE(childId, deviceIdsPerProcess[childId][i]); + + rankOffset += numGpus; + } + + // Wait for child acknowledgement + // This is done after previous loop to avoid deadlock as every rank needs to enter ncclInitCommRank + for (int childId = 0; childId < this->numActiveChildren; ++childId) + { + PIPE_CHECK(childId); + } + } + + void TestBed::InitComms(int const numGpus, int const numCollectivesInGroup) + { + InitComms(TestBed::GetDeviceIdsList(1, numGpus), numCollectivesInGroup); + } + + void TestBed::SetCollectiveArgs(ncclFunc_t const funcType, + ncclDataType_t const dataType, + ncclRedOp_t const redOp, + int const root, + size_t const numInputElements, + size_t const numOutputElements, + int const collId, + int const rank, + PtrUnion const scalarsPerRank, + int const scalarMode) + { + // Build list of ranks this applies to (-1 for rank means to set for all) + std::vector rankList; + for (int i = 0; i < this->numActiveRanks; ++i) + if (rank == -1 || rank == i) rankList.push_back(i); + + ScalarTransport scalarTransport; + if (scalarMode >= 0) + { + ASSERT_TRUE(scalarsPerRank.ptr != NULL); + + // Capture scalars per rank in format to share with child processes + int const numBytes = this->numActiveRanks * DataTypeToBytes(dataType); + memcpy(scalarTransport.ptr, scalarsPerRank.ptr, numBytes); + } + + // Loop over all ranks and send CollectiveArgs to appropriate child process + int const cmd = TestBedChild::CHILD_SET_COLL_ARGS; + for (auto currRank : rankList) + { + int const childId = rankToChildMap[currRank]; + PIPE_WRITE(childId, cmd); + PIPE_WRITE(childId, currRank); + PIPE_WRITE(childId, collId); + PIPE_WRITE(childId, funcType); + PIPE_WRITE(childId, dataType); + PIPE_WRITE(childId, redOp); + PIPE_WRITE(childId, root); + PIPE_WRITE(childId, numInputElements); + PIPE_WRITE(childId, numOutputElements); + PIPE_WRITE(childId, scalarMode); + PIPE_WRITE(childId, scalarTransport); + PIPE_CHECK(childId); + } + } + + void 
TestBed::AllocateMem(bool const inPlace, + bool const useManagedMem, + int const collId, + int const rank) + { + // Build list of ranks this applies to (-1 for rank means to set for all) + std::vector rankList; + for (int i = 0; i < this->numActiveRanks; ++i) + if (rank == -1 || rank == i) rankList.push_back(i); + + // Loop over all ranks and send allocation command to appropriate child process + int const cmd = TestBedChild::CHILD_ALLOCATE_MEM; + for (auto currRank : rankList) + { + int const childId = rankToChildMap[currRank]; + PIPE_WRITE(childId, cmd); + PIPE_WRITE(childId, currRank); + PIPE_WRITE(childId, collId); + PIPE_WRITE(childId, inPlace); + PIPE_WRITE(childId, useManagedMem); + PIPE_CHECK(childId); + } + } + + void TestBed::PrepareData(int const collId, + int const rank, + CollFuncPtr const prepDataFunc) + { + // Build list of ranks this applies to (-1 for rank means to set for all) + std::vector rankList; + for (int i = 0; i < this->numActiveRanks; ++i) + if (rank == -1 || rank == i) rankList.push_back(i); + + // Loop over all ranks and send prepare data command to appropriate child process + int const cmd = TestBedChild::CHILD_PREPARE_DATA; + for (auto currRank : rankList) + { + int const childId = rankToChildMap[currRank]; + PIPE_WRITE(childId, cmd); + PIPE_WRITE(childId, currRank); + PIPE_WRITE(childId, collId); + PIPE_WRITE(childId, prepDataFunc); + PIPE_CHECK(childId); + } + } + + void TestBed::ExecuteCollectives() + { + int const cmd = TestBedChild::CHILD_EXECUTE_COLL; + ++TestBed::NumTestsRun(); + + // Send ExecuteColl command to each active child process + for (int childId = 0; childId < this->numActiveChildren; ++childId) + { + PIPE_WRITE(childId, cmd); + } + + // Wait for child acknowledgement + for (int childId = 0; childId < this->numActiveChildren; ++childId) + { + PIPE_CHECK(childId); + } + } + + void TestBed::ValidateResults(bool& isCorrect, int const collId, int const rank) + { + // Build list of ranks this applies to (-1 for rank means 
to set for all) + std::vector rankList; + for (int i = 0; i < this->numActiveRanks; ++i) + if (rank == -1 || rank == i) rankList.push_back(i); + + int const cmd = TestBedChild::CHILD_VALIDATE_RESULTS; + + isCorrect = true; + // Send ValidateResults command to each active child process + for (auto currRank : rankList) + { + int const childId = rankToChildMap[currRank]; + PIPE_WRITE(childId, cmd); + PIPE_WRITE(childId, currRank); + PIPE_WRITE(childId, collId); + + int response = 0; + ASSERT_EQ(read(childList[childId]->parentReadFd, &response, sizeof(int)), sizeof(int)); + isCorrect &= (response == TEST_SUCCESS); + } + + ASSERT_EQ(isCorrect, true) << "Output does not match expected"; + } + + void TestBed::DeallocateMem(int const collId, int const rank) + { + // Build list of ranks this applies to (-1 for rank means to set for all) + std::vector rankList; + for (int i = 0; i < this->numActiveRanks; ++i) + if (rank == -1 || rank == i) rankList.push_back(i); + + int const cmd = TestBedChild::CHILD_DEALLOCATE_MEM; + + for (auto currRank : rankList) + { + int const childId = rankToChildMap[currRank]; + PIPE_WRITE(childId, cmd); + PIPE_WRITE(childId, currRank); + PIPE_WRITE(childId, collId); + PIPE_CHECK(childId); + } + } + + void TestBed::DestroyComms() + { + int const cmd = TestBedChild::CHILD_DESTROY_COMMS; + for (int childId = 0; childId < this->numActiveChildren; ++childId) + { + // Send DestroyComms command to each active child process + PIPE_WRITE(childId, cmd); + + // Wait for child acknowledgement + PIPE_CHECK(childId); + } + + // Reset bookkeeping + this->numActiveChildren = 0; + this->numActiveRanks = 0; + this->numCollectivesInGroup = 0; + } + + void TestBed::Finalize() + { + // Send Stop to all child processes + int const cmd = TestBedChild::CHILD_STOP; + for (int childId = 0; childId < this->numDevicesAvailable; ++childId) + { + PIPE_WRITE(childId, cmd); + + // Close pipes to child process + close(childList[childId]->parentWriteFd); + 
close(childList[childId]->parentReadFd); + } + this->numDevicesAvailable = 0; + } + + TestBed::~TestBed() + { + Finalize(); + } + + std::vector const& TestBed::GetAllSupportedRedOps() + { + return ev.GetAllSupportedRedOps(); + } + + std::vector const& TestBed::GetAllSupportedDataTypes() + { + return ev.GetAllSupportedDataTypes(); + } + + std::vector> TestBed::GetDeviceIdsList(int const numProcesses, + int const numGpus) + { + std::vector> result(numProcesses); + for (int i = 0; i < numGpus; i++) + result[i % numProcesses].push_back(i); + return result; + } + + std::string TestBed::GetTestCaseName(int const totalRanks, + bool const isMultiProcess, + ncclFunc_t const funcType, + ncclDataType_t const dataType, + ncclRedOp_t const redOp, + int const root, + bool const inPlace, + bool const managedMem) + { + std::stringstream ss; + ss << (isMultiProcess ? "MP" : "SP") << " "; + ss << totalRanks << " ranks "; + ss << ncclFuncNames[funcType] << " "; + ss << "(" << (inPlace ? "IP" : "OP") << "," << (managedMem ? 
"MM" : "GM") << ") "; + ss << ncclDataTypeNames[dataType] << " "; + if (CollectiveArgs::UsesReduce(funcType)) ss << ncclRedOpNames[redOp] << " "; + if (CollectiveArgs::UsesRoot(funcType)) ss << "Root " << root << " "; + return ss.str(); + } + + void TestBed::RunSimpleSweep(std::vector const& funcTypes, + std::vector const& tmpDataTypes, + std::vector const& tmpRedOps, + std::vector const& roots, + std::vector const& numElements, + std::vector const& inPlaceList, + std::vector const& managedMemList) + { + // Sort numElements in descending order to cut down on # of allocations + std::vector sortedN = numElements; + std::sort(sortedN.rbegin(), sortedN.rend()); + + // Filter out any unsupported datatypes, in case only subset has been compiled for + std::vector const& supportedDataTypes = this->GetAllSupportedDataTypes(); + std::vector dataTypes; + for (auto dt : tmpDataTypes) + { + for (int i = 0; i < supportedDataTypes.size(); ++i) + { + if (supportedDataTypes[i] == dt) + { + dataTypes.push_back(dt); + break; + } + } + } + + // Filter out any unsupported reduction ops, in case only subset has been compiled for + std::vector const& supportedOps = this->GetAllSupportedRedOps(); + std::vector redOps; + for (auto redop : tmpRedOps) + { + for (int i = 0; i < supportedOps.size(); ++i) + { + if (supportedOps[i] == redop) + { + redOps.push_back(redop); + break; + } + } + } + + bool isCorrect = true; + + // Sweep over the number of ranks + for (int totalRanks = ev.minGpus; totalRanks <= ev.maxGpus && isCorrect; ++totalRanks) + for (int isMultiProcess = 0; isMultiProcess <= 1 && isCorrect; ++isMultiProcess) + { + if (!(ev.processMask & (1 << isMultiProcess))) continue; + + // Test either single process all GPUs, or 1 process per GPU + int const numProcesses = isMultiProcess ? 
totalRanks : 1; + this->InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks)); + + for (int ftIdx = 0; ftIdx < funcTypes.size() && isCorrect; ++ftIdx) + for (int dtIdx = 0; dtIdx < dataTypes.size() && isCorrect; ++dtIdx) + for (int rdIdx = 0; rdIdx < redOps.size() && isCorrect; ++rdIdx) + for (int rtIdx = 0; rtIdx < roots.size() && isCorrect; ++rtIdx) + for (int ipIdx = 0; ipIdx < inPlaceList.size() && isCorrect; ++ipIdx) + for (int mmIdx = 0; mmIdx < managedMemList.size() && isCorrect; ++mmIdx) + { + if (ev.showNames) + { + std::string name = this->GetTestCaseName(totalRanks, isMultiProcess, + funcTypes[ftIdx], dataTypes[dtIdx], + redOps[rdIdx], roots[rtIdx], + inPlaceList[ipIdx], managedMemList[mmIdx]); + INFO("%s\n", name.c_str()); + } + + for (int neIdx = 0; neIdx < numElements.size() && isCorrect; ++neIdx) + { + int numInputElements, numOutputElements; + CollectiveArgs::GetNumElementsForFuncType(funcTypes[ftIdx], + sortedN[neIdx], + totalRanks, + &numInputElements, + &numOutputElements); + + this->SetCollectiveArgs(funcTypes[ftIdx], + dataTypes[dtIdx], + redOps[rdIdx], + roots[rtIdx], + numInputElements, + numOutputElements); + + // Only allocate once for largest size + if (neIdx == 0) this->AllocateMem(inPlaceList[ipIdx], managedMemList[mmIdx]); + + // There are some cases when data does not need to be re-prepared + // e.g. 
AllReduce subarray expected results are still valid + bool canSkip = (neIdx != 0 && !inPlaceList[ipIdx] && + (funcTypes[ftIdx] == ncclCollBroadcast || + funcTypes[ftIdx] == ncclCollReduce || + funcTypes[ftIdx] == ncclCollAllReduce)); + if (!canSkip) this->PrepareData(); + + this->ExecuteCollectives(); + this->ValidateResults(isCorrect); + if (!isCorrect) + { + std::string name = this->GetTestCaseName(totalRanks, isMultiProcess, + funcTypes[ftIdx], dataTypes[dtIdx], + redOps[rdIdx], roots[rtIdx], + inPlaceList[ipIdx], managedMemList[mmIdx]); + ERROR("Incorrect output for %s\n", name.c_str()); + } + } + this->DeallocateMem(); + } + this->DestroyComms(); + } + } + + int& TestBed::NumTestsRun() + { + static int numTestsRun = 0; + return numTestsRun; + } +} + +#undef PIPE_WRITE +#undef PIPE_CHECK diff --git a/test/common/TestBed.hpp b/test/common/TestBed.hpp new file mode 100644 index 0000000000..b4555fcab9 --- /dev/null +++ b/test/common/TestBed.hpp @@ -0,0 +1,129 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#pragma once +#include +#include "CollectiveArgs.hpp" +#include "TestBedChild.hpp" +#include "EnvVars.hpp" +#include + +namespace RcclUnitTesting +{ + // This class facilitates testing RCCL collectives across various process / device configurations + // + class TestBed + { + public: + int numDevicesAvailable; // # of devices detected on node + std::vector childList; // List of child processes + std::vector rankToChildMap; // Tracks which child process each rank is assigned to + std::vector rankToDeviceMap; // Tracks which device each rank is assigned to + int numActiveChildren; // List of active children (with usable RCCL comms) + int numActiveRanks; // Current # of ranks in use + int numCollectivesInGroup; // # of collectives to execute per group call + + EnvVars ev; // Environment variables + + // Constructor - Creates one child process per detected GPU device that waits for further commands + TestBed(); + + // Prepare TestBed for use with GPUs across multiple child processes + void InitComms(std::vector> const& deviceIdsPerChild, int const numCollectivesInGroup = 1); + // Prepare TestBed for use with GPUs on a single child process + void InitComms(int const numGpus, int const numCollectivesInGroup = 1); + + // Set collectives arguments for specified collective / rank + // Setting scalarsPerRank to non-null will create custom reduction operator + // Using collId = -1 (default) applies settings to all collectives in group + // Using rank = -1 (default) applies settings to all ranks + + void SetCollectiveArgs(ncclFunc_t const funcType, + ncclDataType_t const dataType, + ncclRedOp_t const redOp, + int const root, + size_t const numInputElements, + size_t const numOutputElements, + int const collId = -1, + int const rank = -1, + PtrUnion const scalarsPerRank = {nullptr}, + int const scalarMode = -1); + + // Allocate memory for specified 
collective / rank + // - Requires SetCollectiveArgs to have been called already + // Using collId = -1 (default) applies settings to all collectives in group + // Using rank = -1 (default) applies settings to all ranks + void AllocateMem(bool const inPlace = false, + bool const useManagedMem = false, + int const collId = -1, + int const rank = -1); + + // Initialize input and compute expected results + // - requires that SetCollectiveArgs and AllocateMemory have already been called + // Setting collId to -1 applies settings to all collectives in group + // Setting rank to -1 applies settings to all ranks + // Setting prepDataFunc to nullptr uses the default fill pattern routine + void PrepareData(int const collId = -1, + int const rank = -1, + CollFuncPtr const prepDataFunc = nullptr); + + // Execute all collectives on all test children + // Blocks until collective is completed + void ExecuteCollectives(); + + // Perform results validation - compare output to expected + void ValidateResults(bool& isCorrect, int collId = -1, int const rank = -1); + + // Release allocated memory + void DeallocateMem(int collId = -1, int const rank = -1); + + // Release the RCCL comms + void DestroyComms(); + + // Explicit TestBed destructor that releases all child processes + // No further calls to TestBed should be performed after this call + void Finalize(); + + // Destructor - Calls Finalize() to release all child processes + ~TestBed(); + + // Returns all the supported reduction operations based on build settings + std::vector const& GetAllSupportedRedOps(); + + // Return all the supported data types based on build settings + std::vector const& GetAllSupportedDataTypes(); + + // Helper function that splits up GPUs to the given number of processes + static std::vector> GetDeviceIdsList(int const numProcesses, + int const numGpus); + + // Generate a test case name + static std::string GetTestCaseName(int const totalRanks, + bool const isMultiProcess, + ncclFunc_t const funcType, + 
ncclDataType_t const dataType, + ncclRedOp_t const redOp, + int const root, + bool const inPlace, + bool const managedMem); + + // Run a simple sweep + void RunSimpleSweep(std::vector const& funcTypes, + std::vector const& dataTypes, + std::vector const& redOps, + std::vector const& roots, + std::vector const& numElements, + std::vector const& inPlaceList, + std::vector const& managedMemList); + + // Used to track total number of calls to ExecuteCollectives() + static int& NumTestsRun(); + + protected: + // Ends the specified child process + void StopChild(int const childId); + }; +} diff --git a/test/common/TestBedChild.cpp b/test/common/TestBedChild.cpp new file mode 100644 index 0000000000..2e16cc9772 --- /dev/null +++ b/test/common/TestBedChild.cpp @@ -0,0 +1,589 @@ +/************************************************************************* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "TestBedChild.hpp" +#include + +#define CHILD_NCCL_CALL(cmd, msg) \ + { \ + if (this->verbose) printf("[ NCCL CALL] " #cmd "\n"); \ + ncclResult_t status = cmd; \ + if (status != ncclSuccess) \ + { \ + ERROR("Child process %d fails NCCL call %s with code %d\n", this->childId, msg, status); \ + return TEST_FAIL; \ + } \ + } + +#define PIPE_READ(val) \ + if (read(childReadFd, &val, sizeof(val)) != sizeof(val)) return TEST_FAIL; + +namespace RcclUnitTesting +{ + TestBedChild::TestBedChild(int const childId, bool const verbose, int const printValues) + { + this->childId = childId; + this->verbose = verbose; + this->printValues = printValues; + } + + int TestBedChild::InitPipes() + { + // Prepare parent->child pipe + int pipefd[2]; + if (pipe(pipefd) == -1) + { + ERROR("Unable to create parent->child pipe for child %d\n", this->childId); + return TEST_FAIL; + } + this->childReadFd = pipefd[0]; + this->parentWriteFd = 
pipefd[1]; + + // Prepare child->parent pipe + this->parentReadFd = -1; + if (pipe(pipefd) == -1) + { + ERROR("Unable to create parent->child pipe for child %d\n", this->childId); + return TEST_FAIL; + } + this->parentReadFd = pipefd[0]; + this->childWriteFd = pipefd[1]; + return TEST_SUCCESS; + } + + void TestBedChild::StartExecutionLoop() + { + // Close unused ends of pipes + close(this->parentWriteFd); + close(this->parentReadFd); + + // Wait for commands from parent process + if (verbose) INFO("Child %d enters execution loop\n", this->childId); + int command; + while (read(childReadFd, &command, sizeof(command)) > 0) + { + if (verbose) INFO("Child %d received command [%s]:\n", this->childId, ChildCommandNames[command]);; + ErrCode status = TEST_SUCCESS; + switch(command) + { + case CHILD_INIT_COMMS : status = InitComms(); break; + case CHILD_SET_COLL_ARGS : status = SetCollectiveArgs(); break; + case CHILD_ALLOCATE_MEM : status = AllocateMem(); break; + case CHILD_PREPARE_DATA : status = PrepareData(); break; + case CHILD_EXECUTE_COLL : status = ExecuteCollectives(); break; + case CHILD_VALIDATE_RESULTS: status = ValidateResults(); break; + case CHILD_DEALLOCATE_MEM : status = DeallocateMem(); break; + case CHILD_DESTROY_COMMS : status = DestroyComms(); break; + case CHILD_STOP : status = Stop(); break; + default: exit(0); + } + + // Send back acknowledgement to parent + if (status == TEST_FAIL) + ERROR("Child %d failed on command [%s]:\n", this->childId, ChildCommandNames[command]); + write(childWriteFd, &status, sizeof(status)); + } + + // Close child ends of pipe + close(this->childReadFd); + close(this->childWriteFd); + + exit(0); + } + + ErrCode TestBedChild::InitComms() + { + if (this->verbose) INFO("Child %d begins InitComms()\n", this->childId); + + // Read values sent by parent [see TestBed::InitComms()] + PIPE_READ(this->totalRanks); + PIPE_READ(this->rankOffset); + PIPE_READ(this->numCollectivesInGroup); + + // Read the GPUs this child uses and 
prepare storage for collective args / datasets + int numGpus; + PIPE_READ(numGpus); + this->deviceIds.resize(numGpus); + this->streams.resize(numGpus); + this->collArgs.resize(numGpus); + for (int i = 0; i < numGpus; i++) + { + PIPE_READ(this->deviceIds[i]); + this->collArgs[i].clear(); + this->collArgs[i].resize(numCollectivesInGroup); + } + + // Collect uniqueId (specified by NCCL_COMM_ID env var) + ncclUniqueId id; + CHILD_NCCL_CALL(ncclGetUniqueId(&id), "ncclGetUniqueId"); + + // Initialize communicators + comms.clear(); + comms.resize(numGpus); + + // Initialize within a group call to avoid deadlock when using multiple ranks per child + ErrCode status = TEST_SUCCESS; + CHILD_NCCL_CALL(ncclGroupStart(), "ncclGroupStart"); + for (int localRank = 0; localRank < numGpus; ++localRank) + { + int const globalRank = this->rankOffset + localRank; + int const currGpu = this->deviceIds[localRank]; + + if (hipSetDevice(currGpu) != hipSuccess) + { + ERROR("Rank %d on child %d unable to switch to GPU %d\n", globalRank, this->childId, currGpu); + status = TEST_FAIL; + break; + } + + if (hipStreamCreate(&this->streams[localRank]) != hipSuccess) + { + ERROR("Rank %d on child %d unable to create stream for GPU %d\n", globalRank, this->childId, currGpu); + status = TEST_FAIL; + break; + } + + if (ncclCommInitRank(&this->comms[localRank], this->totalRanks, id, globalRank) != ncclSuccess) + { + ERROR("Rank %d on child %d unable to call ncclCommInitRank\n", globalRank, this->childId); + status = TEST_FAIL; + break; + } + } + if (status == TEST_SUCCESS) + { + CHILD_NCCL_CALL(ncclGroupEnd(), "ncclGroupStart"); + } + if (this->verbose) INFO("Child %d finishes InitComms() [%s]\n", + this->childId, status == TEST_SUCCESS ? 
"SUCCESS" : "FAIL");
+    return status;
+  }
+
+  // Receives the description of one collective from the parent process and stores
+  // it in the CollectiveArgs slot(s) of a single rank hosted by this child.
+  // A collId of -1 applies the settings to every collective of that rank.
+  // Returns TEST_FAIL when the requested global rank is not hosted by this child.
+  // NOTE(review): PIPE_READ / CHECK_HIP / CHECK_CALL / CHILD_NCCL_CALL are defined
+  //               outside this view - presumably they return early on failure.
+  ErrCode TestBedChild::SetCollectiveArgs()
+  {
+    if (this->verbose) INFO("Child %d begins SetCollectiveArgs()\n", this->childId);
+
+    // Values are consumed in the order the parent wrote them
+    // [see TestBed::SetCollectiveArgs()]
+    int globalRank;
+    PIPE_READ(globalRank);
+    int collId;
+    PIPE_READ(collId);
+    ncclFunc_t funcType;
+    PIPE_READ(funcType);
+    ncclDataType_t dataType;
+    PIPE_READ(dataType);
+    ncclRedOp_t redOp;
+    PIPE_READ(redOp);
+    int root;
+    PIPE_READ(root);
+    size_t numInputElements;
+    PIPE_READ(numInputElements);
+    size_t numOutputElements;
+    PIPE_READ(numOutputElements);
+    int scalarMode;
+    PIPE_READ(scalarMode);
+    ScalarTransport scalarTransport;
+    PIPE_READ(scalarTransport);
+
+    // NOTE(review): each iteration attaches a fresh, function-local PtrUnion that is
+    //               never read afterwards - looks like a leftover; confirm before removing
+    for (int i = 0; i < this->totalRanks; i++)
+    {
+      PtrUnion scalarsPerRank;
+      scalarsPerRank.Attach(scalarTransport.ptr);
+    }
+
+    // Reject ranks that do not belong to this child
+    bool const rankHostedHere = !(globalRank < this->rankOffset ||
+                                  (this->rankOffset + comms.size() <= globalRank));
+    if (!rankHostedHere)
+    {
+      ERROR("Child %d does not contain rank %d\n", this->childId, globalRank);
+      return TEST_FAIL;
+    }
+    int const localRank = globalRank - rankOffset;
+    CHECK_HIP(hipSetDevice(this->deviceIds[localRank]));
+
+    for (int collIdx = 0; collIdx < collArgs[localRank].size(); ++collIdx)
+    {
+      // Skip collectives that were not selected by the parent
+      if (collId != -1 && collId != collIdx) continue;
+
+      CollectiveArgs& collArg = this->collArgs[localRank][collIdx];
+      CHECK_CALL(collArg.SetArgs(globalRank, this->totalRanks,
+                                 this->deviceIds[localRank],
+                                 funcType, dataType, redOp, root,
+                                 numInputElements, numOutputElements,
+                                 scalarTransport, scalarMode));
+      if (this->verbose) INFO("Rank %d on child %d sets collective %d [%s]\n",
+                              globalRank, this->childId, collIdx,
+                              collArg.GetDescription().c_str());
+
+      // A non-negative scalar mode means pre-mult scalars were provided:
+      // build the matching custom reduction operator
+      if (scalarMode < 0) continue;
+
+      CHILD_NCCL_CALL(ncclRedOpCreatePreMulSum(&collArg.redOp,
+                                               collArg.localScalar.ptr,
+                                               dataType,
+                                               (ncclScalarResidence_t)scalarMode,
+                                               this->comms[localRank]),
+                      "ncclRedOpCreatePreMulSum");
+      if (verbose) INFO("Child %d created custom redop %d for collective %d\n",
+                        this->childId, collArg.redOp, collIdx);
+    }
+    if (this->verbose) INFO("Child %d finishes SetCollectiveArgs()\n", this->childId);
+    return TEST_SUCCESS;
+  }
+
+  ErrCode TestBedChild::AllocateMem()
+  {
+    if (this->verbose) INFO("Child %d begins AllocateMem()\n", this->childId);
+
+    // Read values sent by parent [see TestBed::AllocateMem()]
+    int globalRank;
+    int collId;
+    bool inPlace;
+    bool useManagedMem;
+
+    PIPE_READ(globalRank);
+    PIPE_READ(collId);
+    PIPE_READ(inPlace);
+    PIPE_READ(useManagedMem);
+
+    if (globalRank < this->rankOffset || (this->rankOffset + comms.size() <= globalRank))
+    {
+      ERROR("Child %d does not contain rank %d\n", this->childId, globalRank);
+      return TEST_FAIL;
+    }
+    int const localRank = globalRank - rankOffset;
+    CHECK_HIP(hipSetDevice(this->deviceIds[localRank]));
+
+    for (int collIdx = 0; collIdx < collArgs[localRank].size(); ++collIdx)
+    {
+      if (collId == -1 || collId == collIdx)
+      {
+        CollectiveArgs& collArg = this->collArgs[localRank][collIdx];
+        CHECK_CALL(collArg.AllocateMem(inPlace, useManagedMem));
+        if (this->verbose) INFO("Rank %d on child %d allocates memory for collective %d on device %d (%s,%s) Input: %p Output %p\n",
+                                globalRank, this->childId, collIdx, this->deviceIds[localRank],
+                                inPlace ? "in-place" : "out-of-place",
+                                useManagedMem ? 
"managed" : "unmanaged",
+                                collArg.inputGpu.ptr,
+                                collArg.outputGpu.ptr);
+      }
+    }
+
+    if (this->verbose) INFO("Child %d finishes AllocateMem()\n", this->childId);
+    return TEST_SUCCESS;
+  }
+
+  // Fill input memory with a pre-known pattern based on rank
+  ErrCode TestBedChild::PrepareData()
+  {
+    if (this->verbose) INFO("Child %d begins PrepareData()\n", this->childId);
+
+    // Read values sent by parent [see TestBed::PrepareData()]
+    int globalRank;
+    int collId;
+    CollFuncPtr prepDataFunc;
+
+    PIPE_READ(globalRank);
+    PIPE_READ(collId);
+    PIPE_READ(prepDataFunc);
+
+    if (globalRank < this->rankOffset || (this->rankOffset + comms.size() <= globalRank))
+    {
+      ERROR("Child %d does not contain rank %d\n", this->childId, globalRank);
+      return TEST_FAIL;
+    }
+
+    int const localRank = globalRank - rankOffset;
+    CHECK_HIP(hipSetDevice(this->deviceIds[localRank]));
+
+    for (int collIdx = 0; collIdx < collArgs[localRank].size(); ++collIdx)
+    {
+      if (collId == -1 || collId == collIdx)
+      {
+        if (this->verbose) INFO("Rank %d on child %d prepares data for collective %d\n",
+                                globalRank, this->childId, collIdx);
+        CHECK_CALL(this->collArgs[localRank][collIdx].PrepareData(prepDataFunc));
+      }
+    }
+    if (this->verbose) INFO("Child %d finishes PrepareData()\n", this->childId);
+    return TEST_SUCCESS;
+  }
+
+  // Submits every configured collective of every local rank inside one
+  // ncclGroupStart / ncclGroupEnd pair, then waits for all local streams.
+  // When printValues is non-zero the input / pre-output buffers are dumped before
+  // submission and the output / expected buffers afterwards: a negative value
+  // dumps whole buffers, a positive value dumps at most that many elements.
+  // Returns TEST_FAIL for an unknown collective type.
+  // NOTE(review): CHECK_HIP / CHILD_NCCL_CALL are defined outside this view; if they
+  //               return early, a failure inside the group leaves it un-ended - confirm.
+  ErrCode TestBedChild::ExecuteCollectives()
+  {
+    if (this->verbose) INFO("Child %d begins ExecuteCollectives()\n", this->childId);
+
+    // Start group call
+    CHILD_NCCL_CALL(ncclGroupStart(), "ncclGroupStart");
+
+    // Loop over all collectives to be executed in group call
+    for (int collId = 0; collId < this->numCollectivesInGroup; ++collId)
+    {
+      // Loop over all local ranks
+      for (int localRank = 0; localRank < this->deviceIds.size(); ++localRank)
+      {
+        CHECK_HIP(hipSetDevice(this->deviceIds[localRank]));
+
+        CollectiveArgs const& collArg = this->collArgs[localRank][collId];
+
+        if (this->printValues)
+        {
+          // Never dump more elements than the buffer holds, otherwise the copy
+          // below would read past the end of the input buffer
+          int const numInputElementsToPrint =
+            (this->printValues < 0 || (size_t)this->printValues > collArg.numInputElements)
+              ? collArg.numInputElements : this->printValues;
+          PtrUnion inputCpu;
+          size_t const numInputBytes = numInputElementsToPrint * DataTypeToBytes(collArg.dataType);
+          inputCpu.AllocateCpuMem(numInputBytes);
+          CHECK_HIP(hipMemcpy(inputCpu.ptr, collArg.inputGpu.ptr, numInputBytes, hipMemcpyDeviceToHost));
+          printf("[ DEBUG ] Rank %02d Coll %d %-10s: %s\n", collArg.globalRank, collId, "Input",
+                 inputCpu.ToString(collArg.dataType, numInputElementsToPrint).c_str());
+          inputCpu.FreeCpuMem();
+
+          // Same clamp for the output buffer (outputCpu is used as staging memory)
+          int const numOutputElementsToPrint =
+            (this->printValues < 0 || (size_t)this->printValues > collArg.numOutputElements)
+              ? collArg.numOutputElements : this->printValues;
+          size_t const numOutputBytes = numOutputElementsToPrint * DataTypeToBytes(collArg.dataType);
+          CHECK_HIP(hipMemcpy(collArg.outputCpu.ptr, collArg.outputGpu.ptr, numOutputBytes, hipMemcpyDeviceToHost));
+          printf("[ DEBUG ] Rank %02d Coll %d %-10s: %s\n", collArg.globalRank, collId, "Pre-Output",
+                 collArg.outputCpu.ToString(collArg.dataType, numOutputElementsToPrint).c_str());
+        }
+
+        switch (collArg.funcType)
+        {
+        case ncclCollBroadcast:
+          CHILD_NCCL_CALL(ncclBroadcast(collArg.inputGpu.ptr,
+                                        collArg.outputGpu.ptr,
+                                        collArg.numInputElements,
+                                        collArg.dataType,
+                                        collArg.root,
+                                        this->comms[localRank],
+                                        this->streams[localRank]),
+                          "ncclBroadcast");
+          break;
+        case ncclCollReduce:
+          CHILD_NCCL_CALL(ncclReduce(collArg.inputGpu.ptr,
+                                     collArg.outputGpu.ptr,
+                                     collArg.numInputElements,
+                                     collArg.dataType,
+                                     collArg.redOp,
+                                     collArg.root,
+                                     this->comms[localRank],
+                                     this->streams[localRank]),
+                          "ncclReduce");
+          break;
+        case ncclCollAllGather:
+          CHILD_NCCL_CALL(ncclAllGather(collArg.inputGpu.ptr,
+                                        collArg.outputGpu.ptr,
+                                        collArg.numInputElements,
+                                        collArg.dataType,
+                                        this->comms[localRank],
+                                        this->streams[localRank]),
+                          "ncclAllGather");
+          break;
+        case ncclCollReduceScatter:
+          CHILD_NCCL_CALL(ncclReduceScatter(collArg.inputGpu.ptr,
+                                            collArg.outputGpu.ptr,
+                                            collArg.numOutputElements,
+                                            collArg.dataType,
+                                            collArg.redOp,
+                                            this->comms[localRank],
+                                            this->streams[localRank]),
+                          "ncclReduceScatter");
+          break;
+        case ncclCollAllReduce:
+          CHILD_NCCL_CALL(ncclAllReduce(collArg.inputGpu.ptr,
+                                        collArg.outputGpu.ptr,
+                                        collArg.numInputElements,
+                                        collArg.dataType,
+                                        collArg.redOp,
+                                        this->comms[localRank],
+                                        this->streams[localRank]),
+                          "ncclAllReduce");
+          break;
+        case ncclCollGather:
+          CHILD_NCCL_CALL(ncclGather(collArg.inputGpu.ptr,
+                                     collArg.outputGpu.ptr,
+                                     collArg.numInputElements,
+                                     collArg.dataType,
+                                     collArg.root,
+                                     this->comms[localRank],
+                                     this->streams[localRank]),
+                          "ncclGather");
+          break;
+        case ncclCollScatter:
+          CHILD_NCCL_CALL(ncclScatter(collArg.inputGpu.ptr,
+                                      collArg.outputGpu.ptr,
+                                      collArg.numOutputElements,
+                                      collArg.dataType,
+                                      collArg.root,
+                                      this->comms[localRank],
+                                      this->streams[localRank]),
+                          "ncclScatter");
+          break;
+        case ncclCollAllToAll:
+          CHILD_NCCL_CALL(ncclAllToAll(collArg.inputGpu.ptr,
+                                       collArg.outputGpu.ptr,
+                                       collArg.numInputElements / collArg.totalRanks,
+                                       collArg.dataType,
+                                       this->comms[localRank],
+                                       this->streams[localRank]),
+                          "ncclAllToAll");
+          break;
+        case ncclCollSend:
+          CHILD_NCCL_CALL(ncclSend(collArg.inputGpu.ptr,
+                                   collArg.numInputElements,
+                                   collArg.dataType,
+                                   collArg.root,
+                                   this->comms[localRank],
+                                   this->streams[localRank]),
+                          "ncclSend");
+          break;
+        case ncclCollRecv:
+          CHILD_NCCL_CALL(ncclRecv(collArg.outputGpu.ptr,
+                                   collArg.numOutputElements,
+                                   collArg.dataType,
+                                   collArg.root,
+                                   this->comms[localRank],
+                                   this->streams[localRank]),
+                          "ncclRecv");
+          break;
+        default:
+          ERROR("Unknown func type %d\n", collArg.funcType);
+          return TEST_FAIL;
+        }
+      }
+    }
+
+    // End group call
+    CHILD_NCCL_CALL(ncclGroupEnd(), "ncclGroupEnd");
+
+    // Synchronize
+    if (this->verbose) INFO("Child %d submits group call. Waiting for completion\n", this->childId);
+    for (int localRank = 0; localRank < this->streams.size(); ++localRank)
+    {
+      CHECK_HIP(hipStreamSynchronize(this->streams[localRank]));
+    }
+
+    if (this->printValues)
+    {
+      for (int collId = 0; collId < this->numCollectivesInGroup; ++collId)
+        for (int localRank = 0; localRank < this->deviceIds.size(); ++localRank)
+        {
+          CollectiveArgs const& collArg = this->collArgs[localRank][collId];
+
+          // Clamp to the number of output elements so that neither the copy nor
+          // the dump of the expected values runs past the end of its buffer
+          int numOutputElementsToPrint =
+            (this->printValues < 0 || (size_t)this->printValues > collArg.numOutputElements)
+              ? collArg.numOutputElements : this->printValues;
+          size_t const numOutputBytes = numOutputElementsToPrint * DataTypeToBytes(collArg.dataType);
+          CHECK_HIP(hipMemcpy(collArg.outputCpu.ptr, collArg.outputGpu.ptr, numOutputBytes, hipMemcpyDeviceToHost));
+          printf("[ DEBUG ] Rank %02d Coll %d %-10s: %s\n", collArg.globalRank, collId, "Output",
+                 collArg.outputCpu.ToString(collArg.dataType, numOutputElementsToPrint).c_str());
+
+          printf("[ DEBUG ] Rank %02d Coll %d %-10s: %s\n", collArg.globalRank, collId, "Expected",
+                 collArg.expected.ToString(collArg.dataType, numOutputElementsToPrint).c_str());
+        }
+    }
+    if (this->verbose) INFO("Child %d finishes ExecuteCollectives()\n", this->childId);
+    return TEST_SUCCESS;
+  }
+
+  ErrCode TestBedChild::ValidateResults()
+  {
+    // Read values sent by parent [see TestBed::ValidateResults()]
+    int globalRank, collId;
+    PIPE_READ(globalRank);
+    PIPE_READ(collId);
+
+    if (this->verbose) INFO("Child %d begins ValidateResults()\n", this->childId);
+
+    if (globalRank < this->rankOffset || (this->rankOffset + comms.size() <= globalRank))
+    {
+      ERROR("Child %d does not contain rank %d\n", this->childId, globalRank);
+      return TEST_FAIL;
+    }
+    int const localRank = globalRank - rankOffset;
+    CHECK_HIP(hipSetDevice(this->deviceIds[localRank]));
+
+    ErrCode status = TEST_SUCCESS;
+    for (int collIdx = 0; collIdx < collArgs[localRank].size(); ++collIdx)
+    {
+      if (collId == -1 || collId == collIdx)
+      {
+        if (this->verbose) INFO("Rank %d on child %d validating 
collective %d results\n",
+                                globalRank, this->childId, collIdx);
+        if (this->collArgs[localRank][collIdx].ValidateResults() != TEST_SUCCESS)
+        {
+          ERROR("Rank %d Collective %d output does not match expected\n", globalRank, collIdx);
+          status = TEST_FAIL;
+        }
+      }
+    }
+    if (this->verbose) INFO("Child %d finishes ValidateResults() with status %s\n", this->childId,
+                            status == TEST_SUCCESS ? "SUCCESS" : "FAIL");
+    return status;
+  }
+
+  // Releases the buffers of the selected collective(s) of one rank hosted by this
+  // child, together with any custom reduction operator attached to them.
+  // A collId of -1 selects every collective of the rank.
+  // Returns TEST_FAIL when the requested global rank is not hosted by this child.
+  ErrCode TestBedChild::DeallocateMem()
+  {
+    if (this->verbose) INFO("Child %d begins DeallocateMem\n", this->childId);
+
+    // Read values sent by parent [see TestBed::DeallocateMem()]
+    int globalRank, collId;
+    PIPE_READ(globalRank);
+    PIPE_READ(collId);
+
+    if (globalRank < this->rankOffset || (this->rankOffset + comms.size() <= globalRank))
+    {
+      ERROR("Child %d does not contain rank %d\n", this->childId, globalRank);
+      return TEST_FAIL;
+    }
+    int const localRank = globalRank - rankOffset;
+    CHECK_HIP(hipSetDevice(this->deviceIds[localRank]));
+
+    for (int collIdx = 0; collIdx < collArgs[localRank].size(); ++collIdx)
+    {
+      // Leave collectives that were not selected completely untouched
+      if (collId != -1 && collId != collIdx) continue;
+
+      CollectiveArgs& collArg = this->collArgs[localRank][collIdx];
+      if (this->verbose)
+      {
+        INFO("Child %d release memory for collective %d (Input: %p Output %p\n",
+             this->childId, collIdx, collArg.inputGpu.ptr, collArg.outputGpu.ptr);
+      }
+
+      CHECK_CALL(collArg.DeallocateMem());
+
+      // Destroy the custom reduction operator only for a collective that is being
+      // released: doing it for every collective would tear down operators still in
+      // use by the others and destroy them again on their own release
+      if (collArg.scalarMode != -1)
+      {
+        CHILD_NCCL_CALL(ncclRedOpDestroy(collArg.redOp, this->comms[localRank]),
+                        "ncclRedOpDestroy");
+        if (verbose) INFO("Child %d destroys custom redop %d for collective %d\n",
+                          this->childId, collArg.redOp, collIdx);
+      }
+    }
+    if (this->verbose) INFO("Child %d finishes DeallocateMem\n", this->childId);
+    return TEST_SUCCESS;
+  }
+
+  ErrCode TestBedChild::DestroyComms()
+  {
+    if (this->verbose) INFO("Child %d begins DestroyComms\n", this->childId);
+
+    // Release comms
+    for (int i = 0; i < this->comms.size(); ++i)
+    {
+
CHILD_NCCL_CALL(ncclCommDestroy(this->comms[i]), "ncclCommDestroy");
+    }
+    for (int i = 0; i < this->streams.size(); ++i)
+    {
+      CHECK_HIP(hipStreamDestroy(this->streams[i]));
+    }
+    this->comms.clear();
+    this->streams.clear();
+    if (this->verbose) INFO("Child %d finishes DestroyComms\n", this->childId);
+    return TEST_SUCCESS;
+  }
+
+  ErrCode TestBedChild::Stop()
+  {
+    return TEST_SUCCESS;
+  }
+}
diff --git a/test/common/TestBedChild.hpp b/test/common/TestBedChild.hpp
new file mode 100644
index 0000000000..b6a0d011e5
--- /dev/null
+++ b/test/common/TestBedChild.hpp
@@ -0,0 +1,106 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#pragma once
+
+#include <unistd.h>
+#include <vector>
+#include "CollectiveArgs.hpp"
+#include "rccl.h"
+
+#define MAX_RANKS 32
+namespace RcclUnitTesting
+{
+  class TestBedChild
+  {
+  public:
+    // These are commands that can be given to the child process
+    enum
+    {
+      CHILD_INIT_COMMS       = 0, // InitComms()
+      CHILD_SET_COLL_ARGS    = 1, // SetCollectiveArgs()
+      CHILD_ALLOCATE_MEM     = 2, // AllocateMem()
+      CHILD_PREPARE_DATA     = 3, // PrepareData()
+      CHILD_EXECUTE_COLL     = 4, // ExecuteCollectives()
+      CHILD_VALIDATE_RESULTS = 5, // ValidateResults()
+      CHILD_DEALLOCATE_MEM   = 6, // DeallocateMem()
+      CHILD_DESTROY_COMMS    = 7, // DestroyComms()
+      CHILD_STOP             = 8, // Stop()
+      NUM_CHILD_COMMANDS     = 9
+    };
+
+    char const ChildCommandNames[NUM_CHILD_COMMANDS][20] =
+    {
+      "INIT_COMMS",
+      "SET_COLL_ARGS",
+      "ALLOCATE_MEM",
+      "PREPARE_DATA",
+      "EXECUTE_COLL",
+      "VALIDATE_RESULTS",
+      "DEALLOCATE_MEM",
+      "DESTROY_COMMS",
+      "STOP"
+    };
+
+    // These variables remain constant for life of TestBedChild
+    int   childId;
+    pid_t pid;
+    bool  verbose;
+    int   printValues;
+
+    // Pipes used to communicate between parent process
+    int parentWriteFd;
+    int parentReadFd;
+    int childWriteFd;
+    int childReadFd;
+
+    // These variables may change based on commands issued by parent
+    int totalRanks;                                    // Total ranks
+    int rankOffset;                                    // Global rank offset for this child
+    int numCollectivesInGroup;                         // # of collectives to run per group call
+    std::vector<ncclComm_t> comms;                     // RCCL communicators for each rank
+    std::vector<int> deviceIds;                        // Device IDs for each rank
+    std::vector<hipStream_t> streams;                  // Streams for executing collectives
+    std::vector<std::vector<CollectiveArgs>> collArgs; // Info for each collective for each rank
+
+    // Constructor
+    TestBedChild(int const childId, bool const verbose, int const printValues);
+
+    // Prepare parent/child communication pipes - to be executed by parent process
+    int InitPipes();
+
+    // Execution
+    void StartExecutionLoop();
+
+  protected:
+    // Initialize RCCL communicators
+    ErrCode InitComms();
+
+    // Set CollectiveArgs
+    ErrCode SetCollectiveArgs();
+
+    // Allocate memory (input (GPU) / output (GPU) / expected (CPU))
+    ErrCode AllocateMem();
+
+    // Prepare input and expected data
+    ErrCode PrepareData();
+
+    // Execute a group of collectives
+    ErrCode ExecuteCollectives();
+
+    // Validate that output matches expected
+    ErrCode ValidateResults();
+
+    // Release allocated memory
+    ErrCode DeallocateMem();
+
+    // Destroys RCCL communicators
+    ErrCode DestroyComms();
+
+    // Stops this child process
+    ErrCode Stop();
+  };
+}
diff --git a/test/common/main.cpp b/test/common/main.cpp
new file mode 100644
index 0000000000..f9ae576fee
--- /dev/null
+++ b/test/common/main.cpp
@@ -0,0 +1,11 @@
+#include <gtest/gtest.h>
+#include "EnvVars.hpp"
+#include "TestBed.hpp"
+int main(int argc, char **argv)
+{
+  ::testing::InitGoogleTest(&argc, argv);
+  RcclUnitTesting::EnvVars::ShowConfig();
+  int retCode = RUN_ALL_TESTS();
+  printf("[ INFO ] Total executed cases: %d\n", RcclUnitTesting::TestBed::NumTestsRun());
+  return retCode;
+}
diff --git a/test/test_AllGather.cpp b/test/test_AllGather.cpp
deleted file mode 100644
index 280c1cea48..0000000000
--- a/test/test_AllGather.cpp
+++ /dev/null
@@ -1,117 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ -#include "test_AllGather.hpp" - -namespace CorrectnessTests -{ - TEST_P(AllGatherCorrectnessTest, Correctness) - { - // Adjust numElements to be multiple of numDevices - numElements = (numElements/numDevices)*numDevices; - if (numDevices > numDevicesAvailable) return; - if (numElements % numDevices != 0) return; - - // Prepare input / output / expected results - Dataset dataset; - dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllGather); - FillDatasetWithPattern(dataset); - ComputeExpectedResults(dataset); - - size_t const byteCount = dataset.NumBytes() / dataset.numDevices; - size_t const sendCount = dataset.numElements / dataset.numDevices; - - // Launch the reduction (1 thread per GPU) - ncclGroupStart(); - for (int i = 0; i < numDevices; i++) - { - ncclAllGather((int8_t *)dataset.inputs[i] + (i * byteCount), - dataset.outputs[i], sendCount, - dataType, comms[i], streams[i]); - } - ncclGroupEnd(); - - // Wait for reduction to complete - Synchronize(); - - // Check results - ValidateResults(dataset); - dataset.Release(); - } - - TEST_P(AllGatherCorrectnessTest, Alignment) - { - if (numDevices > numDevicesAvailable) return; - if (numElements % numDevices != 0) return; - - // Allocate dataset - Dataset dataset; - dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllGather); - - // Loop over several offsets (so that device pointers are not aligned) - for (int firstElement = 1; firstElement <= 11; firstElement += 2) - { - if (firstElement < numElements) - { - // Select last element so that total number of elements is multiple of numDevices - int const lastElement = firstElement + ((numElements - firstElement) / numDevices) * 
numDevices - 1; - if (lastElement >= numElements) break; - - Dataset subDataset; - dataset.ExtractSubDataset(firstElement, lastElement, subDataset); - - // Compute reference results for sub-dataset - FillDatasetWithPattern(subDataset); - ComputeExpectedResults(subDataset); - - size_t const byteCount = subDataset.NumBytes() / subDataset.numDevices; - size_t const sendCount = subDataset.numElements / subDataset.numDevices; - - // Launch the reduction (1 thread per GPU) - ncclGroupStart(); - for (int i = 0; i < numDevices; i++) - { - ncclAllGather((int8_t *)subDataset.inputs[i] + (i * byteCount), - subDataset.outputs[i], sendCount, - dataType, comms[i], streams[i]); - } - ncclGroupEnd(); - - // Wait for reduction to complete - Synchronize(); - - // Check results - ValidateResults(subDataset); - } - } - dataset.Release(); - } - - - INSTANTIATE_TEST_SUITE_P(AllGatherCorrectnessSweep, - AllGatherCorrectnessTest, - testing::Combine( - // Reduction operator (not used) - testing::Values(ncclSum), - // Data types - testing::Values(ncclInt8, - ncclUint8, - ncclInt32, - ncclUint32, - ncclInt64, - ncclUint64, - //ncclFloat16, - ncclFloat32, - ncclFloat64, - ncclBfloat16), - // Number of elements - testing::Values(2520, 3026520), - // Number of devices - testing::Range(2,(GTESTS_NUM_GPUS+1)), - // In-place or not - testing::Values(false, true), - testing::Values("")), - CorrectnessTest::PrintToStringParamName()); -} // namespace diff --git a/test/test_AllGather.hpp b/test/test_AllGather.hpp deleted file mode 100644 index 06a0297c3d..0000000000 --- a/test/test_AllGather.hpp +++ /dev/null @@ -1,34 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ -#ifndef TEST_ALLGATHER_HPP -#define TEST_ALLGATHER_HPP - -#include "CorrectnessTest.hpp" - -namespace CorrectnessTests -{ - class AllGatherCorrectnessTest : public CorrectnessTest - { - public: - static void ComputeExpectedResults(Dataset& dataset) - { - size_t const byteCount = dataset.NumBytes() / dataset.numDevices; - - int8_t* result = (int8_t *)malloc(dataset.NumBytes()); - - for (int i = 0; i < dataset.numDevices; i++) - HIP_CALL(hipMemcpy(result + i * byteCount, (int8_t *)dataset.inputs[i] + (i * byteCount), - byteCount, hipMemcpyDeviceToHost)); - - for (int i = 0; i < dataset.numDevices; i++) - memcpy(dataset.expected[i], result, dataset.NumBytes()); - - free(result); - } - }; -} - -#endif diff --git a/test/test_AllGatherMultiProcess.cpp b/test/test_AllGatherMultiProcess.cpp deleted file mode 100644 index c6e84ae132..0000000000 --- a/test/test_AllGatherMultiProcess.cpp +++ /dev/null @@ -1,60 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ -#include "test_AllGatherMultiProcess.hpp" - -namespace CorrectnessTests -{ - TEST_P(AllGatherMultiProcessCorrectnessTest, Correctness) - { - dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollAllGather); - std::vector pids(numDevices); - - int gpu = -1; - for (int i = 0; i < numDevices; i++) - { - gpu++; - int pid = fork(); - if (pid == 0) - { - bool pass; - TestAllGather(gpu, *dataset, pass); - TerminateChildProcess(pass); - } - else - { - pids[gpu] = pid; - } - } - - ValidateProcesses(pids); - dataset->ReleaseRootProcess(); - } - - INSTANTIATE_TEST_SUITE_P(AllGatherMultiProcessCorrectnessSweep, - AllGatherMultiProcessCorrectnessTest, - testing::Combine( - // Reduction operator (not used) - testing::Values(ncclSum), - // Data types - testing::Values(ncclInt8, - ncclUint8, - ncclInt32, - ncclUint32, - ncclInt64, - ncclUint64, - //ncclFloat16, - ncclFloat32, - ncclFloat64, - ncclBfloat16), - // Number of elements - testing::Values(3072, 3145728), - // Number of devices - testing::Values(2,3,4,8), - // In-place or not - testing::Values(false, true), - testing::Values("")), - CorrectnessTest::PrintToStringParamName()); -} // namespace diff --git a/test/test_AllGatherMultiProcess.hpp b/test/test_AllGatherMultiProcess.hpp deleted file mode 100644 index 90a72624e2..0000000000 --- a/test/test_AllGatherMultiProcess.hpp +++ /dev/null @@ -1,81 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ -#ifndef TEST_ALLGATHER_MULTI_PROCESS_HPP -#define TEST_ALLGATHER_MULTI_PROCESS_HPP - -#include "CorrectnessTest.hpp" - -namespace CorrectnessTests -{ - class AllGatherMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest - { - public: - static void ComputeExpectedResults(Dataset& dataset, Barrier& barrier, int const numDevices, std::vector const& ranks) - { - size_t const byteCount = dataset.NumBytes() / dataset.numDevices; - - for (int i = 0; i < ranks.size(); i++) - { - int rank = ranks[i]; - HIP_CALL(hipMemcpy(static_cast(dataset.expected[0]) + rank * byteCount, (int8_t *)dataset.inputs[rank] + (rank * byteCount), - byteCount, hipMemcpyDeviceToHost)); - } - barrier.Wait(); - - // Rank 0 sends answer to other ranks - for (int i = 0; i < ranks.size(); i++) - { - int rank = ranks[i]; - if (rank == 0) - { - for (int i = 0; i < dataset.numDevices; i++) - { - if (i == rank) continue; - memcpy(dataset.expected[i], dataset.expected[0], dataset.NumBytes()); - } - } - } - } - - void TestAllGather(int rank, Dataset& dataset, bool& pass) - { - // Prepare input / output / expected results - SetUpPerProcess(rank, ncclCollAllGather, comms[rank], streams[rank], dataset); - - if (numDevices > numDevicesAvailable || numElements % numDevices != 0) - { - pass = true; - return; - } - - Barrier barrier(rank, numDevices, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID")))); - - // Prepare input / output / expected results - FillDatasetWithPattern(dataset, rank); - - ComputeExpectedResults(dataset, barrier, numDevices, std::vector(1, rank)); - - size_t const byteCount = dataset.NumBytes() / numDevices; - size_t const sendCount = dataset.numElements / numDevices; - - // Launch the reduction (1 process per GPU) - ncclAllGather((int8_t *)dataset.inputs[rank] + (rank * byteCount), - dataset.outputs[rank], sendCount, - dataType, comms[rank], 
streams[rank]); - - // Wait for reduction to complete - HIP_CALL(hipStreamSynchronize(streams[rank])); - - // Check results - pass = ValidateResults(dataset, rank); - - TearDownPerProcess(comms[rank], streams[rank]); - dataset.Release(rank); - } - }; -} - -#endif diff --git a/test/test_AllReduce.cpp b/test/test_AllReduce.cpp deleted file mode 100644 index 3a8697bcc6..0000000000 --- a/test/test_AllReduce.cpp +++ /dev/null @@ -1,80 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "test_AllReduce.hpp" - -namespace CorrectnessTests -{ - TEST_P(AllReduceCorrectnessTest, Correctness) - { - if (numDevices > numDevicesAvailable) return; - - // Prepare input / output / expected results - Dataset dataset; - dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllReduce); - FillDatasetWithPattern(dataset); - ComputeExpectedResults(dataset, op); - - // Launch the reduction (1 thread per GPU) - ncclGroupStart(); - for (int i = 0; i < numDevices; i++) - { - ncclAllReduce(dataset.inputs[i], dataset.outputs[i], - numElements, dataType, op, comms[i], streams[i]); - } - ncclGroupEnd(); - - // Wait for reduction to complete - Synchronize(); - - // Check results - ValidateResults(dataset); - - dataset.Release(); - } -#if defined(BUILD_ALLREDUCE_ONLY) - INSTANTIATE_TEST_SUITE_P(AllReduceCorrectnessSweep, - AllReduceCorrectnessTest, - testing::Combine( - // Reduction operator - testing::Values(ncclSum), - // Data types - testing::Values(ncclFloat32), - // Number of elements - testing::Values(1024, 1048576), - // Number of devices - testing::Range(2,(GTESTS_NUM_GPUS+1)), - // In-place or not - testing::Values(false, true), - testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")), - 
CorrectnessTest::PrintToStringParamName()); -#else - INSTANTIATE_TEST_SUITE_P(AllReduceCorrectnessSweep, - AllReduceCorrectnessTest, - testing::Combine( - // Reduction operator - testing::Values(ncclSum, ncclProd, ncclMax, ncclMin, ncclAvg), - // Data types - testing::Values(ncclInt8, - ncclUint8, - ncclInt32, - ncclUint32, - ncclInt64, - ncclUint64, - //ncclFloat16, - ncclFloat32, - ncclFloat64, - ncclBfloat16), - // Number of elements - testing::Values(1024, 1048576), - // Number of devices - testing::Range(2,(GTESTS_NUM_GPUS+1)), - // In-place or not - testing::Values(false, true), - testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")), - CorrectnessTest::PrintToStringParamName()); -#endif -} // namespace diff --git a/test/test_AllReduce.hpp b/test/test_AllReduce.hpp deleted file mode 100644 index 220877207b..0000000000 --- a/test/test_AllReduce.hpp +++ /dev/null @@ -1,83 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ -#ifndef TEST_ALLREDUCE_HPP -#define TEST_ALLREDUCE_HPP - -#include "CorrectnessTest.hpp" - -namespace CorrectnessTests -{ - class AllReduceCorrectnessTest : public CorrectnessTest - { - public: - static void ComputeExpectedResults(Dataset& dataset, ncclRedOp_t const op) - { - // Copy all inputs to expected arrays temporarily to perform reduction on host - for (int i = 0; i < dataset.numDevices; i++) - HIP_CALL(hipMemcpy(dataset.expected[i], dataset.inputs[i], - dataset.NumBytes(), hipMemcpyDeviceToHost)); - - // Allocate temporary host array to accumulate results - int8_t* resultI1 = (int8_t *)malloc(dataset.NumBytes()); - uint8_t* resultU1 = (uint8_t *)resultI1; - int32_t* resultI4 = (int32_t *)resultI1; - uint32_t* resultU4 = (uint32_t *)resultI1; - int64_t* resultI8 = (int64_t *)resultI1; - uint64_t* resultU8 = (uint64_t *)resultI1; - float* resultF4 = (float *)resultI1; - double* resultF8 = (double *)resultI1; - rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1; - - // Initialize the result with the first device's array - memcpy(resultI1, dataset.expected[0], dataset.NumBytes()); - ncclRedOp_t red_op = ((op == ncclAvg) ? 
ncclSum : op); - - // Perform reduction on the other device arrays - for (int i = 1; i < dataset.numDevices; i++) - { - int8_t* arrayI1 = (int8_t *)dataset.expected[i]; - uint8_t* arrayU1 = (uint8_t *)arrayI1; - int32_t* arrayI4 = (int32_t *)arrayI1; - uint32_t* arrayU4 = (uint32_t *)arrayI1; - int64_t* arrayI8 = (int64_t *)arrayI1; - uint64_t* arrayU8 = (uint64_t *)arrayI1; - float* arrayF4 = (float *)arrayI1; - double* arrayF8 = (double *)arrayI1; - rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1; - - for (int j = 0; j < dataset.numElements; j++) - { - switch (dataset.dataType) - { - case ncclInt8: resultI1[j] = ReduceOp(red_op, resultI1[j], arrayI1[j]); break; - case ncclUint8: resultU1[j] = ReduceOp(red_op, resultU1[j], arrayU1[j]); break; - case ncclInt32: resultI4[j] = ReduceOp(red_op, resultI4[j], arrayI4[j]); break; - case ncclUint32: resultU4[j] = ReduceOp(red_op, resultU4[j], arrayU4[j]); break; - case ncclInt64: resultI8[j] = ReduceOp(red_op, resultI8[j], arrayI8[j]); break; - case ncclUint64: resultU8[j] = ReduceOp(red_op, resultU8[j], arrayU8[j]); break; - case ncclFloat32: resultF4[j] = ReduceOp(red_op, resultF4[j], arrayF4[j]); break; - case ncclFloat64: resultF8[j] = ReduceOp(red_op, resultF8[j], arrayF8[j]); break; - case ncclBfloat16: resultB2[j] = ReduceOp(red_op, resultB2[j], arrayB2[j]); break; - default: - fprintf(stderr, "[ERROR] Unsupported datatype\n"); - exit(0); - } - } - } - - if (op == ncclAvg) - Average(dataset, resultI1); - - // Copy results into expected arrays - for (int i = 0; i < dataset.numDevices; i++) - memcpy(dataset.expected[i], resultI1, dataset.NumBytes()); - - free(resultI1); - } - }; -} - -#endif diff --git a/test/test_AllReduceAbort.cpp b/test/test_AllReduceAbort.cpp deleted file mode 100644 index 64e34032ee..0000000000 --- a/test/test_AllReduceAbort.cpp +++ /dev/null @@ -1,138 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. 
All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "test_AllReduceAbort.hpp" -#include "../include/comm.h" - -#define NUM_ITER 8 -#define FAKE_OP_COUNT NUM_ITER+1 - -namespace CorrectnessTests -{ - #define HIPCHECK(cmd) \ - do { \ - hipError_t error = (cmd); \ - if (error != hipSuccess) { \ - std::cerr << "Encountered HIP error (" << error << ") at line " \ - << __LINE__ << " in file " << __FILE__ << "\n"; \ - exit(-1); \ - } \ - } while (0) - - #define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST) - #define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST) - - TEST_P(AllReduceAbortTest, Correctness) { - if (numDevices > numDevicesAvailable) return; - - // Prepare input / output / expected results - Dataset dataset; - dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllReduce); - FillDatasetWithPattern(dataset); - - int gpu = 0; // GPU number to trigger abort - ncclComm_t comm = comms[gpu]; - - HIPCHECK(hipSetDevice(gpu)); - hipStream_t stream; - HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); - struct ncclChannel* channel = comm->channels; - uint64_t **p_dev_head = (uint64_t **)((uint8_t*)(channel->devPeers + channel->ring.next) + offsetof(struct ncclPeer, send[0].conn.head)); - uint64_t *real_head, *fake_head, *fake_h; - - // get original head - HIPCHECK(hipMemcpy(&real_head, p_dev_head, sizeof(uint64_t*), hipMemcpyDefault)); - // allocate and install fakes - HIPCHECK(hipHostMalloc(&fake_head, sizeof(uint64_t*), hipHostMallocMapped)); - HIPCHECK(hipMemcpy(p_dev_head, &fake_head, sizeof(uint64_t*), hipMemcpyDefault)); - *fake_head = 0; - // read back fakes to confirm - HIPCHECK(hipMemcpy(&fake_h, p_dev_head, sizeof(uint64_t*), hipMemcpyDefault)); - //std::cerr << "[ ] replaced gpu " << gpu << " real_opCount = " << real_opCount << " to fake_opCount = " << fake_o << std::endl; - //std::cerr << "[ ] 
replaced gpu " << gpu << " real_head = " << real_head << " to fake_head = " << fake_h << std::endl; - - // Perform a number of iterations and introduce abort - for (int j = 0; j < NUM_ITER; j++) { - //std::cerr << "[ ] iter = " << j << std::endl; - // Start a group call - ncclGroupStart(); - for (int i = 0; i < numDevices; i++) { - ncclAllReduce(dataset.inputs[i], dataset.outputs[i], - numElements, dataType, op, comms[i], streams[i]); - } - // Signal end of group call - ncclGroupEnd(); - } - - // Wait for reduction to complete - auto start = std::chrono::high_resolution_clock::now(); - hipError_t hipErr; - int remaining = numDevices; - int* done = (int*)malloc(sizeof(int)*numDevices); - memset(done, 0, sizeof(int)*numDevices); - bool timeout = false, abort_called = false; - while (remaining) { - int idle = 1; - for (int i=0; i= 2 - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0) - auto delta = std::chrono::high_resolution_clock::now() - start; - double deltaSec = std::chrono::duration_cast>(delta).count(); - if (deltaSec > 10.0 && !timeout) { - std::cerr << "[ ] timeout condition, calling ncclCommAbort ... " << std::endl; - timeout = true; - } - ncclResult_t ncclAsyncErr; - ncclCommGetAsyncError(comms[i], &ncclAsyncErr); - if ((ncclAsyncErr != ncclSuccess || timeout) && !abort_called) { - // An asynchronous error happened. 
Stop the operation and destroy - // the communicator - std::cerr << "[ ] ncclAsyncErr = " << ncclAsyncErr << std::endl; - for (int i=0; i numDevicesAvailable) return; - - // Prepare input / output / expected results - Dataset dataset1, dataset2, dataset3; - dataset1.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllReduce); - dataset2.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllReduce); - dataset3.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllReduce); - FillDatasetWithPattern(dataset1); - FillDatasetWithPattern(dataset2); - FillDatasetWithPattern(dataset3); - ComputeExpectedResults(dataset1, op); - ComputeExpectedResults(dataset2, op); - ComputeExpectedResults(dataset3, op); - - // Launch the reduction (1 thread per GPU) - ncclGroupStart(); - for (int i = 0; i < numDevices; i++) - { - ncclAllReduce(dataset1.inputs[i], dataset1.outputs[i], numElements, dataType, op, comms[i], streams[i]); - ncclAllReduce(dataset2.inputs[i], dataset2.outputs[i], numElements, dataType, op, comms[i], streams[i]); - ncclAllReduce(dataset3.inputs[i], dataset3.outputs[i], numElements, dataType, op, comms[i], streams[i]); - } - ncclGroupEnd(); - - // Wait for reduction to complete - Synchronize(); - - // Check results - ValidateResults(dataset1); - ValidateResults(dataset2); - ValidateResults(dataset3); - - dataset1.Release(); - dataset2.Release(); - dataset3.Release(); - } -#if defined(BUILD_ALLREDUCE_ONLY) - INSTANTIATE_TEST_SUITE_P(AllReduceGroupCorrectnessSweep, - AllReduceGroupCorrectnessTest, - testing::Combine( - // Reduction operator - testing::Values(ncclSum), - // Data types - testing::Values(ncclFloat32), - // Number of elements - testing::Values(1024, 1048576), - // Number of devices - testing::Range(2,(GTESTS_NUM_GPUS+1)), - // In-place or not - testing::Values(false, true), - testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")), - CorrectnessTest::PrintToStringParamName()); -#else - 
INSTANTIATE_TEST_SUITE_P(AllReduceGroupCorrectnessSweep, - AllReduceGroupCorrectnessTest, - testing::Combine( - // Reduction operator - testing::Values(ncclSum), - // Data types - testing::Values(ncclFloat32, ncclFloat64), - // Number of elements - testing::Values(1024, 1048576), - // Number of devices - testing::Range(2,(GTESTS_NUM_GPUS+1)), - // In-place or not - testing::Values(false, true), - testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")), - CorrectnessTest::PrintToStringParamName()); -#endif -} // namespace diff --git a/test/test_AllReduceGroup.hpp b/test/test_AllReduceGroup.hpp deleted file mode 100644 index e21da66dd2..0000000000 --- a/test/test_AllReduceGroup.hpp +++ /dev/null @@ -1,79 +0,0 @@ -/************************************************************************* - * Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ -#ifndef TEST_ALLREDUCEGROUP_HPP -#define TEST_ALLREDUCEGROUP_HPP - -#include "CorrectnessTest.hpp" - -namespace CorrectnessTests -{ - class AllReduceGroupCorrectnessTest : public CorrectnessTest - { - public: - static void ComputeExpectedResults(Dataset& dataset, ncclRedOp_t const op) - { - // Copy all inputs to expected arrays temporarily to perform reduction on host - for (int i = 0; i < dataset.numDevices; i++) - HIP_CALL(hipMemcpy(dataset.expected[i], dataset.inputs[i], - dataset.NumBytes(), hipMemcpyDeviceToHost)); - - // Allocate temporary host array to accumulate results - int8_t* resultI1 = (int8_t *)malloc(dataset.NumBytes()); - uint8_t* resultU1 = (uint8_t *)resultI1; - int32_t* resultI4 = (int32_t *)resultI1; - uint32_t* resultU4 = (uint32_t *)resultI1; - int64_t* resultI8 = (int64_t *)resultI1; - uint64_t* resultU8 = (uint64_t *)resultI1; - float* resultF4 = (float *)resultI1; - double* resultF8 = (double *)resultI1; - rccl_bfloat16* resultB2 = (rccl_bfloat16 
*)resultI1; - - // Initialize the result with the first device's array - memcpy(resultI1, dataset.expected[0], dataset.NumBytes()); - - // Perform reduction on the other device arrays - for (int i = 1; i < dataset.numDevices; i++) - { - int8_t* arrayI1 = (int8_t *)dataset.expected[i]; - uint8_t* arrayU1 = (uint8_t *)arrayI1; - int32_t* arrayI4 = (int32_t *)arrayI1; - uint32_t* arrayU4 = (uint32_t *)arrayI1; - int64_t* arrayI8 = (int64_t *)arrayI1; - uint64_t* arrayU8 = (uint64_t *)arrayI1; - float* arrayF4 = (float *)arrayI1; - double* arrayF8 = (double *)arrayI1; - rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1; - - for (int j = 0; j < dataset.numElements; j++) - { - switch (dataset.dataType) - { - case ncclInt8: resultI1[j] = ReduceOp(op, resultI1[j], arrayI1[j]); break; - case ncclUint8: resultU1[j] = ReduceOp(op, resultU1[j], arrayU1[j]); break; - case ncclInt32: resultI4[j] = ReduceOp(op, resultI4[j], arrayI4[j]); break; - case ncclUint32: resultU4[j] = ReduceOp(op, resultU4[j], arrayU4[j]); break; - case ncclInt64: resultI8[j] = ReduceOp(op, resultI8[j], arrayI8[j]); break; - case ncclUint64: resultU8[j] = ReduceOp(op, resultU8[j], arrayU8[j]); break; - case ncclFloat32: resultF4[j] = ReduceOp(op, resultF4[j], arrayF4[j]); break; - case ncclFloat64: resultF8[j] = ReduceOp(op, resultF8[j], arrayF8[j]); break; - case ncclBfloat16: resultB2[j] = ReduceOp(op, resultB2[j], arrayB2[j]); break; - default: - fprintf(stderr, "[ERROR] Unsupported datatype\n"); - exit(0); - } - } - } - - // Copy results into expected arrays - for (int i = 0; i < dataset.numDevices; i++) - memcpy(dataset.expected[i], resultI1, dataset.NumBytes()); - - free(resultI1); - } - }; -} - -#endif diff --git a/test/test_AllReduceGroupMultiProcess.cpp b/test/test_AllReduceGroupMultiProcess.cpp deleted file mode 100644 index 270f81a03d..0000000000 --- a/test/test_AllReduceGroupMultiProcess.cpp +++ /dev/null @@ -1,82 +0,0 @@ 
-/************************************************************************* - * Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ -#include "test_AllReduceGroupMultiProcess.hpp" - -namespace CorrectnessTests -{ - TEST_P(AllReduceGroupMultiProcessCorrectnessTest, Correctness) - { - // Important: Make sure the order of ncclFunc_t's here match the order of ncclFunc_ts - // as they appear in TestGroupCalls() - std::vector ncclFuncs; - ncclFuncs.push_back(ncclCollAllReduce); - ncclFuncs.push_back(ncclCollAllReduce); - ncclFuncs.push_back(ncclCollAllReduce); - - // Create multiple datasets for combined operation - std::vector datasets(ncclFuncs.size()); - for (int i = 0; i < datasets.size(); i++) - { - datasets[i] = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0); - datasets[i]->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclFuncs[i]); - } - - int const numGpusPerProcess = 2; - int const numProcesses = numDevices / numGpusPerProcess; - std::vector pids(numProcesses); - int process = -1; - - for (int i = 0; i < numDevices; i+= numGpusPerProcess) - { - process++; - int pid = fork(); - if (pid == 0) - { - int gpuIdx = i; - int maxIdx = gpuIdx + (numGpusPerProcess - 1) >= numDevices ? 
numDevices : gpuIdx + numGpusPerProcess; - - std::vector ranks; - for (; gpuIdx < maxIdx; gpuIdx++) - { - ranks.push_back(gpuIdx); - } - - bool pass; - TestGroupCalls(process, ranks, datasets, ncclFuncs, pass); - TerminateChildProcess(pass); - } - else - { - pids[process] = pid; - } - } - - ValidateProcesses(pids); - - for (int i = 0; i < datasets.size(); i++) - { - datasets[i]->ReleaseRootProcess(); - munmap(datasets[i], sizeof(Dataset)); - } - } - - INSTANTIATE_TEST_SUITE_P(AllReduceGroupMultiProcessCorrectnessSweep, - AllReduceGroupMultiProcessCorrectnessTest, - testing::Combine( - // Reduction operator (not used) - testing::Values(ncclSum), - // Data types - testing::Values(ncclFloat32, - ncclFloat64), - // Number of elements - testing::Values(3072, 3145728), - // Number of devices - testing::Values(4,8), - // In-place or not - testing::Values(false, true), - testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")), - CorrectnessTest::PrintToStringParamName()); -} // namespace diff --git a/test/test_AllReduceGroupMultiProcess.hpp b/test/test_AllReduceGroupMultiProcess.hpp deleted file mode 100644 index 167d8a525a..0000000000 --- a/test/test_AllReduceGroupMultiProcess.hpp +++ /dev/null @@ -1,105 +0,0 @@ -/************************************************************************* - * Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef TEST_ALLREDUCEGROUP_MULTI_PROCESS_HPP -#define TEST_ALLREDUCEGROUP_MULTI_PROCESS_HPP - -#include "CorrectnessTest.hpp" -#include "test_AllReduceMultiProcess.hpp" -#include - -namespace CorrectnessTests -{ - class AllReduceGroupMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest - { - public: - void TestGroupCalls(int process, std::vector const& ranks, std::vector& datasets, std::vector const& funcs, bool& pass) - { - ncclGroupStart(); - for (int i = 0; i < ranks.size(); i++) - { - SetUpPerProcess(ranks[i], funcs, comms[ranks[i]], streams[ranks[i]], datasets); - if (numDevices > numDevicesAvailable) - { - break; - } - } - ncclGroupEnd(); - - if (numDevices > numDevicesAvailable) - { - pass = true; - return; - } - - int numProcesses = numDevices / ranks.size(); - Barrier barrier(process, numProcesses, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID")))); - - for (int i = 0; i < ranks.size(); i++) - { - for (int j = 0; j < datasets.size(); j++) - { - FillDatasetWithPattern(*datasets[j], ranks[i]); - } - } - - int const root = 0; - - for (int i = 0; i < 3; i++) - { - AllReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[i], barrier, op, ranks); - } - barrier.Wait(); - - size_t const byteCount = datasets[0]->NumBytes() / numDevices; - size_t const elemCount = numElements / numDevices; - - ncclGroupStart(); - // AllReduce - for (int i = 0; i < ranks.size(); i++) - { - int rank = ranks[i]; - for (int j = 0; j < 3; j++) - { - ncclAllReduce(datasets[j]->inputs[rank], datasets[j]->outputs[rank], - numElements, dataType, op, comms[rank], streams[rank]); - } - } - // Signal end of group call - ncclGroupEnd(); - - for (int i = 0; i < ranks.size(); i++) - { - HIP_CALL(hipSetDevice(ranks[i])); - HIP_CALL(hipStreamSynchronize(streams[ranks[i]])); - } - - for (int i = 0; i < funcs.size(); i++) - { - for (int 
j = 0; j < ranks.size(); j++) - { - pass = ValidateResults(*datasets[i], ranks[j], root); - if (!pass) - { - break; - } - } - barrier.Wait(); - for (int j = 0; j < ranks.size(); j++) - { - datasets[i]->Release(ranks[j]); - } - } - - for (int i = 0; i < ranks.size(); i++) - { - TearDownPerProcess(comms[ranks[i]], streams[ranks[i]]); - } - } - }; -} - -#endif diff --git a/test/test_AllReduceMultiProcess.cpp b/test/test_AllReduceMultiProcess.cpp deleted file mode 100644 index 5a73b630f1..0000000000 --- a/test/test_AllReduceMultiProcess.cpp +++ /dev/null @@ -1,61 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "test_AllReduceMultiProcess.hpp" - -namespace CorrectnessTests -{ - TEST_P(AllReduceMultiProcessCorrectnessTest, Correctness) - { - dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollAllReduce); - std::vector pids(numDevices); - - int gpu = -1; - for (int i = 0; i < numDevices; i++) - { - gpu++; - int pid = fork(); - if (pid == 0) - { - bool pass; - TestAllReduce(gpu, *dataset, pass); - TerminateChildProcess(pass); - } - else - { - pids[gpu] = pid; - } - } - - ValidateProcesses(pids); - dataset->ReleaseRootProcess(); - } - - INSTANTIATE_TEST_SUITE_P(AllReduceMultiProcessCorrectnessSweep, - AllReduceMultiProcessCorrectnessTest, - testing::Combine( - // Reduction operator - testing::Values(ncclSum, ncclProd, ncclMax, ncclMin), - // Data types - testing::Values(ncclInt8, - ncclUint8, - ncclInt32, - ncclUint32, - ncclInt64, - ncclUint64, - //ncclFloat16, - ncclFloat32, - ncclFloat64, - ncclBfloat16), - // Number of elements - testing::Values(1024, 1048576), - // Number of devices - testing::Values(2,3,4,8), - // In-place or not - testing::Values(false, true), - 
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")), - CorrectnessTest::PrintToStringParamName()); -} // namespace diff --git a/test/test_AllReduceMultiProcess.hpp b/test/test_AllReduceMultiProcess.hpp deleted file mode 100644 index 41ef265cd1..0000000000 --- a/test/test_AllReduceMultiProcess.hpp +++ /dev/null @@ -1,117 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ -#ifndef TEST_ALLREDUCE_MULTI_PROCESS_HPP -#define TEST_ALLREDUCE_MULTI_PROCESS_HPP - -#include "CorrectnessTest.hpp" - -namespace CorrectnessTests -{ - class AllReduceMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest - { - public: - static void ComputeExpectedResults(Dataset& dataset, Barrier& barrier, ncclRedOp_t const op, std::vector const& ranks) - { - // Copy all inputs to expected arrays temporarily to perform reduction on host - for (int i = 0; i < ranks.size(); i++) - { - int rank = ranks[i]; - HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.inputs[rank], - dataset.NumBytes(), hipMemcpyDeviceToHost)); - } - barrier.Wait(); - - // Allocate temporary host array to accumulate results - int8_t* resultI1 = (int8_t *)malloc(dataset.NumBytes()); - uint8_t* resultU1 = (uint8_t *)resultI1; - int32_t* resultI4 = (int32_t *)resultI1; - uint32_t* resultU4 = (uint32_t *)resultI1; - int64_t* resultI8 = (int64_t *)resultI1; - uint64_t* resultU8 = (uint64_t *)resultI1; - float* resultF4 = (float *)resultI1; - double* resultF8 = (double *)resultI1; - rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1; - - // Initialize the result with the first device's array - memcpy(resultI1, dataset.expected[0], dataset.NumBytes()); - barrier.Wait(); - - // Perform reduction - for (int i = 1; i < dataset.numDevices; i++) - { - int8_t* arrayI1 = (int8_t 
*)dataset.expected[i]; - uint8_t* arrayU1 = (uint8_t *)arrayI1; - int32_t* arrayI4 = (int32_t *)arrayI1; - uint32_t* arrayU4 = (uint32_t *)arrayI1; - int64_t* arrayI8 = (int64_t *)arrayI1; - uint64_t* arrayU8 = (uint64_t *)arrayI1; - float* arrayF4 = (float *)arrayI1; - double* arrayF8 = (double *)arrayI1; - rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1; - - for (int j = 0; j < dataset.numElements; j++) - { - switch (dataset.dataType) - { - case ncclInt8: resultI1[j] = ReduceOp(op, resultI1[j], arrayI1[j]); break; - case ncclUint8: resultU1[j] = ReduceOp(op, resultU1[j], arrayU1[j]); break; - case ncclInt32: resultI4[j] = ReduceOp(op, resultI4[j], arrayI4[j]); break; - case ncclUint32: resultU4[j] = ReduceOp(op, resultU4[j], arrayU4[j]); break; - case ncclInt64: resultI8[j] = ReduceOp(op, resultI8[j], arrayI8[j]); break; - case ncclUint64: resultU8[j] = ReduceOp(op, resultU8[j], arrayU8[j]); break; - case ncclFloat32: resultF4[j] = ReduceOp(op, resultF4[j], arrayF4[j]); break; - case ncclFloat64: resultF8[j] = ReduceOp(op, resultF8[j], arrayF8[j]); break; - case ncclBfloat16: resultB2[j] = ReduceOp(op, resultB2[j], arrayB2[j]); break; - default: - fprintf(stderr, "[ERROR] Unsupported datatype\n"); - exit(0); - } - } - } - barrier.Wait(); - - // Copy results into expected array - for (int i = 0; i < ranks.size(); i++) - { - int rank = ranks[i]; - memcpy(dataset.expected[rank], resultI1, dataset.NumBytes()); - } - free(resultI1); - } - - void TestAllReduce(int rank, Dataset& dataset, bool& pass) - { - SetUpPerProcess(rank, ncclCollAllReduce, comms[rank], streams[rank], dataset); - - if (numDevices > numDevicesAvailable) - { - pass = true; - return; - } - - Barrier barrier(rank, numDevices, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID")))); - - // Prepare input / output / expected results - FillDatasetWithPattern(dataset, rank); - ComputeExpectedResults(dataset, barrier, op, std::vector(1, rank)); - - // Launch the reduction - 
ncclAllReduce(dataset.inputs[rank], dataset.outputs[rank], - numElements, dataType, op, comms[rank], streams[rank]); - - // Wait for reduction to complete - HIP_CALL(hipStreamSynchronize(streams[rank])); - - // Check results - pass = ValidateResults(dataset, rank); - - TearDownPerProcess(comms[rank], streams[rank]); - dataset.Release(rank); - } - }; -} - -#endif diff --git a/test/test_AllToAll.cpp b/test/test_AllToAll.cpp deleted file mode 100644 index 8997a319c5..0000000000 --- a/test/test_AllToAll.cpp +++ /dev/null @@ -1,67 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "test_AllToAll.hpp" - -namespace CorrectnessTests -{ - TEST_P(AllToAllCorrectnessTest, Correctness) - { - if (numDevices > numDevicesAvailable) return; - - // Allocate data - Dataset dataset; - dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllToAll); - - // Prepare input / output / expected results - FillDatasetWithPattern(dataset); - ComputeExpectedResults(dataset); - - // Launch the reduction (1 thread per GPU) - ncclGroupStart(); - for (int i = 0; i < numDevices; i++) - { - ncclAllToAll(dataset.inputs[i], - dataset.outputs[i], - numElements, dataType, - comms[i], streams[i]); - } - ncclGroupEnd(); - - // Wait for reduction to complete - Synchronize(); - - // Check results - ValidateResults(dataset); - - dataset.Release(); - } - - INSTANTIATE_TEST_SUITE_P(AllToAllCorrectnessSweep, - AllToAllCorrectnessTest, - testing::Combine( - // Reduction operator is not used - testing::Values(ncclSum), - // Data types - testing::Values(ncclInt8, - ncclUint8, - ncclInt32, - ncclUint32, - ncclInt64, - ncclUint64, - //ncclFloat16, - ncclFloat32, - ncclFloat64, - ncclBfloat16), - // Number of elements - testing::Values(1024, 1048576), - // 
Number of devices - testing::Range(2,(GTESTS_NUM_GPUS+1)), - // In-place or not - testing::Values(false), - testing::Values("")), - CorrectnessTest::PrintToStringParamName()); -} // namespace diff --git a/test/test_AllToAll.hpp b/test/test_AllToAll.hpp deleted file mode 100644 index 104daba05d..0000000000 --- a/test/test_AllToAll.hpp +++ /dev/null @@ -1,26 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ -#ifndef TEST_ALLTOALL_HPP -#define TEST_ALLTOALL_HPP - -#include "CorrectnessTest.hpp" - -namespace CorrectnessTests -{ - class AllToAllCorrectnessTest : public CorrectnessTest - { - public: - static void ComputeExpectedResults(Dataset& dataset) - { - for (int i = 0; i < dataset.numDevices; i++) - for (int j = 0; j < dataset.numDevices; j++) - HIP_CALL(hipMemcpy((int8_t *)dataset.expected[i]+dataset.NumBytes()*j, (int8_t *)dataset.inputs[j]+dataset.NumBytes()*i, - dataset.NumBytes(), hipMemcpyDeviceToHost)); - } - }; -} - -#endif diff --git a/test/test_AllToAllMultiProcess.cpp b/test/test_AllToAllMultiProcess.cpp deleted file mode 100644 index 33cc5c98b4..0000000000 --- a/test/test_AllToAllMultiProcess.cpp +++ /dev/null @@ -1,61 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "test_AllToAllMultiProcess.hpp" - -namespace CorrectnessTests -{ - TEST_P(AllToAllMultiProcessCorrectnessTest, Correctness) - { - dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollAllToAll); - std::vector pids(numDevices); - - int gpu = -1; - for (int i = 0; i < numDevices; i++) - { - gpu++; - int pid = fork(); - if (pid == 0) - { - bool pass; - TestAllToAll(gpu, *dataset, pass); - TerminateChildProcess(pass); - } - else - { - pids[gpu] = pid; - } - } - - ValidateProcesses(pids); - dataset->ReleaseRootProcess(); - } - - INSTANTIATE_TEST_SUITE_P(AllToAllMultiProcessCorrectnessSweep, - AllToAllMultiProcessCorrectnessTest, - testing::Combine( - // Reduction operator is not used - testing::Values(ncclSum), - // Data types - testing::Values(ncclInt8, - ncclUint8, - ncclInt32, - ncclUint32, - ncclInt64, - ncclUint64, - //ncclFloat16, - ncclFloat32, - ncclFloat64, - ncclBfloat16), - // Number of elements - testing::Values(1024, 1048576), - // Number of devices - testing::Values(2,3,4,8), - // In-place or not - testing::Values(false), - testing::Values("")), - CorrectnessTest::PrintToStringParamName()); -} // namespace diff --git a/test/test_AllToAllMultiProcess.hpp b/test/test_AllToAllMultiProcess.hpp deleted file mode 100644 index 76205608c9..0000000000 --- a/test/test_AllToAllMultiProcess.hpp +++ /dev/null @@ -1,61 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ -#ifndef TEST_ALLTOALL_MULTI_PROCESS_HPP -#define TEST_ALLTOALL_MULTI_PROCESS_HPP - -#include "CorrectnessTest.hpp" - -namespace CorrectnessTests -{ - class AllToAllMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest - { - public: - static void ComputeExpectedResults(Dataset& dataset, std::vector const& ranks) - { - for (int i = 0; i < ranks.size(); i++) - { - int rank = ranks[i]; - for (int j = 0; j < dataset.numDevices; j++) - { - HIP_CALL(hipMemcpy((int8_t *)dataset.expected[j]+dataset.NumBytes()*rank, (int8_t *)dataset.inputs[rank]+dataset.NumBytes()*j, - dataset.NumBytes(), hipMemcpyDeviceToHost)); - } - } - } - - void TestAllToAll(int rank, Dataset& dataset, bool& pass) - { - SetUpPerProcess(rank, ncclCollAllToAll, comms[rank], streams[rank], dataset); - - if (numDevices > numDevicesAvailable) - { - pass = true; - return; - } - - // Prepare input / output / expected results - FillDatasetWithPattern(dataset, rank); - ComputeExpectedResults(dataset, std::vector(1, rank)); - - // Launch the reduction - ncclAllToAll(dataset.inputs[rank], - dataset.outputs[rank], - numElements, dataType, - comms[rank], streams[rank]); - - // Wait for reduction to complete - HIP_CALL(hipStreamSynchronize(streams[rank])); - - // Check results - pass = ValidateResults(dataset, rank); - - TearDownPerProcess(comms[rank], streams[rank]); - dataset.Release(rank); - } - }; -} - -#endif diff --git a/test/test_AllToAllv.cpp b/test/test_AllToAllv.cpp deleted file mode 100644 index cb303c3679..0000000000 --- a/test/test_AllToAllv.cpp +++ /dev/null @@ -1,75 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "test_AllToAllv.hpp" - -namespace CorrectnessTests -{ - TEST_P(AllToAllvCorrectnessTest, Correctness) - { - if (numDevices > numDevicesAvailable) return; - - // Allocate data - Dataset dataset; - dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllToAll); - - // Prepare input / output / expected results - FillDatasetWithPattern(dataset); - ComputeExpectedResults(dataset); - - size_t chunksize = numElements*2/numDevices; - #define MAX_ALLTOALLV_RANKS 16 - static size_t sendcounts[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS], recvcounts[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS], sdispls[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS], rdispls[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS]; - // Launch the reduction (1 thread per GPU) - ncclGroupStart(); - for (int r = 0; r < numDevices; r++) { - size_t disp = 0; - for (int i = 0; i < numDevices; i++) { - size_t scount = ((i+r)%numDevices)*chunksize; - if (i+r == numDevices-1) - scount += (numElements*numDevices-chunksize*(numDevices-1)*numDevices/2); - sendcounts[i+r*MAX_ALLTOALLV_RANKS] = recvcounts[i+r*MAX_ALLTOALLV_RANKS] = scount; - sdispls[i+r*MAX_ALLTOALLV_RANKS] = rdispls[i+r*MAX_ALLTOALLV_RANKS] = disp; - disp += scount; - } - ncclAllToAllv((char*)dataset.inputs[r], sendcounts+r*MAX_ALLTOALLV_RANKS, sdispls+r*MAX_ALLTOALLV_RANKS, - (char*)dataset.outputs[r], recvcounts+r*MAX_ALLTOALLV_RANKS, rdispls+r*MAX_ALLTOALLV_RANKS, dataType, comms[r], streams[r]); - } - ncclGroupEnd(); - // Wait for reduction to complete - Synchronize(); - - // Check results - ValidateResults(dataset); - - dataset.Release(); - } - - INSTANTIATE_TEST_SUITE_P(AllToAllvCorrectnessSweep, - AllToAllvCorrectnessTest, - testing::Combine( - // Reduction operator is not used - testing::Values(ncclSum), - // Data types - testing::Values(ncclInt8, - ncclUint8, - ncclInt32, - ncclUint32, - ncclInt64, - 
ncclUint64, - //ncclFloat16, - ncclFloat32, - ncclFloat64, - ncclBfloat16), - // Number of elements - testing::Values(2520, 3026520), - // Number of devices - testing::Range(2,(GTESTS_NUM_GPUS+1)), - // In-place or not - testing::Values(false), - testing::Values("")), - CorrectnessTest::PrintToStringParamName()); -} // namespace diff --git a/test/test_AllToAllv.hpp b/test/test_AllToAllv.hpp deleted file mode 100644 index e5ebfb1955..0000000000 --- a/test/test_AllToAllv.hpp +++ /dev/null @@ -1,44 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ -#ifndef TEST_ALLTOALLV_HPP -#define TEST_ALLTOALLV_HPP - -#include "CorrectnessTest.hpp" - -namespace CorrectnessTests -{ - class AllToAllvCorrectnessTest : public CorrectnessTest - { - public: - static void ComputeExpectedResults(Dataset& dataset) - { - for (int i = 0; i < dataset.numDevices; i++) { - size_t rdisp = 0; - size_t chunksize = dataset.numElements*2/dataset.numDevices; - for (int j = 0; j < dataset.numDevices; j++) { - size_t scount = 0, rcount = ((j+i)%dataset.numDevices)*chunksize; - if (j+i == dataset.numDevices-1) - rcount += (dataset.numElements*dataset.numDevices-chunksize*(dataset.numDevices-1)*dataset.numDevices/2); - size_t sdisp = 0; - for (int k=0; k numDevicesAvailable) return; - - // Allocate data - Dataset dataset; - dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollBroadcast); - - // Test each possible root - for (int root = 0; root < numDevices; root++) - { - // Prepare input / output / expected results - FillDatasetWithPattern(dataset); - ComputeExpectedResults(dataset, root); - - // Launch the reduction (1 thread per GPU) - ncclGroupStart(); - for (int i = 0; i < numDevices; i++) - { - ncclBroadcast(dataset.inputs[i], - dataset.outputs[i], - 
numElements, dataType, - root, comms[i], streams[i]); - } - ncclGroupEnd(); - - // Wait for reduction to complete - Synchronize(); - - // Check results - ValidateResults(dataset); - } - - dataset.Release(); - } - - INSTANTIATE_TEST_SUITE_P(BroadcastCorrectnessSweep, - BroadcastCorrectnessTest, - testing::Combine( - // Reduction operator is not used - testing::Values(ncclSum), - // Data types - testing::Values(ncclInt8, - ncclUint8, - ncclInt32, - ncclUint32, - ncclInt64, - ncclUint64, - //ncclFloat16, - ncclFloat32, - ncclFloat64, - ncclBfloat16), - // Number of elements - testing::Values(1024, 1048576), - // Number of devices - testing::Range(2,(GTESTS_NUM_GPUS+1)), - // In-place or not - testing::Values(false, true), - testing::Values("")), - CorrectnessTest::PrintToStringParamName()); -} // namespace diff --git a/test/test_Broadcast.hpp b/test/test_Broadcast.hpp deleted file mode 100644 index 9ff24ee104..0000000000 --- a/test/test_Broadcast.hpp +++ /dev/null @@ -1,25 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ -#ifndef TEST_BROADCAST_HPP -#define TEST_BROADCAST_HPP - -#include "CorrectnessTest.hpp" - -namespace CorrectnessTests -{ - class BroadcastCorrectnessTest : public CorrectnessTest - { - public: - static void ComputeExpectedResults(Dataset& dataset, int const root) - { - for (int i = 0; i < dataset.numDevices; i++) - HIP_CALL(hipMemcpy(dataset.expected[i], dataset.inputs[root], - dataset.NumBytes(), hipMemcpyDeviceToHost)); - } - }; -} - -#endif diff --git a/test/test_BroadcastAbort.cpp b/test/test_BroadcastAbort.cpp deleted file mode 100644 index 2f0d46a147..0000000000 --- a/test/test_BroadcastAbort.cpp +++ /dev/null @@ -1,140 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "test_BroadcastAbort.hpp" -#include "../include/comm.h" - -#define NUM_ITER 8 -#define FAKE_OP_COUNT NUM_ITER+1 - -namespace CorrectnessTests -{ - #define HIPCHECK(cmd) \ - do { \ - hipError_t error = (cmd); \ - if (error != hipSuccess) { \ - std::cerr << "Encountered HIP error (" << error << ") at line " \ - << __LINE__ << " in file " << __FILE__ << "\n"; \ - exit(-1); \ - } \ - } while (0) - - #define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST) - #define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST) - - TEST_P(BroadcastAbortTest, Correctness) { - if (numDevices > numDevicesAvailable) return; - - // Prepare input / output / expected results - Dataset dataset; - dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollBroadcast); - FillDatasetWithPattern(dataset); - - int root = 0; - int gpu = 0; // GPU number to trigger abort - ncclComm_t comm = comms[gpu]; - - 
HIPCHECK(hipSetDevice(gpu)); - hipStream_t stream; - HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); - struct ncclChannel* channel = comm->channels; - uint64_t **p_dev_head = (uint64_t **)((uint8_t*)(channel->devPeers + channel->ring.next) + offsetof(struct ncclPeer, send[0].conn.head)); - uint64_t *real_head, *fake_head, *fake_h; - - // get original head - HIPCHECK(hipMemcpy(&real_head, p_dev_head, sizeof(uint64_t*), hipMemcpyDefault)); - // allocate and install fakes - HIPCHECK(hipHostMalloc(&fake_head, sizeof(uint64_t*), hipHostMallocMapped)); - HIPCHECK(hipMemcpy(p_dev_head, &fake_head, sizeof(uint64_t*), hipMemcpyDefault)); - *fake_head = 0; - // read back fakes to confirm - HIPCHECK(hipMemcpy(&fake_h, p_dev_head, sizeof(uint64_t*), hipMemcpyDefault)); - //std::cerr << "[ ] replaced gpu " << gpu << " real_head = " << real_head << " to fake_head = " << fake_h << std::endl; - - // Perform a number of iterations and introduce abort - for (int j = 0; j < NUM_ITER; j++) { - //std::cerr << "[ ] iter = " << j << std::endl; - // Start a group call - ncclGroupStart(); - for (int i = 0; i < numDevices; i++) { - ncclBroadcast(dataset.inputs[i], - dataset.outputs[i], - numElements, dataType, - root, comms[i], streams[i]); - } - // Signal end of group call - ncclGroupEnd(); - } - - // Wait for reduction to complete - auto start = std::chrono::high_resolution_clock::now(); - hipError_t hipErr; - int remaining = numDevices; - int* done = (int*)malloc(sizeof(int)*numDevices); - memset(done, 0, sizeof(int)*numDevices); - bool timeout = false, abort_called = false; - while (remaining) { - int idle = 1; - for (int i=0; i= 2 - #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0) - auto delta = std::chrono::high_resolution_clock::now() - start; - double deltaSec = std::chrono::duration_cast>(delta).count(); - if (deltaSec > 10.0 && !timeout) { - std::cerr << "[ ] timeout condition, calling ncclCommAbort ... 
" << std::endl; - timeout = true; - } - ncclResult_t ncclAsyncErr; - ncclCommGetAsyncError(comms[i], &ncclAsyncErr); - if ((ncclAsyncErr != ncclSuccess || timeout) && !abort_called) { - // An asynchronous error happened. Stop the operation and destroy - // the communicator - std::cerr << "[ ] ncclAsyncErr = " << ncclAsyncErr << std::endl; - for (int i=0; i -#include -#include -#include -#include -#include - -namespace CorrectnessTests -{ - TEST_P(BroadcastMultiProcessCorrectnessTest, Correctness) - { - dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollBroadcast); - std::vector pids(numDevices); - - int gpu = -1; - for (int i = 0; i < numDevices; i++) - { - gpu++; - int pid = fork(); - if (pid == 0) - { - bool pass; - TestBroadcast(gpu, *dataset, pass); - TerminateChildProcess(pass); - } - else - { - pids[gpu] = pid; - } - } - - ValidateProcesses(pids); - dataset->ReleaseRootProcess(); - } - - INSTANTIATE_TEST_SUITE_P(BroadcastMultiProcessCorrectnessSweep, - BroadcastMultiProcessCorrectnessTest, - testing::Combine( - // Reduction operator is not used - testing::Values(ncclSum), - // Data types - testing::Values(ncclInt8, - ncclUint8, - ncclInt32, - ncclUint32, - ncclInt64, - ncclUint64, - //ncclFloat16, - ncclFloat32, - ncclFloat64, - ncclBfloat16), - // Number of elements - testing::Values(1024, 1048576), - // Number of devices - testing::Values(2,3,4,8), - // In-place or not - testing::Values(false, true), - testing::Values("")), - CorrectnessTest::PrintToStringParamName()); -} // namespace diff --git a/test/test_BroadcastMultiProcess.hpp b/test/test_BroadcastMultiProcess.hpp deleted file mode 100644 index 7335c15985..0000000000 --- a/test/test_BroadcastMultiProcess.hpp +++ /dev/null @@ -1,77 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ -#ifndef TEST_BROADCAST_MULTI_PROCESS_HPP -#define TEST_BROADCAST_MULTI_PROCESS_HPP - -#include "CorrectnessTest.hpp" - -namespace CorrectnessTests -{ - class BroadcastMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest - { - public: - static void ComputeExpectedResults(Dataset& dataset, int const root, std::vector const& ranks) - { - for (int h = 0; h < ranks.size(); h++) - { - int rank = ranks[h]; - // Root has the answer; share it via host memcpy's - if (rank == root) - { - HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.inputs[rank], - dataset.NumBytes(), hipMemcpyDeviceToHost)); - for (int i = 0; i < dataset.numDevices; i++) - { - if (i == rank) continue; - memcpy(dataset.expected[i], dataset.expected[root], dataset.NumBytes()); - } - break; - } - } - } - - void TestBroadcast(int rank, Dataset& dataset, bool& pass) - { - SetUpPerProcess(rank, ncclCollBroadcast, comms[rank], streams[rank], dataset); - - if (numDevices > numDevicesAvailable) - { - pass = true; - return; - } - - Barrier barrier(rank, numDevices, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID")))); - - // Test each possible root - for (int root = 0; root < numDevices; root++) - { - // Prepare input / output / expected results - FillDatasetWithPattern(dataset, rank); - ComputeExpectedResults(dataset, root, std::vector(1, rank)); - - // Launch the reduction (1 process per GPU) - ncclResult_t res = ncclBroadcast(dataset.inputs[rank], - dataset.outputs[rank], - numElements, dataType, - root, comms[rank], streams[rank]); - - // Wait for reduction to complete - HIP_CALL(hipStreamSynchronize(streams[rank])); - - // Check results - pass = ValidateResults(dataset, rank); - - // Ensure all processes have finished current iteration before proceeding - barrier.Wait(); - } - - TearDownPerProcess(comms[rank], streams[rank]); - dataset.Release(rank); - } - }; -} 
- -#endif diff --git a/test/test_CombinedCalls.cpp b/test/test_CombinedCalls.cpp deleted file mode 100644 index b951470606..0000000000 --- a/test/test_CombinedCalls.cpp +++ /dev/null @@ -1,129 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ -#include "test_CombinedCalls.hpp" - -#include "test_AllGather.hpp" -#include "test_AllReduce.hpp" -#include "test_Broadcast.hpp" -#include "test_Reduce.hpp" -#include "test_ReduceScatter.hpp" -#include "test_Scatter.hpp" - -namespace CorrectnessTests -{ - TEST_P(CombinedCallsCorrectnessTest, Correctness) - { - if (numDevices > numDevicesAvailable) return; - - // Create multiple datasets for combined operation - std::vector datasets(5); - std::vector ncclFuncs(5); - ncclFuncs.push_back(ncclCollAllGather); - ncclFuncs.push_back(ncclCollAllReduce); - ncclFuncs.push_back(ncclCollBroadcast); - ncclFuncs.push_back(ncclCollReduce); - ncclFuncs.push_back(ncclCollReduceScatter); - - // Adjust numElements to be multiple of numDevices - numElements = (numElements/numDevices)*numDevices; - for (int i = 0; i < datasets.size(); i++) - { - datasets[i].Initialize(numDevices, numElements, dataType, inPlace, ncclFuncs[i]); - FillDatasetWithPattern(datasets[i]); - } - - Dataset scatter_dataset; - scatter_dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollScatter); - FillDatasetWithPattern(scatter_dataset); - - // Compute expected results for each dataset in combined - int const root = 0; - AllGatherCorrectnessTest::ComputeExpectedResults(datasets[0]); - AllReduceCorrectnessTest::ComputeExpectedResults(datasets[1], op); - BroadcastCorrectnessTest::ComputeExpectedResults(datasets[2], root); - ReduceCorrectnessTest::ComputeExpectedResults(datasets[3], op, root); - 
ReduceScatterCorrectnessTest::ComputeExpectedResults(datasets[4], op); - ScatterCorrectnessTest::ComputeExpectedResults(scatter_dataset, root); - - size_t const byteCount = datasets[0].NumBytes() / numDevices; - size_t const elemCount = numElements / numDevices; - - for (int j = 0; j < 10; j++) - { - ncclGroupStart(); - for (int i = 0; i < numDevices; i++) - { - ncclScatter(scatter_dataset.inputs[i], - scatter_dataset.outputs[i], - numElements, dataType, - root, comms[i], streams[i]); - } - ncclGroupEnd(); - ncclGroupStart(); - for (int i = 0; i < numDevices; i++) - { - ncclAllGather((int8_t *)datasets[0].inputs[i] + (i * byteCount), - datasets[0].outputs[i], elemCount, - dataType, comms[i], streams[i]); - - ncclAllReduce(datasets[1].inputs[i], datasets[1].outputs[i], - numElements, dataType, op, comms[i], streams[i]); - - ncclBroadcast(datasets[2].inputs[i], - datasets[2].outputs[i], - numElements, dataType, - root, comms[i], streams[i]); - - ncclReduce(datasets[3].inputs[i], - datasets[3].outputs[i], - numElements, dataType, op, - root, comms[i], streams[i]); - - ncclReduceScatter(datasets[4].inputs[i], - (int8_t *)datasets[4].outputs[i] + (i * byteCount), - elemCount, dataType, op, - comms[i], streams[i]); - } - ncclGroupEnd(); - // Wait for reduction to complete - Synchronize(); - // Check results for each collective in the combined - for (int i = 0; i < 5; i++) - ValidateResults(datasets[i]); - - ValidateResults(scatter_dataset); - } - - for (int i = 0; i < 5; i++) - datasets[i].Release(); - scatter_dataset.Release(); - } - - INSTANTIATE_TEST_SUITE_P(CombinedCallsCorrectnessSweep, - CombinedCallsCorrectnessTest, - testing::Combine( - // Reduction operator (not used) - testing::Values(ncclSum), - // Data types - testing::Values(ncclInt8, - ncclUint8, - ncclInt32, - ncclUint32, - ncclInt64, - ncclUint64, - //ncclFloat16, - ncclFloat32, - ncclFloat64, - ncclBfloat16), - // Number of elements - testing::Values(2520, 3026520), - // Number of devices - 
testing::Range(2,(GTESTS_NUM_GPUS+1)), - // In-place or not - testing::Values(false), - testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1", "RCCL_P2P_NET_DISABLE=0", "RCCL_P2P_NET_DISABLE=1")), - CorrectnessTest::PrintToStringParamName()); -} // namespace diff --git a/test/test_CombinedCalls.hpp b/test/test_CombinedCalls.hpp deleted file mode 100644 index a19ea7c5df..0000000000 --- a/test/test_CombinedCalls.hpp +++ /dev/null @@ -1,17 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef TEST_COMBINEDCALLS_HPP -#define TEST_COMBINEDCALLS_HPP - -#include "CorrectnessTest.hpp" - -namespace CorrectnessTests -{ - class CombinedCallsCorrectnessTest : public CorrectnessTest {}; -} - -#endif diff --git a/test/test_CombinedCallsMultiProcess.cpp b/test/test_CombinedCallsMultiProcess.cpp deleted file mode 100644 index e877ffe267..0000000000 --- a/test/test_CombinedCallsMultiProcess.cpp +++ /dev/null @@ -1,81 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ -#include "test_CombinedCallsMultiProcess.hpp" - -namespace CorrectnessTests -{ - TEST_P(CombinedCallsMultiProcessCorrectnessTest, Correctness) - { - // Important: Make sure the order of ncclFunc_t's here match the order of ncclFunc_ts - // as they appear in TestCombinedCalls() - std::vector ncclFuncs; - ncclFuncs.push_back(ncclCollAllGather); - ncclFuncs.push_back(ncclCollAllReduce); - ncclFuncs.push_back(ncclCollBroadcast); - ncclFuncs.push_back(ncclCollReduce); - ncclFuncs.push_back(ncclCollReduceScatter); - - // Create multiple datasets for combined operation - std::vector datasets(ncclFuncs.size()); - for (int i = 0; i < datasets.size(); i++) - { - datasets[i] = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0); - datasets[i]->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclFuncs[i]); - } - - std::vector pids(numDevices); - - int gpu = -1; - for (int i = 0; i < numDevices; i++) - { - gpu++; - int pid = fork(); - if (pid == 0) - { - bool pass; - TestCombinedCalls(gpu, datasets, ncclFuncs, pass); - TerminateChildProcess(pass); - } - else - { - pids[gpu] = pid; - } - } - - ValidateProcesses(pids); - - for (int i = 0; i < datasets.size(); i++) - { - datasets[i]->ReleaseRootProcess(); - munmap(datasets[i], sizeof(Dataset)); - } - } - - INSTANTIATE_TEST_SUITE_P(CombinedCallsMultiProcessCorrectnessSweep, - CombinedCallsMultiProcessCorrectnessTest, - testing::Combine( - // Reduction operator (not used) - testing::Values(ncclSum), - // Data types - testing::Values(ncclInt8, - ncclUint8, - ncclInt32, - ncclUint32, - ncclInt64, - ncclUint64, - //ncclFloat16, - ncclFloat32, - ncclFloat64, - ncclBfloat16), - // Number of elements - testing::Values(3072, 3145728), - // Number of devices - testing::Values(2,3,4,8), - // In-place or not - testing::Values(false, true), - 
testing::Values("")), - CorrectnessTest::PrintToStringParamName()); -} // namespace diff --git a/test/test_CombinedCallsMultiProcess.hpp b/test/test_CombinedCallsMultiProcess.hpp deleted file mode 100644 index 18501b6fa1..0000000000 --- a/test/test_CombinedCallsMultiProcess.hpp +++ /dev/null @@ -1,97 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef TEST_COMBINEDCALLS_MULTI_PROCESS_HPP -#define TEST_COMBINEDCALLS_MULTI_PROCESS_HPP - -#include "CorrectnessTest.hpp" - -#include "test_AllGatherMultiProcess.hpp" -#include "test_AllReduceMultiProcess.hpp" -#include "test_BroadcastMultiProcess.hpp" -#include "test_ReduceMultiProcess.hpp" -#include "test_ReduceScatterMultiProcess.hpp" - -namespace CorrectnessTests -{ - class CombinedCallsMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest - { - public: - void TestCombinedCalls(int rank, std::vector& datasets, std::vector const& funcs, bool& pass) - { - SetUpPerProcess(rank, funcs, comms[rank], streams[rank], datasets); - - if (numDevices > numDevicesAvailable) - { - pass = true; - return; - } - - Barrier barrier(rank, numDevices, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID")))); - - // Compute expected results for each dataset in combined - int const root = 0; - std::vector ranks(1, rank); - AllGatherMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[0], barrier, numDevices, ranks); - AllReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[1], barrier, op, ranks); - BroadcastMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[2], root, ranks); - ReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[3], barrier, op, root, ranks); - 
ReduceScatterMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[4], barrier, op, ranks); - - size_t const byteCount = datasets[0]->NumBytes() / numDevices; - size_t const elemCount = numElements / numDevices; - - ncclAllGather((int8_t *)datasets[0]->inputs[rank] + (rank * byteCount), - datasets[0]->outputs[rank], elemCount, - dataType, comms[rank], streams[rank]); - - ncclAllReduce(datasets[1]->inputs[rank], datasets[1]->outputs[rank], - numElements, dataType, op, comms[rank], streams[rank]); - - ncclBroadcast(datasets[2]->inputs[rank], - datasets[2]->outputs[rank], - numElements, dataType, - root, comms[rank], streams[rank]); - - ncclReduce(datasets[3]->inputs[rank], - datasets[3]->outputs[rank], - numElements, dataType, op, - root, comms[rank], streams[rank]); - - ncclReduceScatter(datasets[4]->inputs[rank], - (int8_t *)datasets[4]->outputs[rank] + (rank * byteCount), - elemCount, dataType, op, - comms[rank], streams[rank]); - - // Wait for reduction to complete - HIP_CALL(hipStreamSynchronize(streams[rank])); - - for (int i = 0; i < funcs.size(); i++) - { - for (int j = 0; j < ranks.size(); j++) - { - pass = ValidateResults(*datasets[i], ranks[j], root); - if (!pass) - { - break; - } - } - barrier.Wait(); - for (int j = 0; j < ranks.size(); j++) - { - datasets[i]->Release(ranks[j]); - } - } - - for (int i = 0; i < ranks.size(); i++) - { - TearDownPerProcess(comms[ranks[i]], streams[ranks[i]]); - } - } - }; -} - -#endif diff --git a/test/test_Gather.cpp b/test/test_Gather.cpp deleted file mode 100644 index 8bf4edd6d5..0000000000 --- a/test/test_Gather.cpp +++ /dev/null @@ -1,71 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "test_Gather.hpp" - -namespace CorrectnessTests -{ - TEST_P(GatherCorrectnessTest, Correctness) - { - if (numDevices > numDevicesAvailable) return; - - // Allocate data - Dataset dataset; - dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollGather); - - // Test each possible root - for (int root = 0; root < numDevices; root++) - { - // Prepare input / output / expected results - FillDatasetWithPattern(dataset); - ComputeExpectedResults(dataset, root); - - // Launch the reduction (1 thread per GPU) - ncclGroupStart(); - for (int i = 0; i < numDevices; i++) - { - ncclGather(dataset.inputs[i], - dataset.outputs[i], - numElements, dataType, - root, comms[i], streams[i]); - } - ncclGroupEnd(); - - // Wait for reduction to complete - Synchronize(); - - // Check results - ValidateResults(dataset, root); - } - - dataset.Release(); - } - - INSTANTIATE_TEST_SUITE_P(GatherCorrectnessSweep, - GatherCorrectnessTest, - testing::Combine( - // Reduction operator is not used - testing::Values(ncclSum), - // Data types - testing::Values(ncclInt8, - ncclUint8, - ncclInt32, - ncclUint32, - ncclInt64, - ncclUint64, - //ncclFloat16, - ncclFloat32, - ncclFloat64, - ncclBfloat16), - // Number of elements - testing::Values(1024, 1048576), - // Number of devices - testing::Range(2,(GTESTS_NUM_GPUS+1)), - // In-place or not - testing::Values(false), - testing::Values("")), - CorrectnessTest::PrintToStringParamName()); -} // namespace diff --git a/test/test_Gather.hpp b/test/test_Gather.hpp deleted file mode 100644 index f75cfa9b2a..0000000000 --- a/test/test_Gather.hpp +++ /dev/null @@ -1,25 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ -#ifndef TEST_GATHER_HPP -#define TEST_GATHER_HPP - -#include "CorrectnessTest.hpp" - -namespace CorrectnessTests -{ - class GatherCorrectnessTest : public CorrectnessTest - { - public: - static void ComputeExpectedResults(Dataset& dataset, int const root) - { - for (int i = 0; i < dataset.numDevices; i++) - HIP_CALL(hipMemcpy((int8_t *)dataset.expected[root]+dataset.NumBytes()*i, dataset.inputs[i], - dataset.NumBytes(), hipMemcpyDeviceToHost)); - } - }; -} - -#endif diff --git a/test/test_GatherMultiProcess.cpp b/test/test_GatherMultiProcess.cpp deleted file mode 100644 index 02649072fb..0000000000 --- a/test/test_GatherMultiProcess.cpp +++ /dev/null @@ -1,61 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "test_GatherMultiProcess.hpp" - -namespace CorrectnessTests -{ - TEST_P(GatherMultiProcessCorrectnessTest, Correctness) - { - dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollGather); - std::vector pids(numDevices); - - int gpu = -1; - for (int i = 0; i < numDevices; i++) - { - gpu++; - int pid = fork(); - if (pid == 0) - { - bool pass; - TestGather(gpu, *dataset, pass); - TerminateChildProcess(pass); - } - else - { - pids[gpu] = pid; - } - } - - ValidateProcesses(pids); - dataset->ReleaseRootProcess(); - } - - INSTANTIATE_TEST_SUITE_P(GatherMultiProcessCorrectnessSweep, - GatherMultiProcessCorrectnessTest, - testing::Combine( - // Reduction operator is not used - testing::Values(ncclSum), - // Data types - testing::Values(ncclInt8, - ncclUint8, - ncclInt32, - ncclUint32, - ncclInt64, - ncclUint64, - //ncclFloat16, - ncclFloat32, - ncclFloat64, - ncclBfloat16), 
- // Number of elements - testing::Values(1024, 1048576), - // Number of devices - testing::Values(2,3,4,8), - // In-place or not - testing::Values(false), - testing::Values("")), - CorrectnessTest::PrintToStringParamName()); -} // namespace diff --git a/test/test_GatherMultiProcess.hpp b/test/test_GatherMultiProcess.hpp deleted file mode 100644 index ab022b052e..0000000000 --- a/test/test_GatherMultiProcess.hpp +++ /dev/null @@ -1,63 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ -#ifndef TEST_GATHER_MULTI_PROCESS_HPP -#define TEST_GATHER_MULTI_PROCESS_HPP - -#include "CorrectnessTest.hpp" - -namespace CorrectnessTests -{ - class GatherMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest - { - public: - static void ComputeExpectedResults(Dataset& dataset, int const root, int const rank) - { - HIP_CALL(hipMemcpy((int8_t *)dataset.expected[root]+dataset.NumBytes()*rank, dataset.inputs[rank], - dataset.NumBytes(), hipMemcpyDeviceToHost)); - } - - void TestGather(int rank, Dataset& dataset, bool& pass) - { - SetUpPerProcess(rank, ncclCollGather, comms[rank], streams[rank], dataset); - - if (numDevices > numDevicesAvailable) - { - pass = true; - return; - } - - Barrier barrier(rank, numDevices, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID")))); - - // Test each possible root - for (int root = 0; root < numDevices; root++) - { - // Prepare input / output / expected results - FillDatasetWithPattern(dataset, rank); - ComputeExpectedResults(dataset, root, rank); - - // Launch the reduction (1 process per GPU) - ncclGather(dataset.inputs[rank], - dataset.outputs[rank], - numElements, dataType, - root, comms[rank], streams[rank]); - - // Wait for reduction to complete - 
HIP_CALL(hipStreamSynchronize(streams[rank])); - - // Check results - pass = ValidateResults(dataset, rank, root); - - // Ensure all processes have finished current iteration before proceeding - barrier.Wait(); - } - - TearDownPerProcess(comms[rank], streams[rank]); - dataset.Release(rank); - } - }; -} - -#endif diff --git a/test/test_GroupCalls.cpp b/test/test_GroupCalls.cpp deleted file mode 100644 index cb05ab6e5e..0000000000 --- a/test/test_GroupCalls.cpp +++ /dev/null @@ -1,130 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ -#include "test_GroupCalls.hpp" - -#include "test_AllGather.hpp" -#include "test_AllReduce.hpp" -#include "test_Broadcast.hpp" -#include "test_Reduce.hpp" -#include "test_ReduceScatter.hpp" - -namespace CorrectnessTests -{ - TEST_P(GroupCallsCorrectnessTest, Correctness) - { - if (numDevices > numDevicesAvailable) return; - - // Create multiple datasets for group operation - std::vector datasets(5); - std::vector ncclFuncs(5); - ncclFuncs.push_back(ncclCollAllGather); - ncclFuncs.push_back(ncclCollAllReduce); - ncclFuncs.push_back(ncclCollBroadcast); - ncclFuncs.push_back(ncclCollReduce); - ncclFuncs.push_back(ncclCollReduceScatter); - - // Adjust numElements to be multiple of numDevices - numElements = (numElements/numDevices)*numDevices; - for (int i = 0; i < datasets.size(); i++) - { - datasets[i].Initialize(numDevices, numElements, dataType, inPlace, ncclFuncs[i]); - FillDatasetWithPattern(datasets[i]); - } - - // Compute expected results for each dataset in group - int const root = 0; - AllGatherCorrectnessTest::ComputeExpectedResults(datasets[0]); - AllReduceCorrectnessTest::ComputeExpectedResults(datasets[1], op); - BroadcastCorrectnessTest::ComputeExpectedResults(datasets[2], root); - 
ReduceCorrectnessTest::ComputeExpectedResults(datasets[3], op, root); - ReduceScatterCorrectnessTest::ComputeExpectedResults(datasets[4], op); - - // Start a group call - ncclGroupStart(); - - // AllGather - size_t const byteCount = datasets[0].NumBytes() / numDevices; - size_t const elemCount = numElements / numDevices; - for (int i = 0; i < numDevices; i++) - { - ncclAllGather((int8_t *)datasets[0].inputs[i] + (i * byteCount), - datasets[0].outputs[i], elemCount, - dataType, comms[i], streams[i]); - } - - // AllReduce - for (int i = 0; i < numDevices; i++) - { - ncclAllReduce(datasets[1].inputs[i], datasets[1].outputs[i], - numElements, dataType, op, comms[i], streams[i]); - } - - // Broadcast - for (int i = 0; i < numDevices; i++) - { - ncclBroadcast(datasets[2].inputs[i], - datasets[2].outputs[i], - numElements, dataType, - root, comms[i], streams[i]); - } - - // Reduce - for (int i = 0; i < numDevices; i++) - { - ncclReduce(datasets[3].inputs[i], - datasets[3].outputs[i], - numElements, dataType, op, - root, comms[i], streams[i]); - } - - // ReduceScatter - for (int i = 0; i < numDevices; i++) - { - ncclReduceScatter(datasets[4].inputs[i], - (int8_t *)datasets[4].outputs[i] + (i * byteCount), - elemCount, dataType, op, - comms[i], streams[i]); - } - - // Signal end of group call - ncclGroupEnd(); - - // Wait for reduction to complete - Synchronize(); - - // Check results for each collective in the group - for (int i = 0; i < 5; i++) - { - ValidateResults(datasets[i]); - datasets[i].Release(); - } - } - - INSTANTIATE_TEST_SUITE_P(GroupCallsCorrectnessSweep, - GroupCallsCorrectnessTest, - testing::Combine( - // Reduction operator (not used) - testing::Values(ncclSum), - // Data types - testing::Values(ncclInt8, - ncclUint8, - ncclInt32, - ncclUint32, - ncclInt64, - ncclUint64, - //ncclFloat16, - ncclFloat32, - ncclFloat64, - ncclBfloat16), - // Number of elements - testing::Values(2520, 3026520), - // Number of devices - testing::Range(2,(GTESTS_NUM_GPUS+1)), - 
// In-place or not - testing::Values(false, true), - testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")), - CorrectnessTest::PrintToStringParamName()); -} // namespace diff --git a/test/test_GroupCalls.hpp b/test/test_GroupCalls.hpp deleted file mode 100644 index 7f595ee62d..0000000000 --- a/test/test_GroupCalls.hpp +++ /dev/null @@ -1,17 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef TEST_GROUPCALLS_HPP -#define TEST_GROUPCALLS_HPP - -#include "CorrectnessTest.hpp" - -namespace CorrectnessTests -{ - class GroupCallsCorrectnessTest : public CorrectnessTest {}; -} - -#endif diff --git a/test/test_GroupCallsMultiProcess.cpp b/test/test_GroupCallsMultiProcess.cpp deleted file mode 100644 index 75377cbcb4..0000000000 --- a/test/test_GroupCallsMultiProcess.cpp +++ /dev/null @@ -1,92 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ -#include "test_GroupCallsMultiProcess.hpp" - -namespace CorrectnessTests -{ - TEST_P(GroupCallsMultiProcessCorrectnessTest, Correctness) - { - // Important: Make sure the order of ncclFunc_t's here match the order of ncclFunc_ts - // as they appear in TestGroupCalls() - std::vector ncclFuncs; - ncclFuncs.push_back(ncclCollAllGather); - ncclFuncs.push_back(ncclCollAllReduce); - ncclFuncs.push_back(ncclCollBroadcast); - ncclFuncs.push_back(ncclCollReduce); - ncclFuncs.push_back(ncclCollReduceScatter); - - // Create multiple datasets for combined operation - std::vector datasets(ncclFuncs.size()); - for (int i = 0; i < datasets.size(); i++) - { - datasets[i] = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0); - datasets[i]->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclFuncs[i]); - } - - int const numGpusPerProcess = 2; - int const numProcesses = numDevices / numGpusPerProcess; - std::vector pids(numProcesses); - int process = -1; - - for (int i = 0; i < numDevices; i+= numGpusPerProcess) - { - process++; - int pid = fork(); - if (pid == 0) - { - int gpuIdx = i; - int maxIdx = gpuIdx + (numGpusPerProcess - 1) >= numDevices ? 
numDevices : gpuIdx + numGpusPerProcess; - - std::vector ranks; - for (; gpuIdx < maxIdx; gpuIdx++) - { - ranks.push_back(gpuIdx); - } - - bool pass; - TestGroupCalls(process, ranks, datasets, ncclFuncs, pass); - TerminateChildProcess(pass); - } - else - { - pids[process] = pid; - } - } - - ValidateProcesses(pids); - - for (int i = 0; i < datasets.size(); i++) - { - datasets[i]->ReleaseRootProcess(); - munmap(datasets[i], sizeof(Dataset)); - } - } - - INSTANTIATE_TEST_SUITE_P(GroupCallsMultiProcessCorrectnessSweep, - GroupCallsMultiProcessCorrectnessTest, - testing::Combine( - // Reduction operator (not used) - testing::Values(ncclSum), - // Data types - testing::Values(ncclInt8, - ncclUint8, - ncclInt32, - ncclUint32, - ncclInt64, - ncclUint64, - //ncclFloat16, - ncclFloat32, - ncclFloat64, - ncclBfloat16), - // Number of elements - testing::Values(3072, 3145728), - // Number of devices - testing::Values(4,8), - // In-place or not - testing::Values(false, true), - testing::Values("")), - CorrectnessTest::PrintToStringParamName()); -} // namespace diff --git a/test/test_GroupCallsMultiProcess.hpp b/test/test_GroupCallsMultiProcess.hpp deleted file mode 100644 index 7eb7a58d27..0000000000 --- a/test/test_GroupCallsMultiProcess.hpp +++ /dev/null @@ -1,148 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef TEST_GROUPCALLS_MULTI_PROCESS_HPP -#define TEST_GROUPCALLS_MULTI_PROCESS_HPP - -#include "CorrectnessTest.hpp" -#include "test_AllGatherMultiProcess.hpp" -#include "test_AllReduceMultiProcess.hpp" -#include "test_BroadcastMultiProcess.hpp" -#include "test_ReduceMultiProcess.hpp" -#include "test_ReduceScatterMultiProcess.hpp" - -#include - -namespace CorrectnessTests -{ - class GroupCallsMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest - { - public: - void TestGroupCalls(int process, std::vector const& ranks, std::vector& datasets, std::vector const& funcs, bool& pass) - { - ncclGroupStart(); - for (int i = 0; i < ranks.size(); i++) - { - SetUpPerProcess(ranks[i], funcs, comms[ranks[i]], streams[ranks[i]], datasets); - if (numDevices > numDevicesAvailable) - { - break; - } - } - ncclGroupEnd(); - - if (numDevices > numDevicesAvailable) - { - pass = true; - return; - } - - int numProcesses = numDevices / ranks.size(); - Barrier barrier(process, numProcesses, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID")))); - - for (int i = 0; i < ranks.size(); i++) - { - for (int j = 0; j < datasets.size(); j++) - { - FillDatasetWithPattern(*datasets[j], ranks[i]); - } - } - - int const root = 0; - - AllGatherMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[0], barrier, numDevices, ranks); - AllReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[1], barrier, op, ranks); - BroadcastMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[2], root, ranks); - ReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[3], barrier, op, root, ranks); - ReduceScatterMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[4], barrier, op, ranks); - barrier.Wait(); - - size_t const byteCount = datasets[0]->NumBytes() / numDevices; - size_t const elemCount = numElements / 
numDevices; - - ncclGroupStart(); - // AllGather - for (int i = 0; i < ranks.size(); i++) - { - int rank = ranks[i]; - ncclAllGather((int8_t *)datasets[0]->inputs[rank] + (rank * byteCount), - datasets[0]->outputs[rank], elemCount, - dataType, comms[rank], streams[rank]); - } - - // AllReduce - for (int i = 0; i < ranks.size(); i++) - { - int rank = ranks[i]; - ncclAllReduce(datasets[1]->inputs[rank], datasets[1]->outputs[rank], - numElements, dataType, op, comms[rank], streams[rank]); - } - - // Broadcast - for (int i = 0; i < ranks.size(); i++) - { - int rank = ranks[i]; - ncclBroadcast(datasets[2]->inputs[rank], - datasets[2]->outputs[rank], - numElements, dataType, - root, comms[rank], streams[rank]); - } - - // Reduce - for (int i = 0; i < ranks.size(); i++) - { - int rank = ranks[i]; - ncclReduce(datasets[3]->inputs[rank], - datasets[3]->outputs[rank], - numElements, dataType, op, - root, comms[rank], streams[rank]); - } - - // ReduceScatter - for (int i = 0; i < ranks.size(); i++) - { - int rank = ranks[i]; - ncclReduceScatter(datasets[4]->inputs[rank], - (int8_t *)datasets[4]->outputs[rank] + (rank * byteCount), - elemCount, dataType, op, - comms[rank], streams[rank]); - } - - // Signal end of group call - ncclGroupEnd(); - - for (int i = 0; i < ranks.size(); i++) - { - HIP_CALL(hipSetDevice(ranks[i])); - HIP_CALL(hipStreamSynchronize(streams[ranks[i]])); - } - - for (int i = 0; i < funcs.size(); i++) - { - for (int j = 0; j < ranks.size(); j++) - { - pass = ValidateResults(*datasets[i], ranks[j], root); - if (!pass) - { - break; - } - } - barrier.Wait(); - for (int j = 0; j < ranks.size(); j++) - { - datasets[i]->Release(ranks[j]); - } - } - - for (int i = 0; i < ranks.size(); i++) - { - TearDownPerProcess(comms[ranks[i]], streams[ranks[i]]); - } - } - }; -} - -#endif diff --git a/test/test_Reduce.cpp b/test/test_Reduce.cpp deleted file mode 100644 index 35b4576e9e..0000000000 --- a/test/test_Reduce.cpp +++ /dev/null @@ -1,71 +0,0 @@ 
-/************************************************************************* - * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "test_Reduce.hpp" - -namespace CorrectnessTests -{ - TEST_P(ReduceCorrectnessTest, Correctness) - { - if (numDevices > numDevicesAvailable) return; - - // Allocate data - Dataset dataset; - dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollReduce); - - // Test each possible root - for (int root = 0; root < numDevices; root++) - { - // Prepare input / output / expected results - FillDatasetWithPattern(dataset); - ComputeExpectedResults(dataset, op, root); - - // Launch the reduction (1 thread per GPU) - ncclGroupStart(); - for (int i = 0; i < numDevices; i++) - { - ncclReduce(dataset.inputs[i], - dataset.outputs[i], - numElements, dataType, op, - root, comms[i], streams[i]); - } - ncclGroupEnd(); - - // Wait for reduction to complete - Synchronize(); - - // Check results - ValidateResults(dataset); - } - - dataset.Release(); - } - - INSTANTIATE_TEST_SUITE_P(ReduceCorrectnessSweep, - ReduceCorrectnessTest, - testing::Combine( - // Reduction operator - testing::Values(ncclSum, ncclProd, ncclMax, ncclMin, ncclAvg), - // Data types - testing::Values(ncclInt8, - ncclUint8, - ncclInt32, - ncclUint32, - ncclInt64, - ncclUint64, - //ncclFloat16, - ncclFloat32, - ncclFloat64, - ncclBfloat16), - // Number of elements - testing::Values(1024, 1048576), - // Number of devices - testing::Range(2,(GTESTS_NUM_GPUS+1)), - // In-place or not - testing::Values(false, true), - testing::Values("")), - CorrectnessTest::PrintToStringParamName()); -} // namespace diff --git a/test/test_Reduce.hpp b/test/test_Reduce.hpp deleted file mode 100644 index b4afdab4aa..0000000000 --- a/test/test_Reduce.hpp +++ /dev/null @@ -1,87 +0,0 @@ 
-/************************************************************************* - * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ -#ifndef TEST_REDUCE_HPP -#define TEST_REDUCE_HPP - -#include "CorrectnessTest.hpp" - -namespace CorrectnessTests -{ - class ReduceCorrectnessTest : public CorrectnessTest - { - public: - static void ComputeExpectedResults(Dataset& dataset, ncclRedOp_t const op, int const root) - { - // Copy all inputs to expected arrays temporarily to perform reduction on host - for (int i = 0; i < dataset.numDevices; i++) - HIP_CALL(hipMemcpy(dataset.expected[i], dataset.inputs[i], - dataset.NumBytes(), hipMemcpyDeviceToHost)); - - // Allocate temporary host array to accumulate results - int8_t* resultI1 = (int8_t *)malloc(dataset.NumBytes()); - uint8_t* resultU1 = (uint8_t *)resultI1; - int32_t* resultI4 = (int32_t *)resultI1; - uint32_t* resultU4 = (uint32_t *)resultI1; - int64_t* resultI8 = (int64_t *)resultI1; - uint64_t* resultU8 = (uint64_t *)resultI1; - float* resultF4 = (float *)resultI1; - double* resultF8 = (double *)resultI1; - rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1; - - // Initialize the result with the first device's array - memcpy(resultI1, dataset.expected[0], dataset.NumBytes()); - ncclRedOp_t red_op = ((op == ncclAvg) ? 
ncclSum : op); - - // Perform reduction on the other device arrays - for (int i = 1; i < dataset.numDevices; i++) - { - int8_t* arrayI1 = (int8_t *)dataset.expected[i]; - uint8_t* arrayU1 = (uint8_t *)arrayI1; - int32_t* arrayI4 = (int32_t *)arrayI1; - uint32_t* arrayU4 = (uint32_t *)arrayI1; - int64_t* arrayI8 = (int64_t *)arrayI1; - uint64_t* arrayU8 = (uint64_t *)arrayI1; - float* arrayF4 = (float *)arrayI1; - double* arrayF8 = (double *)arrayI1; - rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1; - - for (int j = 0; j < dataset.numElements; j++) - { - switch (dataset.dataType) - { - case ncclInt8: resultI1[j] = ReduceOp(red_op, resultI1[j], arrayI1[j]); break; - case ncclUint8: resultU1[j] = ReduceOp(red_op, resultU1[j], arrayU1[j]); break; - case ncclInt32: resultI4[j] = ReduceOp(red_op, resultI4[j], arrayI4[j]); break; - case ncclUint32: resultU4[j] = ReduceOp(red_op, resultU4[j], arrayU4[j]); break; - case ncclInt64: resultI8[j] = ReduceOp(red_op, resultI8[j], arrayI8[j]); break; - case ncclUint64: resultU8[j] = ReduceOp(red_op, resultU8[j], arrayU8[j]); break; - case ncclFloat32: resultF4[j] = ReduceOp(red_op, resultF4[j], arrayF4[j]); break; - case ncclFloat64: resultF8[j] = ReduceOp(red_op, resultF8[j], arrayF8[j]); break; - case ncclBfloat16: resultB2[j] = ReduceOp(red_op, resultB2[j], arrayB2[j]); break; - default: - fprintf(stderr, "[ERROR] Unsupported datatype\n"); - exit(0); - } - } - } - - if (op == ncclAvg) - Average(dataset, resultI1); - - // Copy results into expected arrays - for (int i = 0; i < dataset.numDevices; i++) - { - if (i == root) - memcpy(dataset.expected[root], resultI1, dataset.NumBytes()); - else - HIP_CALL(hipMemcpy(dataset.expected[i], dataset.outputs[i], dataset.NumBytes(), hipMemcpyDeviceToHost)); - } - free(resultI1); - } - }; -} - -#endif diff --git a/test/test_ReduceMultiProcess.cpp b/test/test_ReduceMultiProcess.cpp deleted file mode 100644 index c98cc5e0ef..0000000000 --- a/test/test_ReduceMultiProcess.cpp +++ /dev/null 
@@ -1,61 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "test_ReduceMultiProcess.hpp" - -namespace CorrectnessTests -{ - TEST_P(ReduceMultiProcessCorrectnessTest, Correctness) - { - dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollReduce); - std::vector pids(numDevices); - - int gpu = -1; - for (int i = 0; i < numDevices; i++) - { - gpu++; - int pid = fork(); - if (pid == 0) - { - bool pass; - TestReduce(gpu, *dataset, pass); - TerminateChildProcess(pass); - } - else - { - pids[gpu] = pid; - } - } - - ValidateProcesses(pids); - dataset->ReleaseRootProcess(); - } - - INSTANTIATE_TEST_SUITE_P(ReduceMultiProcessCorrectnessSweep, - ReduceMultiProcessCorrectnessTest, - testing::Combine( - // Reduction operator - testing::Values(ncclSum, ncclProd, ncclMax, ncclMin), - // Data types - testing::Values(ncclInt8, - ncclUint8, - ncclInt32, - ncclUint32, - ncclInt64, - ncclUint64, - //ncclFloat16, - ncclFloat32, - ncclFloat64, - ncclBfloat16), - // Number of elements - testing::Values(1024, 1048576), - // Number of devices - testing::Values(2,3,4,8), - // In-place or not - testing::Values(false, true), - testing::Values("")), - CorrectnessTest::PrintToStringParamName()); -} // namespace diff --git a/test/test_ReduceMultiProcess.hpp b/test/test_ReduceMultiProcess.hpp deleted file mode 100644 index d2b0ab045f..0000000000 --- a/test/test_ReduceMultiProcess.hpp +++ /dev/null @@ -1,131 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ -#ifndef TEST_REDUCE_MULTI_PROCESS_HPP -#define TEST_REDUCE_MULTI_PROCESS_HPP - -#include "CorrectnessTest.hpp" - -namespace CorrectnessTests -{ - class ReduceMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest - { - public: - static void ComputeExpectedResults(Dataset& dataset, Barrier& barrier, ncclRedOp_t const op, int const root, std::vector const& ranks) - { - // Copy all inputs to expected arrays temporarily to perform reduction on host - for (int i = 0; i < ranks.size(); i++) - { - int rank = ranks[i]; - HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.inputs[rank], - dataset.NumBytes(), hipMemcpyDeviceToHost)); - } - barrier.Wait(); - - for (int h = 0; h < ranks.size(); h++) - { - int rank = ranks[h]; - if (rank == root) - { - // Allocate temporary host array to accumulate results - int8_t* resultI1 = (int8_t *)malloc(dataset.NumBytes()); - uint8_t* resultU1 = (uint8_t *)resultI1; - int32_t* resultI4 = (int32_t *)resultI1; - uint32_t* resultU4 = (uint32_t *)resultI1; - int64_t* resultI8 = (int64_t *)resultI1; - uint64_t* resultU8 = (uint64_t *)resultI1; - float* resultF4 = (float *)resultI1; - double* resultF8 = (double *)resultI1; - rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1; - - // Initialize the result with the first device's array - memcpy(resultI1, dataset.expected[0], dataset.NumBytes()); - - // Perform reduction on the other device arrays - for (int i = 1; i < dataset.numDevices; i++) - { - int8_t* arrayI1 = (int8_t *)dataset.expected[i]; - uint8_t* arrayU1 = (uint8_t *)arrayI1; - int32_t* arrayI4 = (int32_t *)arrayI1; - uint32_t* arrayU4 = (uint32_t *)arrayI1; - int64_t* arrayI8 = (int64_t *)arrayI1; - uint64_t* arrayU8 = (uint64_t *)arrayI1; - float* arrayF4 = (float *)arrayI1; - double* arrayF8 = (double *)arrayI1; - rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1; - - for (int j = 0; j < 
dataset.numElements; j++) - { - switch (dataset.dataType) - { - case ncclInt8: resultI1[j] = ReduceOp(op, resultI1[j], arrayI1[j]); break; - case ncclUint8: resultU1[j] = ReduceOp(op, resultU1[j], arrayU1[j]); break; - case ncclInt32: resultI4[j] = ReduceOp(op, resultI4[j], arrayI4[j]); break; - case ncclUint32: resultU4[j] = ReduceOp(op, resultU4[j], arrayU4[j]); break; - case ncclInt64: resultI8[j] = ReduceOp(op, resultI8[j], arrayI8[j]); break; - case ncclUint64: resultU8[j] = ReduceOp(op, resultU8[j], arrayU8[j]); break; - case ncclFloat32: resultF4[j] = ReduceOp(op, resultF4[j], arrayF4[j]); break; - case ncclFloat64: resultF8[j] = ReduceOp(op, resultF8[j], arrayF8[j]); break; - case ncclBfloat16: resultB2[j] = ReduceOp(op, resultB2[j], arrayB2[j]); break; - default: - fprintf(stderr, "[ERROR] Unsupported datatype\n"); - exit(0); - } - } - } - memcpy(dataset.expected[root], resultI1, dataset.NumBytes()); - free(resultI1); - } - } - barrier.Wait(); - - for (int i = 0; i < ranks.size(); i++) - { - int rank = ranks[i]; - if (rank != root) - { - HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.outputs[rank], dataset.NumBytes(), hipMemcpyDeviceToHost)); - } - } - } - - void TestReduce(int rank, Dataset& dataset, bool& pass) - { - SetUpPerProcess(rank, ncclCollReduce, comms[rank], streams[rank], dataset); - - if (numDevices > numDevicesAvailable) - { - pass = true; - return; - } - - Barrier barrier(rank, numDevices, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID")))); - - // Test each possible root - for (int root = 0; root < numDevices; root++) - { - // Prepare input / output / expected results - FillDatasetWithPattern(dataset, rank); - ComputeExpectedResults(dataset, barrier, op, root, std::vector(1, rank)); - // Launch the reduction (1 process per GPU) - ncclResult_t res = ncclReduce(dataset.inputs[rank], - dataset.outputs[rank], - numElements, dataType, op, - root, comms[rank], streams[rank]); - // Wait for reduction to complete - 
HIP_CALL(hipStreamSynchronize(streams[rank])); - // Check results - pass = ValidateResults(dataset, rank); - // Ensure all processes have finished current iteration before proceeding - barrier.Wait(); - } - - TearDownPerProcess(comms[rank], streams[rank]); - dataset.Release(rank); - } - }; -} - -#endif diff --git a/test/test_ReduceScatter.cpp b/test/test_ReduceScatter.cpp deleted file mode 100644 index 7462a3ab6b..0000000000 --- a/test/test_ReduceScatter.cpp +++ /dev/null @@ -1,71 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "test_ReduceScatter.hpp" - -namespace CorrectnessTests -{ - TEST_P(ReduceScatterCorrectnessTest, Correctness) - { - // Adjust numElements to be multiple of numDevices - numElements = (numElements/numDevices)*numDevices; - if (numDevices > numDevicesAvailable) return; - if (numElements % numDevices != 0) return; - - // Prepare input / output / expected results - Dataset dataset; - dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollReduceScatter); - FillDatasetWithPattern(dataset); - ComputeExpectedResults(dataset, op); - - size_t const byteCount = dataset.NumBytes() / dataset.numDevices; - size_t const recvCount = dataset.numElements / dataset.numDevices; - - // Launch the reduction (1 thread per GPU) - ncclGroupStart(); - for (int i = 0; i < numDevices; i++) - { - ncclReduceScatter(dataset.inputs[i], - (int8_t *)dataset.outputs[i] + (i * byteCount), - recvCount, dataType, op, - comms[i], streams[i]); - } - ncclGroupEnd(); - - // Wait for reduction to complete - Synchronize(); - - // Check results - ValidateResults(dataset); - - dataset.Release(); - } - - INSTANTIATE_TEST_SUITE_P(ReduceScatterCorrectnessSweep, - ReduceScatterCorrectnessTest, - testing::Combine( - // Reduction 
operator - testing::Values(ncclSum, ncclProd, ncclMax, ncclMin, ncclAvg), - // Data types - testing::Values(ncclInt8, - ncclUint8, - ncclInt32, - ncclUint32, - ncclInt64, - ncclUint64, - //ncclFloat16, - ncclFloat32, - ncclFloat64, - ncclBfloat16), - // Number of elements - testing::Values(2520, 3026520), - // Number of devices - testing::Range(2,(GTESTS_NUM_GPUS+1)), - // In-place or not - testing::Values(false, true), - testing::Values("")), - CorrectnessTest::PrintToStringParamName()); -} // namespace diff --git a/test/test_ReduceScatter.hpp b/test/test_ReduceScatter.hpp deleted file mode 100644 index 3ba54b2983..0000000000 --- a/test/test_ReduceScatter.hpp +++ /dev/null @@ -1,90 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ -#ifndef TEST_REDUCE_SCATTER_HPP -#define TEST_REDUCE_SCATTER_HPP - -#include "CorrectnessTest.hpp" - -namespace CorrectnessTests -{ - class ReduceScatterCorrectnessTest : public CorrectnessTest - { - public: - static void ComputeExpectedResults(Dataset& dataset, ncclRedOp_t const op) - { - // Copy all inputs to expected arrays temporarily to perform reduction on host - for (int i = 0; i < dataset.numDevices; i++) - HIP_CALL(hipMemcpy(dataset.expected[i], dataset.inputs[i], - dataset.NumBytes(), hipMemcpyDeviceToHost)); - - // Allocate temporary host array to accumulate results - int8_t* resultI1 = (int8_t *)malloc(dataset.NumBytes()); - uint8_t* resultU1 = (uint8_t *)resultI1; - int32_t* resultI4 = (int32_t *)resultI1; - uint32_t* resultU4 = (uint32_t *)resultI1; - int64_t* resultI8 = (int64_t *)resultI1; - uint64_t* resultU8 = (uint64_t *)resultI1; - float* resultF4 = (float *)resultI1; - double* resultF8 = (double *)resultI1; - rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1; - - // Initialize the 
result with the first device's array - memcpy(resultI1, dataset.expected[0], dataset.NumBytes()); - ncclRedOp_t red_op = ((op == ncclAvg) ? ncclSum : op); - - // Perform reduction on the other device arrays - for (int i = 1; i < dataset.numDevices; i++) - { - int8_t* arrayI1 = (int8_t *)dataset.expected[i]; - uint8_t* arrayU1 = (uint8_t *)arrayI1; - int32_t* arrayI4 = (int32_t *)arrayI1; - uint32_t* arrayU4 = (uint32_t *)arrayI1; - int64_t* arrayI8 = (int64_t *)arrayI1; - uint64_t* arrayU8 = (uint64_t *)arrayI1; - float* arrayF4 = (float *)arrayI1; - double* arrayF8 = (double *)arrayI1; - rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1; - - for (int j = 0; j < dataset.numElements; j++) - { - switch (dataset.dataType) - { - case ncclInt8: resultI1[j] = ReduceOp(red_op, resultI1[j], arrayI1[j]); break; - case ncclUint8: resultU1[j] = ReduceOp(red_op, resultU1[j], arrayU1[j]); break; - case ncclInt32: resultI4[j] = ReduceOp(red_op, resultI4[j], arrayI4[j]); break; - case ncclUint32: resultU4[j] = ReduceOp(red_op, resultU4[j], arrayU4[j]); break; - case ncclInt64: resultI8[j] = ReduceOp(red_op, resultI8[j], arrayI8[j]); break; - case ncclUint64: resultU8[j] = ReduceOp(red_op, resultU8[j], arrayU8[j]); break; - case ncclFloat32: resultF4[j] = ReduceOp(red_op, resultF4[j], arrayF4[j]); break; - case ncclFloat64: resultF8[j] = ReduceOp(red_op, resultF8[j], arrayF8[j]); break; - case ncclBfloat16: resultB2[j] = ReduceOp(red_op, resultB2[j], arrayB2[j]); break; - default: - fprintf(stderr, "[ERROR] Unsupported datatype\n"); - exit(0); - } - } - } - - if (op == ncclAvg) - Average(dataset, resultI1); - - // Copy results into expected arrays - size_t const byteCount = dataset.NumBytes() / dataset.numDevices; - - for (int i = 0; i < dataset.numDevices; i++) - HIP_CALL(hipMemcpy(dataset.expected[i], dataset.outputs[i], - dataset.NumBytes(), hipMemcpyDeviceToHost)); - - for (int i = 0; i < dataset.numDevices; i++) - memcpy((int8_t *)dataset.expected[i] + (i * byteCount), - 
resultI1 + (i * byteCount), byteCount); - - free(resultI1); - } - }; -} - -#endif diff --git a/test/test_ReduceScatterMultiProcess.cpp b/test/test_ReduceScatterMultiProcess.cpp deleted file mode 100644 index 1d101712f5..0000000000 --- a/test/test_ReduceScatterMultiProcess.cpp +++ /dev/null @@ -1,61 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "test_ReduceScatterMultiProcess.hpp" - -namespace CorrectnessTests -{ - TEST_P(ReduceScatterMultiProcessCorrectnessTest, Correctness) - { - dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollReduceScatter); - std::vector pids(numDevices); - - int gpu = -1; - for (int i = 0; i < numDevices; i++) - { - gpu++; - int pid = fork(); - if (pid == 0) - { - bool pass; - TestReduceScatter(gpu, *dataset, pass); - TerminateChildProcess(pass); - } - else - { - pids[gpu] = pid; - } - } - - ValidateProcesses(pids); - dataset->ReleaseRootProcess(); - } - - INSTANTIATE_TEST_SUITE_P(ReduceScatterMultiProcessCorrectnessSweep, - ReduceScatterMultiProcessCorrectnessTest, - testing::Combine( - // Reduction operator - testing::Values(ncclSum, ncclProd, ncclMax, ncclMin), - // Data types - testing::Values(ncclInt8, - ncclUint8, - ncclInt32, - ncclUint32, - ncclInt64, - ncclUint64, - //ncclFloat16, - ncclFloat32, - ncclFloat64, - ncclBfloat16), - // Number of elements - testing::Values(3072, 3145728), - // Number of devices - testing::Values(2,3,4,8), - // In-place or not - testing::Values(false, true), - testing::Values("")), - CorrectnessTest::PrintToStringParamName()); -} // namespace diff --git a/test/test_ReduceScatterMultiProcess.hpp b/test/test_ReduceScatterMultiProcess.hpp deleted file mode 100644 index 5921c0bd94..0000000000 --- 
a/test/test_ReduceScatterMultiProcess.hpp +++ /dev/null @@ -1,146 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ -#ifndef TEST_REDUCE_SCATTER_MULTI_PROCESS_HPP -#define TEST_REDUCE_SCATTER_MULTI_PROCESS_HPP - -#include "CorrectnessTest.hpp" - -namespace CorrectnessTests -{ - class ReduceScatterMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest - { - public: - static void ComputeExpectedResults(Dataset& dataset, Barrier& barrier, ncclRedOp_t const op, std::vector const& ranks) - { - // Copy all inputs to expected arrays temporarily to perform reduction on host - for (int i = 0; i < ranks.size(); i++) - { - int rank = ranks[i]; - HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.inputs[rank], - dataset.NumBytes(), hipMemcpyDeviceToHost)); - } - barrier.Wait(); - - // Have rank 0 do the expected calculation, then send results to other processes - int8_t* resultI1; - for (int h = 0; h < ranks.size(); h++) - { - int rank = ranks[h]; - if (rank == 0) - { - // Allocate temporary host array to accumulate results - resultI1 = (int8_t *)malloc(dataset.NumBytes()); - uint8_t* resultU1 = (uint8_t *)resultI1; - int32_t* resultI4 = (int32_t *)resultI1; - uint32_t* resultU4 = (uint32_t *)resultI1; - int64_t* resultI8 = (int64_t *)resultI1; - uint64_t* resultU8 = (uint64_t *)resultI1; - float* resultF4 = (float *)resultI1; - double* resultF8 = (double *)resultI1; - rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1; - - // Initialize the result with the first device's array - memcpy(resultI1, dataset.expected[0], dataset.NumBytes()); - - // Perform reduction on the other device arrays - for (int i = 1; i < dataset.numDevices; i++) - { - int8_t* arrayI1 = (int8_t *)dataset.expected[i]; - uint8_t* arrayU1 = (uint8_t *)arrayI1; - 
int32_t* arrayI4 = (int32_t *)arrayI1; - uint32_t* arrayU4 = (uint32_t *)arrayI1; - int64_t* arrayI8 = (int64_t *)arrayI1; - uint64_t* arrayU8 = (uint64_t *)arrayI1; - float* arrayF4 = (float *)arrayI1; - double* arrayF8 = (double *)arrayI1; - rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1; - - for (int j = 0; j < dataset.numElements; j++) - { - switch (dataset.dataType) - { - case ncclInt8: resultI1[j] = ReduceOp(op, resultI1[j], arrayI1[j]); break; - case ncclUint8: resultU1[j] = ReduceOp(op, resultU1[j], arrayU1[j]); break; - case ncclInt32: resultI4[j] = ReduceOp(op, resultI4[j], arrayI4[j]); break; - case ncclUint32: resultU4[j] = ReduceOp(op, resultU4[j], arrayU4[j]); break; - case ncclInt64: resultI8[j] = ReduceOp(op, resultI8[j], arrayI8[j]); break; - case ncclUint64: resultU8[j] = ReduceOp(op, resultU8[j], arrayU8[j]); break; - case ncclFloat32: resultF4[j] = ReduceOp(op, resultF4[j], arrayF4[j]); break; - case ncclFloat64: resultF8[j] = ReduceOp(op, resultF8[j], arrayF8[j]); break; - case ncclBfloat16: resultB2[j] = ReduceOp(op, resultB2[j], arrayB2[j]); break; - default: - fprintf(stderr, "[ERROR] Unsupported datatype\n"); - exit(0); - } - } - } - } - } - barrier.Wait(); - // Copy results into expected arrays - size_t const byteCount = dataset.NumBytes() / dataset.numDevices; - - for (int i = 0; i < ranks.size(); i++) - { - int rank = ranks[i]; - HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.outputs[rank], - dataset.NumBytes(), hipMemcpyDeviceToHost)); - } - barrier.Wait(); - - for (int h = 0; h < ranks.size(); h++) - { - int rank = ranks[h]; - if (rank == 0) - { - for (int i = 0; i < dataset.numDevices; i++) - memcpy((int8_t *)dataset.expected[i] + (i * byteCount), - resultI1 + (i * byteCount), byteCount); - - free(resultI1); - } - } - } - - void TestReduceScatter(int rank, Dataset& dataset, bool& pass) - { - // Prepare input / output / expected results - SetUpPerProcess(rank, ncclCollAllGather, comms[rank], streams[rank], dataset); - - if 
(numDevices > numDevicesAvailable || numElements % numDevices != 0) - { - pass = true; - return; - } - - Barrier barrier(rank, numDevices, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID")))); - - // Prepare input / output / expected results - FillDatasetWithPattern(dataset, rank); - ComputeExpectedResults(dataset, barrier, op, std::vector(1, rank)); - - size_t const byteCount = dataset.NumBytes() / numDevices; - size_t const recvCount = dataset.numElements / numDevices; - - // Launch the reduction (1 process per GPU) - ncclReduceScatter(dataset.inputs[rank], - (int8_t *)dataset.outputs[rank] + (rank * byteCount), - recvCount, dataType, op, - comms[rank], streams[rank]); - - // Wait for reduction to complete - HIP_CALL(hipStreamSynchronize(streams[rank])); - - // Check results - pass = ValidateResults(dataset, rank); - - TearDownPerProcess(comms[rank], streams[rank]); - dataset.Release(rank); - } - }; -} - -#endif diff --git a/test/test_Scatter.cpp b/test/test_Scatter.cpp deleted file mode 100644 index 8ceec37589..0000000000 --- a/test/test_Scatter.cpp +++ /dev/null @@ -1,71 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "test_Scatter.hpp" - -namespace CorrectnessTests -{ - TEST_P(ScatterCorrectnessTest, Correctness) - { - if (numDevices > numDevicesAvailable) return; - - // Allocate data - Dataset dataset; - dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollScatter); - - // Test each possible root - for (int root = 0; root < numDevices; root++) - { - // Prepare input / output / expected results - FillDatasetWithPattern(dataset); - ComputeExpectedResults(dataset, root); - - // Launch the reduction (1 thread per GPU) - ncclGroupStart(); - for (int i = 0; i < numDevices; i++) - { - ncclScatter(dataset.inputs[i], - dataset.outputs[i], - numElements, dataType, - root, comms[i], streams[i]); - } - ncclGroupEnd(); - - // Wait for reduction to complete - Synchronize(); - - // Check results - ValidateResults(dataset); - } - - dataset.Release(); - } - - INSTANTIATE_TEST_SUITE_P(ScatterCorrectnessSweep, - ScatterCorrectnessTest, - testing::Combine( - // Reduction operator is not used - testing::Values(ncclSum), - // Data types - testing::Values(ncclInt8, - ncclUint8, - ncclInt32, - ncclUint32, - ncclInt64, - ncclUint64, - //ncclFloat16, - ncclFloat32, - ncclFloat64, - ncclBfloat16), - // Number of elements - testing::Values(1024, 1048576), - // Number of devices - testing::Range(2,(GTESTS_NUM_GPUS+1)), - // In-place or not - testing::Values(false), - testing::Values("")), - CorrectnessTest::PrintToStringParamName()); -} // namespace diff --git a/test/test_Scatter.hpp b/test/test_Scatter.hpp deleted file mode 100644 index a7e695c28b..0000000000 --- a/test/test_Scatter.hpp +++ /dev/null @@ -1,25 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ -#ifndef TEST_SCATTER_HPP -#define TEST_SCATTER_HPP - -#include "CorrectnessTest.hpp" - -namespace CorrectnessTests -{ - class ScatterCorrectnessTest : public CorrectnessTest - { - public: - static void ComputeExpectedResults(Dataset& dataset, int const root) - { - for (int i = 0; i < dataset.numDevices; i++) - HIP_CALL(hipMemcpy(dataset.expected[i], (int8_t *)dataset.inputs[root]+dataset.NumBytes()*i, - dataset.NumBytes(), hipMemcpyDeviceToHost)); - } - }; -} - -#endif diff --git a/test/test_ScatterMultiProcess.cpp b/test/test_ScatterMultiProcess.cpp deleted file mode 100644 index b23bc2f599..0000000000 --- a/test/test_ScatterMultiProcess.cpp +++ /dev/null @@ -1,61 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "test_ScatterMultiProcess.hpp" - -namespace CorrectnessTests -{ - TEST_P(ScatterMultiProcessCorrectnessTest, Correctness) - { - dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollScatter); - std::vector pids(numDevices); - - int gpu = -1; - for (int i = 0; i < numDevices; i++) - { - gpu++; - int pid = fork(); - if (pid == 0) - { - bool pass; - TestScatter(gpu, *dataset, pass); - TerminateChildProcess(pass); - } - else - { - pids[gpu] = pid; - } - } - - ValidateProcesses(pids); - dataset->ReleaseRootProcess(); - } - - INSTANTIATE_TEST_SUITE_P(ScatterMultiProcessCorrectnessSweep, - ScatterMultiProcessCorrectnessTest, - testing::Combine( - // Reduction operator is not used - testing::Values(ncclSum), - // Data types - testing::Values(ncclInt8, - ncclUint8, - ncclInt32, - ncclUint32, - ncclInt64, - ncclUint64, - //ncclFloat16, - ncclFloat32, - ncclFloat64, - 
ncclBfloat16), - // Number of elements - testing::Values(1024, 1048576), - // Number of devices - testing::Values(2,3,4,8), - // In-place or not - testing::Values(false), - testing::Values("")), - CorrectnessTest::PrintToStringParamName()); -} // namespace diff --git a/test/test_ScatterMultiProcess.hpp b/test/test_ScatterMultiProcess.hpp deleted file mode 100644 index 332774f3d3..0000000000 --- a/test/test_ScatterMultiProcess.hpp +++ /dev/null @@ -1,68 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ -#ifndef TEST_SCATTER_MULTI_PROCESS_HPP -#define TEST_SCATTER_MULTI_PROCESS_HPP - -#include "CorrectnessTest.hpp" - -namespace CorrectnessTests -{ - class ScatterMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest - { - public: - static void ComputeExpectedResults(Dataset& dataset, int const root, int const rank) - { - if (rank == root) - { - for (int i = 0; i < dataset.numDevices; i++) - HIP_CALL(hipMemcpy(dataset.expected[i], (int8_t *)dataset.inputs[root]+dataset.NumBytes()*i, - dataset.NumBytes(), hipMemcpyDeviceToHost)); - } - } - - void TestScatter(int rank, Dataset& dataset, bool& pass) - { - // Prepare input / output / expected results - SetUpPerProcess(rank, ncclCollScatter, comms[rank], streams[rank], dataset); - - if (numDevices > numDevicesAvailable) - { - pass = true; - return; - } - - Barrier barrier(rank, numDevices, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID")))); - - // Test each possible root - for (int root = 0; root < numDevices; root++) - { - // Prepare input / output / expected results - FillDatasetWithPattern(dataset, rank); - - ComputeExpectedResults(dataset, root, rank); - - // Launch the reduction (1 process per GPU) - ncclScatter(dataset.inputs[rank], - dataset.outputs[rank], - 
numElements, dataType, - root, comms[rank], streams[rank]); - - // Wait for reduction to complete - HIP_CALL(hipStreamSynchronize(streams[rank])); - - // Check results - pass = ValidateResults(dataset, rank); - - barrier.Wait(); - } - - TearDownPerProcess(comms[rank], streams[rank]); - dataset.Release(rank); - } - }; -} - -#endif