/************************************************************************* * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License. * * See LICENSE.txt for license information ************************************************************************/ #include #include "TestBed.hpp" extern "C" bool mscclUnitTestMode() { return true; } namespace RcclUnitTesting { TEST(AllReduce, MscclSingleCall) { TestBed testBed; // Configuration std::vector const funcTypes = {ncclCollAllReduce}; std::vector const dataTypes = {ncclInt8, ncclInt32, ncclFloat32}; std::vector const redOps = {ncclSum, ncclProd}; std::vector const roots = {0}; std::vector const numElements = {384 * 32 * 32, 384 * 32, 384}; std::vector const inPlaceList = {true, false}; std::vector const managedMemList = {true, false}; std::vector const useHipGraphList = {false, true}; testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList, useHipGraphList); testBed.Finalize(); } TEST(AllReduce, MscclGroupCall) { TestBed testBed; // Configuration ncclFunc_t const funcType = ncclCollAllReduce; std::vector const& dataTypes = {ncclFloat}; std::vector const& redOps = {ncclSum}; std::vector const numElements = {384}; bool const inPlace = false; bool const useManagedMem = false; int const numCollPerGroup = numElements.size(); OptionalColArgs options; // This tests runs 3 collectives in the same group call bool isCorrect = true; for (int totalRanks = testBed.ev.minGpus; totalRanks <= testBed.ev.maxGpus && isCorrect; ++totalRanks) for (int isMultiProcess = 0; isMultiProcess <= 1 && isCorrect; ++isMultiProcess) { if (!(testBed.ev.processMask & (1 << isMultiProcess))) continue; // Test either single process all GPUs, or 1 process per GPU int const numProcesses = isMultiProcess ? totalRanks : 1; testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup); for (int redOpIdx = 0; redOpIdx < redOps.size() && isCorrect; ++redOpIdx) { options.redOp = redOps[redOpIdx]; for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx) { if (testBed.ev.showNames) INFO("%s %d-ranks AllReduce %d Grouped Calls (%s-%s)\n", isMultiProcess ? "MP" : "SP", totalRanks, numCollPerGroup, ncclRedOpNames[redOps[redOpIdx]], ncclDataTypeNames[dataTypes[dataIdx]]); // Run all element sizes in parallel as single group for (int collIdx = 0; collIdx < numCollPerGroup; ++collIdx) { testBed.SetCollectiveArgs(funcType, dataTypes[dataIdx], numElements[collIdx], numElements[collIdx], options, collIdx); } testBed.AllocateMem(inPlace, useManagedMem); testBed.PrepareData(); testBed.ExecuteCollectives(); testBed.ValidateResults(isCorrect); testBed.DeallocateMem(); } } testBed.DestroyComms(); } testBed.Finalize(); } TEST(AllReduce, MscclPreMultScalar) { TestBed testBed; // Configuration ncclFunc_t const funcType = ncclCollAllReduce; std::vector const& dataTypes = {ncclInt32, ncclFloat32, ncclFloat64}; ncclRedOp_t const redOp = ncclSum; std::vector const numElements = {384 * 32 * 32, 384 * 32, 384}; bool const inPlace = false; bool const useManagedMem = false; OptionalColArgs options; // Terminate the test as soon as first failure occurs bool isCorrect = true; for (int totalRanks = testBed.ev.minGpus; totalRanks <= testBed.ev.maxGpus && isCorrect; ++totalRanks) for (int isMultiProcess = 0; isMultiProcess <= 1; ++isMultiProcess) { if (!(testBed.ev.processMask & (1 << isMultiProcess))) continue; int const numProcesses = isMultiProcess ? totalRanks : 1; testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks)); for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx) { ncclDataType_t const dataType = dataTypes[dataIdx]; // Set scalars per rank PtrUnion scalarsPerRank; scalarsPerRank.AllocateCpuMem(totalRanks * DataTypeToBytes(dataType)); for (int i = 0; i < totalRanks; i++) { double F = i; scalarsPerRank.Set(dataType, i, i, F); } int const numBytes = totalRanks * DataTypeToBytes(dataType); memcpy(options.scalarTransport.ptr, scalarsPerRank.ptr, numBytes); // Test various scalar residence modes for (int scalarMode = 0; scalarMode <= 1 && isCorrect; ++scalarMode) { if (testBed.ev.showNames) INFO("%s %d-ranks AllReduce (custom-scalar Mode %d %s)\n", isMultiProcess ? "MP" : "SP", totalRanks, scalarMode, ncclDataTypeNames[dataType]); for (int i = 0; i < numElements.size() && isCorrect; ++i) { options.scalarMode = scalarMode; options.redOp = redOp; testBed.SetCollectiveArgs(funcType, dataType, numElements[i], numElements[i], options); // For performance, only allocate and prepare data on largest size if (i == 0) { testBed.AllocateMem(inPlace, useManagedMem); testBed.PrepareData(); } testBed.ExecuteCollectives(); testBed.ValidateResults(isCorrect); } testBed.DeallocateMem(); } } testBed.DestroyComms(); } testBed.Finalize(); } TEST(AllReduce, MscclMultiStream) { TestBed testBed; // Configuration int const numElements = 384 * 1024; bool const inPlace = false; bool const useManagedMem = false; OptionalColArgs options; // This test runs multiple AllReduce collectives on different streams within the same group call bool isCorrect = true; for (int totalRanks = testBed.ev.minGpus; totalRanks <= testBed.ev.maxGpus && isCorrect; ++totalRanks) for (int isMultiProcess = 0; isMultiProcess <= 1 && isCorrect; ++isMultiProcess) { if (!(testBed.ev.processMask & (1 << isMultiProcess))) continue; // Test either single process all GPUs, or 1 process per GPU int const numProcesses = isMultiProcess ? totalRanks : 1; for (int numCollPerGroup = 2; numCollPerGroup <= 6; numCollPerGroup += 2) { for (int numStreamsPerGroup = numCollPerGroup; numStreamsPerGroup >= 2; numStreamsPerGroup -= 3) { if (testBed.ev.showNames) INFO("%s %d-ranks Multistream %d-Group Calls across %d streams\n", isMultiProcess ? "MP" : "SP", totalRanks, numCollPerGroup, numStreamsPerGroup); testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup, false, numStreamsPerGroup); // Set up each collective in group in different stream (modulo numStreamsPerGroup) options.redOp = ncclSum; for (int collIdx = 0; collIdx < numCollPerGroup; ++collIdx) { testBed.SetCollectiveArgs(ncclCollAllReduce, ncclFloat, numElements, numElements, options, collIdx, -1, collIdx % numStreamsPerGroup); } testBed.AllocateMem(inPlace, useManagedMem); testBed.PrepareData(); testBed.ExecuteCollectives(); testBed.ValidateResults(isCorrect); testBed.DeallocateMem(); testBed.DestroyComms(); } } } testBed.Finalize(); } }