From a85f71a4219a6cfa0bf1b7fe25e9891c7320e319 Mon Sep 17 00:00:00 2001 From: Pedram Alizadeh Date: Wed, 14 Dec 2022 11:28:40 -0500 Subject: [PATCH] Revert "UnitTest: add test cases for 2.14 API (ncclCommInitRankConfig and ncclCommFinalize for non-blocking communicator) (#662)" (#666) This reverts commit f29aa66d4ff2d4859dda202361beb40da252f297. [ROCm/rccl commit: 54a3da04ebe9518280c3def4007a1c91341ba746] --- .../rccl/test/AllReduce_NonBlockingConf.cpp | 64 ----------------- projects/rccl/test/CMakeLists.txt | 2 - projects/rccl/test/common/TestBed.cpp | 10 +-- projects/rccl/test/common/TestBed.hpp | 8 +-- projects/rccl/test/common/TestBedChild.cpp | 69 +------------------ projects/rccl/test/common/TestBedChild.hpp | 2 - 6 files changed, 9 insertions(+), 146 deletions(-) delete mode 100644 projects/rccl/test/AllReduce_NonBlockingConf.cpp diff --git a/projects/rccl/test/AllReduce_NonBlockingConf.cpp b/projects/rccl/test/AllReduce_NonBlockingConf.cpp deleted file mode 100644 index 2730b719c3..0000000000 --- a/projects/rccl/test/AllReduce_NonBlockingConf.cpp +++ /dev/null @@ -1,64 +0,0 @@ -/************************************************************************* - * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ -#include "TestBed.hpp" -namespace RcclUnitTesting -{ - TEST(AllReduce, NonBlocking) - { - TestBed testBed; - // Configuration - ncclFunc_t const funcType = ncclCollAllReduce; - std::vector const& dataTypes = {ncclFloat}; - std::vector const& redOps = {ncclSum}; - std::vector const numElements = {1048576, 1024}; - bool const inPlace = false; - bool const useManagedMem = false; - bool const useBlocking = false; - - OptionalColArgs options; - // Terminate the test as soon as first failure occurs - bool isCorrect = true; - for (int totalRanks = testBed.ev.minGpus; totalRanks <= testBed.ev.maxGpus && isCorrect; ++totalRanks) - for (int isMultiProcess = 0; isMultiProcess <= 1 && isCorrect; ++isMultiProcess) - { - if (!(testBed.ev.processMask & (1 << isMultiProcess))) continue; - - // Test either single process all GPUs, or 1 process per GPU - int const numProcesses = isMultiProcess ? totalRanks : 1; - testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), 1, useBlocking); - - for (int redOpIdx = 0; redOpIdx < redOps.size() && isCorrect; ++redOpIdx) - { - options.redOp = redOps[redOpIdx]; - for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx) - { - if (testBed.ev.showNames) - INFO("%s %d-ranks AllReduce %s Blocking Config (%s-%s)\n", - isMultiProcess ? "MP" : "SP", - totalRanks, useBlocking ? "true" : "false", - ncclRedOpNames[redOps[redOpIdx]], ncclDataTypeNames[dataTypes[dataIdx]]); - - - for (int numIdx = 0; numIdx < numElements.size() && isCorrect; ++numIdx) - { - testBed.SetCollectiveArgs(funcType, - dataTypes[dataIdx], - numElements[numIdx], - numElements[numIdx], - options); - } - testBed.AllocateMem(inPlace, useManagedMem); - testBed.PrepareData(); - testBed.ExecuteCollectives(); - testBed.ValidateResults(isCorrect); - testBed.DeallocateMem(); - } - } - testBed.DestroyComms(); - } - testBed.Finalize(); - } -} diff --git a/projects/rccl/test/CMakeLists.txt b/projects/rccl/test/CMakeLists.txt index ff40106293..758c8d09e9 100644 --- a/projects/rccl/test/CMakeLists.txt +++ b/projects/rccl/test/CMakeLists.txt @@ -46,7 +46,6 @@ if(BUILD_TESTS) set(TEST_SOURCE_FILES AllReduce_Clique.cpp AllReduce_GroupCall.cpp - AllReduce_NonBlockingConf.cpp AllReduce_InPlace.cpp AllReduce_ManagedMem.cpp AllReduce_OutOfPlace.cpp @@ -58,7 +57,6 @@ if(BUILD_TESTS) #AllReduce AllReduce_Clique.cpp AllReduce_GroupCall.cpp - AllReduce_NonBlockingConf.cpp AllReduce_InPlace.cpp AllReduce_ManagedMem.cpp AllReduce_OutOfPlace.cpp diff --git a/projects/rccl/test/common/TestBed.cpp b/projects/rccl/test/common/TestBed.cpp index f922929237..f65a1ee59b 100644 --- a/projects/rccl/test/common/TestBed.cpp +++ b/projects/rccl/test/common/TestBed.cpp @@ -85,13 +85,12 @@ namespace RcclUnitTesting } void TestBed::InitComms(std::vector> const& deviceIdsPerProcess, - int const numCollectivesInGroup, bool const useBlocking) + int const numCollectivesInGroup) { // Count up the total number of GPUs to use and track child/deviceId per rank this->numActiveChildren = deviceIdsPerProcess.size(); this->numActiveRanks = 0; this->numCollectivesInGroup = numCollectivesInGroup; - this->useBlocking = useBlocking; this->rankToChildMap.clear(); this->rankToDeviceMap.clear(); if (ev.verbose) INFO("Setting up %d active child processes\n", this->numActiveChildren); @@ -140,9 +139,6 @@ namespace RcclUnitTesting // Send the number of collectives to be run per group call PIPE_WRITE(childId, numCollectivesInGroup); - // Send the RCCL communication with blocking or non-blocking option - PIPE_WRITE(childId, useBlocking); - // Send whether to use MultiRank interfaces or not. PIPE_WRITE(childId, useMulti); @@ -163,9 +159,9 @@ namespace RcclUnitTesting } } - void TestBed::InitComms(int const numGpus, int const numCollectivesInGroup, bool const useBlocking) + void TestBed::InitComms(int const numGpus, int const numCollectivesInGroup) { - InitComms(TestBed::GetDeviceIdsList(1, numGpus), numCollectivesInGroup, useBlocking); + InitComms(TestBed::GetDeviceIdsList(1, numGpus), numCollectivesInGroup); } void TestBed::SetCollectiveArgs(ncclFunc_t const funcType, diff --git a/projects/rccl/test/common/TestBed.hpp b/projects/rccl/test/common/TestBed.hpp index ccb22a177e..e72daebae3 100644 --- a/projects/rccl/test/common/TestBed.hpp +++ b/projects/rccl/test/common/TestBed.hpp @@ -25,7 +25,7 @@ namespace RcclUnitTesting int numActiveChildren; // List of active children (with usable RCCL comms) int numActiveRanks; // Current # of ranks in use int numCollectivesInGroup; // # of collectives to execute per group call - bool useBlocking; // RCCL communication with blocking or non-blocking option + EnvVars ev; // Environment variables // Constructor - Creates one child process per detected GPU device that waits for further commands @@ -33,11 +33,11 @@ namespace RcclUnitTesting // Prepare TestBed for use with GPUs across multiple child processes void InitComms(std::vector> const& deviceIdsPerChild, - int const numCollectivesInGroup = 1, bool const useBlocking = true); - + int const numCollectivesInGroup = 1); + // Prepare TestBed for use with GPUs on a single child process void InitComms(int const numGpus, - int const numCollectivesInGroup = 1, bool const useBlocking = true); + int const numCollectivesInGroup = 1); // Set collectives arguments for specified collective / rank // Setting scalarsPerRank to non-null will create custom reduction operator diff --git a/projects/rccl/test/common/TestBedChild.cpp b/projects/rccl/test/common/TestBedChild.cpp index b829530123..d6437effc5 100644 --- a/projects/rccl/test/common/TestBedChild.cpp +++ b/projects/rccl/test/common/TestBedChild.cpp @@ -20,26 +20,6 @@ } \ } -#define CHILD_NCCL_CALL_NON_BLOCKING(msg) \ - { \ - for (int i = 0; i < this->comms.size(); ++i) \ - { \ - ncclResult_t ncclAsyncErr; \ - int loop_counter = 0; \ - do \ - { \ - loop_counter++; \ - if (loop_counter == MAX_LOOP_COUNTER) break; \ - ncclCommGetAsyncError(this->comms[i], &ncclAsyncErr); \ - } while(ncclAsyncErr == ncclInProgress); \ - if (ncclAsyncErr != ncclSuccess) \ - { \ - ERROR("Child process %d fails NCCL call %s with code %d\n", this->childId, msg, ncclAsyncErr); \ - return TEST_FAIL; \ - } \ - } \ - } - #define PIPE_READ(val) \ if (read(childReadFd, &val, sizeof(val)) != sizeof(val)) return TEST_FAIL; @@ -146,7 +126,6 @@ namespace RcclUnitTesting PIPE_READ(this->totalRanks); PIPE_READ(this->rankOffset); PIPE_READ(this->numCollectivesInGroup); - PIPE_READ(this->useBlocking); bool useMultiRankPerGpu; PIPE_READ(useMultiRankPerGpu); @@ -198,18 +177,6 @@ namespace RcclUnitTesting break; } } - else if (this->useBlocking == false) - { - // When non-blocking communicator is desired call ncclCommInitRankConfig with appropriate flag - ncclConfig_t config = NCCL_CONFIG_INITIALIZER; - config.blocking = 0; - if (ncclCommInitRankConfig(&this->comms[localRank], this->totalRanks, id, globalRank, &config) != ncclSuccess) - { - ERROR("Rank %d on child %d unable to call ncclCommInitRankConfig\n", globalRank, this->childId); - status = TEST_FAIL; - break; - } - } else { if (ncclCommInitRank(&this->comms[localRank], this->totalRanks, id, globalRank) != ncclSuccess) @@ -220,26 +187,10 @@ namespace RcclUnitTesting } } } - if (this->useBlocking == false) - { - CHILD_NCCL_CALL_NON_BLOCKING("ncclCommGetAsyncErrorInitRankConfig"); - } if (status == TEST_SUCCESS) - { - // Check if the communicator is non-blocking - if (this->useBlocking == false) - { - // handle the ncclGroupEnd in case of non-blocking communication - ncclResult_t Group_End_state = ncclGroupEnd(); - if (Group_End_state != ncclSuccess) CHILD_NCCL_CALL_NON_BLOCKING("ncclCommGetAsyncErrorGroup"); - } - else - { - // In case of blocking communication just call ncclGroupEnd - CHILD_NCCL_CALL(ncclGroupEnd(), "ncclGroupEnd"); - } + { + CHILD_NCCL_CALL(ncclGroupEnd(), "ncclGroupStart"); } - if (this->verbose) INFO("Child %d finishes InitComms() [%s]\n", this->childId, status == TEST_SUCCESS ? "SUCCESS" : "FAIL"); return status; @@ -729,22 +680,6 @@ namespace RcclUnitTesting if (this->verbose) INFO("Child %d begins DestroyComms\n", this->childId); // Release comms - for (int i = 0; i < this->comms.size(); ++i) - { - // Check if the communicator is non-blocking - if (this->useBlocking == false) - { - // handle the non-blocking case - ncclCommFinalize(this->comms[i]); - CHILD_NCCL_CALL_NON_BLOCKING("ncclCommGetAsyncErrorCommFinalize"); - } - else - { - // In case of blocking just call Finalize - CHILD_NCCL_CALL(ncclCommFinalize(this->comms[i]), "ncclCommFinalize"); - } - } - for (int i = 0; i < this->comms.size(); ++i) { CHILD_NCCL_CALL(ncclCommDestroy(this->comms[i]), "ncclCommDestroy"); diff --git a/projects/rccl/test/common/TestBedChild.hpp b/projects/rccl/test/common/TestBedChild.hpp index 2a0e43e6a6..43b511317e 100644 --- a/projects/rccl/test/common/TestBedChild.hpp +++ b/projects/rccl/test/common/TestBedChild.hpp @@ -12,7 +12,6 @@ #include "rccl/rccl.h" #define MAX_RANKS 32 -#define MAX_LOOP_COUNTER 1000000000 namespace RcclUnitTesting { class TestBedChild @@ -64,7 +63,6 @@ namespace RcclUnitTesting int totalRanks; // Total ranks int rankOffset; // Global rank offset for this child int numCollectivesInGroup; // # of collectives to run per group call - bool useBlocking; // RCCL communication with blocking or non-blocking option std::vector comms; // RCCL communicators for each rank std::vector deviceIds; // Device IDs for each rank std::vector streams; // Streams for executing collectives