Unit test refactor (#500)

Refactoring and consolidating single-process / multi-process unit testing
2022-02-25 08:59:07 -07:00
commit 29ad0f5fbe
@@ -21,7 +21,7 @@ def runTestCommand (platform, project, gfilter)
    def command = """#!/usr/bin/env bash
                set -x
                cd ${project.paths.project_build_prefix}/build/release/test
-                ${sudo} NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 ./UnitTests --gtest_filter=${gfilter} --gtest_output=xml --gtest_color=yes
+                ${sudo} UT_SHOW_NAMES=1 HSA_FORCE_FINE_GRAIN_PCIE=1 ./UnitTests --gtest_filter=${gfilter} --gtest_output=xml --gtest_color=yes
            """

   platform.runCommand(this, command)
@@ -9,12 +9,12 @@ import com.amd.project.*
 import com.amd.docker.*
 import java.nio.file.Path

-def runCI = 
+def runCI =
 {
    nodeDetails, jobName->

    def prj  = new rocProject('rccl', 'PreCheckin')
-    
+
    prj.timeout.test = 1440
    prj.paths.build_command = './install.sh -t '

@@ -32,25 +32,25 @@ def runCI =
        commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
        commonGroovy.runCompileCommand(platform, project, jobName)
    }
-    
+
    def testCommand =
    {
        platform, project->

-        commonGroovy.runTestCommand(platform, project, "*sum_float32*")
+        commonGroovy.runTestCommand(platform, project, "*")
    }

    def packageCommand =
    {
        platform, project->
-        
+
        commonGroovy.runPackageCommand(platform, project, jobName)
    }

    buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
 }

-ci: { 
+ci: {
    String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)

    def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]]
@@ -58,17 +58,17 @@ ci: {
    propertyList = auxiliary.appendPropertyList(propertyList)

    def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([sles15sp1:['4gfx906'],centos8:['8gfx908'],centos7:['8gfx906'],ubuntu18:['4gfx906', '4gfx908']])]
-    
+
    jobNameList = auxiliary.appendJobNameList(jobNameList)
-    
-    propertyList.each 
+
+    propertyList.each
    {
        jobName, property->
        if (urlJobName == jobName)
            properties(auxiliary.addCommonProperties(property))
    }

-    jobNameList.each 
+    jobNameList.each
    {
        jobName, nodeDetails->
        if (urlJobName == jobName)
@@ -229,7 +229,7 @@ if ($run_tests); then
        if ($run_tests_all); then
            ./test/UnitTests
        else
-            ./test/UnitTests --gtest_filter="BroadcastCorrectnessSweep*:*float32*"
+            ./test/UnitTests --gtest_filter="AllReduce.*"
        fi
    else
        echo "Unit tests have not been built yet; please re-run script with -t to build unit tests."
@@ -96,6 +96,7 @@ void CliqueManager::CleanUp()
  if (m_cliqueMode == CLIQUE_SINGLE_NODE)
  {
    // Release caches
+    INFO(NCCL_COLL, "Rank %d deleting IPC caches", m_rank);
    if (m_ipcHandleSendCache) delete m_ipcHandleSendCache;
    if (m_ipcHandleRecvCache) delete m_ipcHandleRecvCache;

@@ -494,18 +495,24 @@ ncclResult_t CliqueManager::CheckCacheForPtr(void* devPtr,
  uint64_t realAddr = (uint64_t)devPtr;
  handlePair->second = realAddr - baseAddr;

+  CUDACHECK(hipIpcGetMemHandle(&handlePair->first, (void*)baseAddr));
+
+  /* Disabling cache until proper deallocation methods are available
  // IPC handles are only supported for base address pointers
  NcclIpcHandleSendCache::iterator it = cache->find(baseAddr);

   if (it == cache->end())
   {
+     INFO(NCCL_COLL, "Rank %d searching IPC handle cache for %p (not found)", rank, devPtr);
     CUDACHECK(hipIpcGetMemHandle(&handlePair->first, (void*)baseAddr));
     cache->insert(baseAddr, handlePair->first);
   }
   else
   {
+     INFO(NCCL_COLL, "Rank %d searching IPC handle cache for %p (found!)", rank, devPtr);
     handlePair->first = (it->second).first;
   }
+  */
   return ncclSuccess;
 }

@@ -513,10 +520,16 @@ ncclResult_t CliqueManager::CheckCacheForHandle(std::pair<hipIpcMemHandle_t, siz
                                                NcclIpcHandleRecvCache* cache,
                                                void** ptr)
 {
+  // Until proper deallocation hooks are implemented, receive cache can not be used
+  // Handles will need to be extract each time
+  void* baseAddr;
+  CUDACHECK(hipIpcOpenMemHandle(&baseAddr, handlePair.first, hipIpcMemLazyEnablePeerAccess));
+
+  /*
  NcclIpcHandleRecvCache::iterator it = cache->find(handlePair.first);

  // Get base address pointer from cache if it exists
-  void* baseAddr;
+
  if (it == cache->end())
  {
    CUDACHECK(hipIpcOpenMemHandle(&baseAddr, handlePair.first, hipIpcMemLazyEnablePeerAccess));
@@ -526,6 +539,7 @@ ncclResult_t CliqueManager::CheckCacheForHandle(std::pair<hipIpcMemHandle_t, siz
  {
    baseAddr = (it->second).first;
  }
+  */

  // Modify base address pointer with offset
  uint64_t realAddr = (uint64_t)baseAddr + handlePair.second;
@@ -79,14 +79,14 @@ public:
  static ncclResult_t BootstrapRootInit(int pid, unsigned long hash);

 protected:
-  static ncclResult_t CheckCacheForPtr(void* devPtr,
-                                       NcclIpcHandleSendCache* cache,
-                                       int rank,
-                                       std::pair<hipIpcMemHandle_t, size_t>* handlePair);
+  ncclResult_t CheckCacheForPtr(void* devPtr,
+				NcclIpcHandleSendCache* cache,
+				int rank,
+				std::pair<hipIpcMemHandle_t, size_t>* handlePair);

-  static ncclResult_t CheckCacheForHandle(std::pair<hipIpcMemHandle_t, size_t> const& handlePair,
-                                          NcclIpcHandleRecvCache* cache,
-                                          void** ptr);
+  ncclResult_t CheckCacheForHandle(std::pair<hipIpcMemHandle_t, size_t> const& handlePair,
+				   NcclIpcHandleRecvCache* cache,
+				   void** ptr);

  int                          m_rank;                               // Associated rank
  int                          m_numRanks;                           // Total number of ranks
@@ -0,0 +1,26 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "TestBed.hpp"
+
+namespace RcclUnitTesting
+{
+  TEST(AllGather, InPlace)
+  {
+    TestBed testBed;
+
+    // Configuration
+    std::vector<ncclFunc_t>     const funcTypes      = {ncclCollAllGather};
+    std::vector<ncclDataType_t> const dataTypes      = {ncclInt8, ncclInt32, ncclInt64};
+    std::vector<ncclRedOp_t>    const redOps         = {ncclSum};
+    std::vector<int>            const roots          = {0};
+    std::vector<int>            const numElements    = {1048576, 53327, 1024};
+    std::vector<bool>           const inPlaceList    = {true};
+    std::vector<bool>           const managedMemList = {false};
+
+    testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
+    testBed.Finalize();
+  }
+}
@@ -0,0 +1,26 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "TestBed.hpp"
+
+namespace RcclUnitTesting
+{
+  TEST(AllGather, ManagedMem)
+  {
+    TestBed testBed;
+
+    // Configuration
+    std::vector<ncclFunc_t>     const funcTypes      = {ncclCollAllGather};
+    std::vector<ncclDataType_t> const dataTypes      = {ncclUint8, ncclUint32, ncclUint64};
+    std::vector<ncclRedOp_t>    const redOps         = {ncclSum};
+    std::vector<int>            const roots          = {0};
+    std::vector<int>            const numElements    = {1048576, 53327, 1024};
+    std::vector<bool>           const inPlaceList    = {false};
+    std::vector<bool>           const managedMemList = {true};
+
+    testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
+    testBed.Finalize();
+  }
+}
@@ -0,0 +1,26 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "TestBed.hpp"
+
+namespace RcclUnitTesting
+{
+  TEST(AllGather, OutOfPlace)
+  {
+    TestBed testBed;
+
+    // Configuration
+    std::vector<ncclFunc_t>     const funcTypes      = {ncclCollAllGather};
+    std::vector<ncclDataType_t> const dataTypes      = {ncclFloat32, ncclFloat64, ncclBfloat16};
+    std::vector<ncclRedOp_t>    const redOps         = {ncclSum};
+    std::vector<int>            const roots          = {0};
+    std::vector<int>            const numElements    = {1048576, 53327, 1024};
+    std::vector<bool>           const inPlaceList    = {false};
+    std::vector<bool>           const managedMemList = {false};
+
+    testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
+    testBed.Finalize();
+  }
+}
@@ -0,0 +1,31 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "TestBed.hpp"
+#include <cstdlib>
+namespace RcclUnitTesting
+{
+  TEST(AllReduce, Clique)
+  {
+    // Set clique env var prior to TestBed
+    setenv("RCCL_ENABLE_CLIQUE", "1", 1);
+
+    TestBed testBed;
+
+    // Configuration
+    std::vector<ncclFunc_t>     const funcTypes      = {ncclCollAllReduce};
+    std::vector<ncclDataType_t> const dataTypes      = testBed.GetAllSupportedDataTypes();
+    std::vector<ncclRedOp_t>    const redOps         = testBed.GetAllSupportedRedOps();
+    std::vector<int>            const roots          = {0};
+    std::vector<int>            const numElements    = {1048576, 53327, 1024};
+    std::vector<bool>           const inPlaceList    = {false, true};
+    std::vector<bool>           const managedMemList = {false};
+
+    testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
+    testBed.Finalize();
+
+    unsetenv("RCCL_ENABLE_CLIQUE");
+  }
+}
@@ -0,0 +1,63 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "TestBed.hpp"
+
+namespace RcclUnitTesting
+{
+  TEST(AllReduce, GroupCall)
+  {
+    TestBed testBed;
+
+    // Configuration
+    ncclFunc_t                  const  funcType        = ncclCollAllReduce;
+    std::vector<ncclDataType_t> const& dataTypes       = {ncclFloat};
+    std::vector<ncclRedOp_t>    const& redOps          = {ncclSum};
+    std::vector<int>            const  numElements     = {1048576, 53327, 1024};
+    int                         const  root            = 0;
+    bool                        const  inPlace         = false;
+    bool                        const  useManagedMem   = false;
+    int                         const  numCollPerGroup = numElements.size();
+
+    // This tests runs 3 collectives in the same group call
+    bool isCorrect = true;
+    for (int totalRanks = testBed.ev.minGpus; totalRanks <= testBed.ev.maxGpus && isCorrect; ++totalRanks)
+    for (int isMultiProcess = 0; isMultiProcess <= 1 && isCorrect; ++isMultiProcess)
+    {
+      // Test either single process all GPUs, or 1 process per GPU
+      int const numProcesses = isMultiProcess ? totalRanks : 1;
+      testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup);
+
+      for (int redOpIdx = 0; redOpIdx < redOps.size() && isCorrect; ++redOpIdx)
+      for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
+      {
+        if (testBed.ev.showNames)
+          INFO("%s process %2d-ranks AllReduce %d Grouped Calls (%s-%s)\n",
+               isMultiProcess ? "Multi " : "Single",
+               totalRanks, numCollPerGroup,
+               ncclRedOpNames[redOps[redOpIdx]], ncclDataTypeNames[dataTypes[dataIdx]]);
+
+        // Run all element sizes in parallel as single group
+        for (int collIdx = 0; collIdx < numCollPerGroup; ++collIdx)
+        {
+          testBed.SetCollectiveArgs(funcType,
+                                    dataTypes[dataIdx],
+                                    redOps[redOpIdx],
+                                    root,
+                                    numElements[collIdx],
+                                    numElements[collIdx],
+                                    collIdx);
+        }
+        testBed.AllocateMem(inPlace, useManagedMem);
+        testBed.PrepareData();
+        testBed.ExecuteCollectives();
+        testBed.ValidateResults(isCorrect);
+        testBed.DeallocateMem();
+      }
+      testBed.DestroyComms();
+    }
+    testBed.Finalize();
+  }
+}
@@ -0,0 +1,26 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "TestBed.hpp"
+
+namespace RcclUnitTesting
+{
+  TEST(AllReduce, InPlace)
+  {
+    TestBed testBed;
+
+    // Configuration
+    std::vector<ncclFunc_t>     const funcTypes      = {ncclCollAllReduce};
+    std::vector<ncclDataType_t> const dataTypes      = {ncclInt8, ncclInt32, ncclFloat32};
+    std::vector<ncclRedOp_t>    const redOps         = {ncclSum, ncclProd};
+    std::vector<int>            const roots          = {0};
+    std::vector<int>            const numElements    = {1048576, 53327, 1024};
+    std::vector<bool>           const inPlaceList    = {true};
+    std::vector<bool>           const managedMemList = {false};
+
+    testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
+    testBed.Finalize();
+  }
+}
@@ -0,0 +1,26 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "TestBed.hpp"
+
+namespace RcclUnitTesting
+{
+  TEST(AllReduce, ManagedMem)
+  {
+    TestBed testBed;
+
+    // Configuration
+    std::vector<ncclFunc_t>     const funcTypes      = {ncclCollAllReduce};
+    std::vector<ncclDataType_t> const dataTypes      = {ncclFloat32, ncclUint8, ncclUint64};
+    std::vector<ncclRedOp_t>    const redOps         = {ncclSum, ncclMax};
+    std::vector<int>            const roots          = {0};
+    std::vector<int>            const numElements    = {1048576, 53327, 1024};
+    std::vector<bool>           const inPlaceList    = {false};
+    std::vector<bool>           const managedMemList = {true};
+
+    testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
+    testBed.Finalize();
+  }
+}
@@ -0,0 +1,26 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "TestBed.hpp"
+
+namespace RcclUnitTesting
+{
+  TEST(AllReduce, OutOfPlace)
+  {
+    TestBed testBed;
+
+    // Configuration
+    std::vector<ncclFunc_t>     const funcTypes      = {ncclCollAllReduce};
+    std::vector<ncclDataType_t> const dataTypes      = {ncclFloat32, ncclFloat64, ncclBfloat16};
+    std::vector<ncclRedOp_t>    const redOps         = {ncclSum, ncclMin};
+    std::vector<int>            const roots          = {0};
+    std::vector<int>            const numElements    = {1048576, 53327, 1024};
+    std::vector<bool>           const inPlaceList    = {false};
+    std::vector<bool>           const managedMemList = {false};
+
+    testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
+    testBed.Finalize();
+  }
+}
@@ -0,0 +1,74 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "TestBed.hpp"
+
+namespace RcclUnitTesting
+{
+  // This tests using custom pre-mult scalars reductions
+  TEST(AllReduce, PreMultScalar)
+  {
+    TestBed testBed;
+
+    // Configuration
+    ncclFunc_t                  const  funcType      = ncclCollAllReduce;
+    std::vector<ncclDataType_t> const& dataTypes     = {ncclInt32, ncclFloat32, ncclFloat64};
+    ncclRedOp_t                 const  redOp         = ncclSum;
+    std::vector<int>            const  numElements   = {1048576, 1024};
+    int                         const  root          = 0;
+    bool                        const  inPlace       = false;
+    bool                        const  useManagedMem = false;
+
+    // Terminate the test as soon as first failure occurs
+    bool isCorrect = true;
+    for (int totalRanks = testBed.ev.minGpus; totalRanks <= testBed.ev.maxGpus && isCorrect; ++totalRanks)
+    for (int isMultiProcess = 0; isMultiProcess <= 1; ++isMultiProcess)
+    {
+      int const numProcesses = isMultiProcess ? totalRanks : 1;
+      testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks));
+
+      for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
+      {
+        ncclDataType_t const dataType = dataTypes[dataIdx];
+
+        // Set scalars per rank
+        PtrUnion scalarsPerRank;
+        scalarsPerRank.AllocateCpuMem(totalRanks * DataTypeToBytes(dataType));
+        for (int i = 0;  i < totalRanks; i++)
+        {
+          double F = i;
+          scalarsPerRank.Set(dataType, i, i, F);
+        }
+
+        // Test various scalar residence modes
+        for (int scalarMode = 0; scalarMode <= 1 && isCorrect; ++scalarMode)
+        {
+          if (testBed.ev.showNames)
+            INFO("%s process %2d-ranks AllReduce (custom-scalar Mode %d %s)\n",
+                 isMultiProcess ? "Multi " : "Single",
+                 totalRanks, scalarMode, ncclDataTypeNames[dataType]);
+
+          for (int i = 0; i < numElements.size() && isCorrect; ++i)
+          {
+            testBed.SetCollectiveArgs(funcType, dataType, redOp, root,
+                                      numElements[i], numElements[i],
+                                      -1, -1, scalarsPerRank, scalarMode);
+            // For performance, only allocate and prepare data on largest size
+            if (i == 0)
+            {
+              testBed.AllocateMem(inPlace, useManagedMem);
+              testBed.PrepareData();
+            }
+            testBed.ExecuteCollectives();
+            testBed.ValidateResults(isCorrect);
+          }
+          testBed.DeallocateMem();
+        }
+      }
+      testBed.DestroyComms();
+    }
+    testBed.Finalize();
+  }
+}
@@ -0,0 +1,26 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "TestBed.hpp"
+
+namespace RcclUnitTesting
+{
+  TEST(AllToAll, ManagedMem)
+  {
+    TestBed testBed;
+
+    // Configuration
+    std::vector<ncclFunc_t>     const funcTypes      = {ncclCollAllToAll};
+    std::vector<ncclDataType_t> const dataTypes      = {ncclUint8, ncclUint32, ncclUint64};
+    std::vector<ncclRedOp_t>    const redOps         = {ncclSum};
+    std::vector<int>            const roots          = {0};
+    std::vector<int>            const numElements    = {1048576, 53327, 1024};
+    std::vector<bool>           const inPlaceList    = {false};
+    std::vector<bool>           const managedMemList = {true};
+
+    testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
+    testBed.Finalize();
+  }
+}
@@ -0,0 +1,26 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "TestBed.hpp"
+
+namespace RcclUnitTesting
+{
+  TEST(AllToAll, OutOfPlace)
+  {
+    TestBed testBed;
+
+    // Configuration
+    std::vector<ncclFunc_t>     const funcTypes      = {ncclCollAllToAll};
+    std::vector<ncclDataType_t> const dataTypes      = {ncclFloat32, ncclFloat64, ncclBfloat16};
+    std::vector<ncclRedOp_t>    const redOps         = {ncclSum};
+    std::vector<int>            const roots          = {0};
+    std::vector<int>            const numElements    = {1048576, 53327, 1024};
+    std::vector<bool>           const inPlaceList    = {false};
+    std::vector<bool>           const managedMemList = {false};
+
+    testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
+    testBed.Finalize();
+  }
+}
@@ -0,0 +1,26 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "TestBed.hpp"
+
+namespace RcclUnitTesting
+{
+  TEST(Broadcast, InPlace)
+  {
+    TestBed testBed;
+
+    // Configuration
+    std::vector<ncclFunc_t>     const funcTypes      = {ncclCollBroadcast};
+    std::vector<ncclDataType_t> const dataTypes      = {ncclInt8, ncclInt32, ncclInt64};
+    std::vector<ncclRedOp_t>    const redOps         = {ncclSum};
+    std::vector<int>            const roots          = {0};
+    std::vector<int>            const numElements    = {1048576, 53327, 1024};
+    std::vector<bool>           const inPlaceList    = {true};
+    std::vector<bool>           const managedMemList = {false};
+
+    testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
+    testBed.Finalize();
+  }
+}
@@ -0,0 +1,26 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "TestBed.hpp"
+
+namespace RcclUnitTesting
+{
+  TEST(Broadcast, ManagedMem)
+  {
+    TestBed testBed;
+
+    // Configuration
+    std::vector<ncclFunc_t>     const funcTypes      = {ncclCollBroadcast};
+    std::vector<ncclDataType_t> const dataTypes      = {ncclUint8, ncclUint32, ncclUint64};
+    std::vector<ncclRedOp_t>    const redOps         = {ncclSum};
+    std::vector<int>            const roots          = {0};
+    std::vector<int>            const numElements    = {1048576, 53327, 1024};
+    std::vector<bool>           const inPlaceList    = {false};
+    std::vector<bool>           const managedMemList = {true};
+
+    testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
+    testBed.Finalize();
+  }
+}
@@ -0,0 +1,26 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "TestBed.hpp"
+
+namespace RcclUnitTesting
+{
+  TEST(Broadcast, OutOfPlace)
+  {
+    TestBed testBed;
+
+    // Configuration
+    std::vector<ncclFunc_t>     const funcTypes      = {ncclCollBroadcast};
+    std::vector<ncclDataType_t> const dataTypes      = {ncclFloat32, ncclFloat64, ncclBfloat16};
+    std::vector<ncclRedOp_t>    const redOps         = {ncclSum};
+    std::vector<int>            const roots          = {1};
+    std::vector<int>            const numElements    = {1048576, 53327, 1024};
+    std::vector<bool>           const inPlaceList    = {false};
+    std::vector<bool>           const managedMemList = {false};
+
+    testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
+    testBed.Finalize();
+  }
+}
@@ -3,87 +3,103 @@ cmake_minimum_required(VERSION 2.8.12)

 if(BUILD_TESTS)

-  message("Going to build unit tests (Installed in /test/UnitTests)")
+  message("Building unit tests (Installed in /test/UnitTests)")

  find_program(CHRPATH chrpath)
  if(NOT CHRPATH)
      message(FATAL_ERROR "chrpath is required for UnitTests. Please install (e.g. sudo apt-get install chrpath)")
  endif()

-  include_directories(${GTEST_INCLUDE_DIRS})
+  find_package(hsa-runtime64 PATHS /opt/rocm )
+  if(${hsa-runtime64_FOUND})
+    message("hsa-runtime64 found @  ${hsa-runtime64_DIR} ")
+  else()
+    message("find_package did NOT find hsa-runtime64, finding it the OLD Way")
+    message("Looking for header files in ${ROCR_INC_DIR}")
+    message("Looking for library files in ${ROCR_LIB_DIR}")

+    # Search for ROCr header file in user defined locations
+    find_path(ROCR_HDR hsa.h PATHS ${ROCR_INC_DIR} "/opt/rocm" PATH_SUFFIXES include/hsa REQUIRED)
+    INCLUDE_DIRECTORIES(${ROCR_HDR})
+
+    # Search for ROCr library file in user defined locations
+    find_library(ROCR_LIB ${CORE_RUNTIME_TARGET} PATHS ${ROCR_LIB_DIR} "/opt/rocm" PATH_SUFFIXES lib lib64 REQUIRED)
+  endif()
+
+  include_directories(${GTEST_INCLUDE_DIRS} ./common)
+
+  # Collect testing framework source files
+  set (COMMON_SOURCE_FILES
+    common/main.cpp
+    common/CollectiveArgs.cpp
+    common/EnvVars.cpp
+    common/PrepDataFuncs.cpp
+    common/PtrUnion.cpp
+    common/TestBed.cpp
+    common/TestBedChild.cpp
+    )
+
+  # Collect source files for tests
  if(BUILD_ALLREDUCE_ONLY)
-    set(TEST_SOURCES_SINGLE_PROCESS
-      test_AllReduce.cpp
-      test_AllReduceAbort.cpp
-      test_AllReduceGroup.cpp
+    set(TEST_SOURCE_FILES
+      AllReduce_Clique.cpp
+      AllReduce_GroupCall.cpp
+      AllReduce_InPlace.cpp
+      AllReduce_ManagedMem.cpp
+      AllReduce_OutOfPlace.cpp
+      AllReduce_PreMultScalar.cpp
    )
  else()
-    # Collect source files for tests
-    set(TEST_SOURCES_SINGLE_PROCESS
-      test_AllGather.cpp
-      test_AllReduce.cpp
-      test_AllReduceGroup.cpp
-      test_Broadcast.cpp
-      test_Reduce.cpp
-      test_ReduceScatter.cpp
-      test_GroupCalls.cpp
-      test_CombinedCalls.cpp
-      test_AllReduceAbort.cpp
-      test_BroadcastAbort.cpp
-      test_Scatter.cpp
-      test_Gather.cpp
-      test_AllToAll.cpp
-      test_AllToAllv.cpp
-    )
+    set(TEST_SOURCE_FILES
+      #AllReduce
+      AllReduce_Clique.cpp
+      AllReduce_GroupCall.cpp
+      AllReduce_InPlace.cpp
+      AllReduce_ManagedMem.cpp
+      AllReduce_OutOfPlace.cpp
+      AllReduce_PreMultScalar.cpp
+      #AllGather
+      AllGather_InPlace.cpp
+      AllGather_ManagedMem.cpp
+      AllGather_OutOfPlace.cpp
+      #AllToAll
+      AllToAll_OutOfPlace.cpp
+      AllToAll_ManagedMem.cpp
+      #Broadcast
+      Broadcast_InPlace.cpp
+      Broadcast_ManagedMem.cpp
+      Broadcast_OutOfPlace.cpp
+      #Reduce
+      Reduce_InPlace.cpp
+      Reduce_ManagedMem.cpp
+      Reduce_OutOfPlace.cpp
+      #ReduceScatter
+      ReduceScatter_InPlace.cpp
+      ReduceScatter_ManagedMem.cpp
+      ReduceScatter_OutOfPlace.cpp
+      #Scatter
+      Scatter_InPlace.cpp
+      Scatter_ManagedMem.cpp
+      Scatter_OutOfPlace.cpp
+      #Gather
+      Gather_InPlace.cpp
+      Gather_ManagedMem.cpp
+      Gather_OutOfPlace.cpp
+      )
  endif()
-  if(BUILD_ALLREDUCE_ONLY)
-    set(TEST_SOURCES_MULTI_PROCESS
-      test_AllReduceMultiProcess.cpp
-      test_AllReduceGroupMultiProcess.cpp
-    )
-  else()
-    set(TEST_SOURCES_MULTI_PROCESS
-      test_AllGatherMultiProcess.cpp
-      test_AllReduceMultiProcess.cpp
-      test_AllReduceGroupMultiProcess.cpp
-      test_AllToAllMultiProcess.cpp
-      test_BroadcastMultiProcess.cpp
-      test_CombinedCallsMultiProcess.cpp
-      test_GatherMultiProcess.cpp
-      test_GroupCallsMultiProcess.cpp
-      test_ReduceMultiProcess.cpp
-      test_ReduceScatterMultiProcess.cpp
-      test_ScatterMultiProcess.cpp
-    )
-  endif()
-  add_executable(UnitTests ${TEST_SOURCES_SINGLE_PROCESS})
+
+  add_executable(UnitTests ${COMMON_SOURCE_FILES} ${TEST_SOURCE_FILES})
  target_include_directories(UnitTests PRIVATE ${ROCM_PATH} ${GTEST_INCLUDE_DIRS})
  target_link_libraries(UnitTests PRIVATE ${GTEST_BOTH_LIBRARIES})
-  target_link_libraries(UnitTests PRIVATE hip::host hip::device)
-
-  add_executable(UnitTestsMultiProcess ${TEST_SOURCES_MULTI_PROCESS})
-  target_include_directories(UnitTestsMultiProcess PRIVATE ${ROCM_PATH} ${GTEST_INCLUDE_DIRS})
-  target_link_libraries(UnitTestsMultiProcess PRIVATE ${GTEST_BOTH_LIBRARIES})
-  target_link_libraries(UnitTestsMultiProcess PRIVATE hip::host hip::device)
-
-  find_program( rocminfo_executable rocminfo )
-  execute_process(COMMAND bash "-c" "${rocminfo_executable} | grep 'Device Type' | grep GPU | wc -l | tr -d '\n'" OUTPUT_VARIABLE gtest_num_gpus)
-  if(${gtest_num_gpus} EQUAL "0" OR ${gtest_num_gpus} EQUAL "1")
-    set(gtest_num_gpus "2")
-  endif()
-  target_compile_options(UnitTests PRIVATE -DGTESTS_NUM_GPUS=${gtest_num_gpus})
+  target_link_libraries(UnitTests PRIVATE hip::host hip::device hsa-runtime64::hsa-runtime64)

  # UnitTests using static library of rccl requires passing rccl
  # through -l and -L instead of command line input.
  if(BUILD_STATIC)
    add_dependencies(UnitTests rccl)
    target_link_libraries(UnitTests PRIVATE dl rt numa -lrccl -L${CMAKE_BINARY_DIR} -lrocm_smi64 -L${ROCM_PATH}/rocm_smi/lib)
-    add_dependencies(UnitTestsMultiProcess rccl)
-    target_link_libraries(UnitTestsMultiProcess PRIVATE dl rt numa -lrccl -L${CMAKE_BINARY_DIR} -lrocm_smi64 -L${ROCM_PATH}/rocm_smi/lib)
  else()
    target_link_libraries(UnitTests PRIVATE rccl)
-    target_link_libraries(UnitTestsMultiProcess PRIVATE rt rccl)
  endif()
  # HIPCC adds /opt/rocm/lib as RPATH, even though the install process is supposed to
  # remove RPATH.  It also occurs before any user-specified rpath, which effectively overrides the user rpath.
@@ -91,10 +107,8 @@ if(BUILD_TESTS)
  if (CMAKE_INSTALL_PREFIX MATCHES "${ROCM_PATH}")
    # install_prefix/CMAKE_INSTALL_PREFIX was not explicitly specified, so look in build/release
    add_custom_command( TARGET UnitTests POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_BINARY_DIR}:${ROCM_PATH}/lib ${CMAKE_BINARY_DIR}/test/UnitTests)
-    add_custom_command( TARGET UnitTestsMultiProcess POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_BINARY_DIR}:${ROCM_PATH}/lib ${CMAKE_BINARY_DIR}/test/UnitTestsMultiProcess)
  else()
    add_custom_command( TARGET UnitTests POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_INSTALL_PREFIX}/lib:${ROCM_PATH}/lib ${CMAKE_INSTALL_PREFIX}/test/UnitTests)
-    add_custom_command( TARGET UnitTestsMultiProcess POST_BUILD COMMAND chrpath ARGS -r ${CMAKE_INSTALL_PREFIX}/lib:${ROCM_PATH}/lib ${CMAKE_INSTALL_PREFIX}/test/UnitTestsMultiProcess)
  endif()
 else()
  message("Not building unit tests")
@@ -0,0 +1,26 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "TestBed.hpp"
+
+namespace RcclUnitTesting
+{
+  TEST(Gather, InPlace)
+  {
+    TestBed testBed;
+
+    // Configuration
+    std::vector<ncclFunc_t>     const funcTypes      = {ncclCollGather};
+    std::vector<ncclDataType_t> const dataTypes      = {ncclInt8, ncclInt32, ncclInt64};
+    std::vector<ncclRedOp_t>    const redOps         = {ncclSum};
+    std::vector<int>            const roots          = {0};
+    std::vector<int>            const numElements    = {1048576, 53327, 1024};
+    std::vector<bool>           const inPlaceList    = {true};
+    std::vector<bool>           const managedMemList = {false};
+
+    testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
+    testBed.Finalize();
+  }
+}
@@ -0,0 +1,26 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "TestBed.hpp"
+
+namespace RcclUnitTesting
+{
+  TEST(Gather, ManagedMem)
+  {
+    TestBed testBed;
+
+    // Configuration
+    std::vector<ncclFunc_t>     const funcTypes      = {ncclCollGather};
+    std::vector<ncclDataType_t> const dataTypes      = {ncclUint8, ncclUint32, ncclUint64};
+    std::vector<ncclRedOp_t>    const redOps         = {ncclSum};
+    std::vector<int>            const roots          = {1};
+    std::vector<int>            const numElements    = {1048576, 53327, 1024};
+    std::vector<bool>           const inPlaceList    = {false};
+    std::vector<bool>           const managedMemList = {true};
+
+    testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
+    testBed.Finalize();
+  }
+}
@@ -0,0 +1,26 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "TestBed.hpp"
+
+namespace RcclUnitTesting
+{
+  TEST(Gather, OutOfPlace)
+  {
+    TestBed testBed;
+
+    // Configuration
+    std::vector<ncclFunc_t>     const funcTypes      = {ncclCollGather};
+    std::vector<ncclDataType_t> const dataTypes      = {ncclFloat32, ncclFloat64, ncclBfloat16};
+    std::vector<ncclRedOp_t>    const redOps         = {ncclSum};
+    std::vector<int>            const roots          = {1};
+    std::vector<int>            const numElements    = {1048576, 53327, 1024};
+    std::vector<bool>           const inPlaceList    = {false};
+    std::vector<bool>           const managedMemList = {false};
+
+    testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
+    testBed.Finalize();
+  }
+}
@@ -0,0 +1,26 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "TestBed.hpp"
+
+namespace RcclUnitTesting
+{
+  TEST(ReduceScatter, InPlace)
+  {
+    TestBed testBed;
+
+    // Configuration
+    std::vector<ncclFunc_t>     const funcTypes      = {ncclCollReduceScatter};
+    std::vector<ncclDataType_t> const dataTypes      = {ncclInt8, ncclInt32, ncclInt64};
+    std::vector<ncclRedOp_t>    const redOps         = {ncclSum, ncclProd};
+    std::vector<int>            const roots          = {0};
+    std::vector<int>            const numElements    = {1048576, 53327, 1024};
+    std::vector<bool>           const inPlaceList    = {true};
+    std::vector<bool>           const managedMemList = {false};
+
+    testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
+    testBed.Finalize();
+  }
+}
@@ -0,0 +1,26 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "TestBed.hpp"
+
+namespace RcclUnitTesting
+{
+  TEST(ReduceScatter, ManagedMem)
+  {
+    TestBed testBed;
+
+    // Configuration
+    std::vector<ncclFunc_t>     const funcTypes      = {ncclCollReduceScatter};
+    std::vector<ncclDataType_t> const dataTypes      = {ncclUint8, ncclUint32, ncclUint64};
+    std::vector<ncclRedOp_t>    const redOps         = {ncclSum};
+    std::vector<int>            const roots          = {0};
+    std::vector<int>            const numElements    = {1048576, 53327, 1024};
+    std::vector<bool>           const inPlaceList    = {false};
+    std::vector<bool>           const managedMemList = {true};
+
+    testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
+    testBed.Finalize();
+  }
+}
@@ -0,0 +1,26 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "TestBed.hpp"
+
+namespace RcclUnitTesting
+{
+  TEST(ReduceScatter, OutOfPlace)
+  {
+    TestBed testBed;
+
+    // Configuration
+    std::vector<ncclFunc_t>     const funcTypes      = {ncclCollReduceScatter};
+    std::vector<ncclDataType_t> const dataTypes      = {ncclFloat32, ncclFloat64, ncclBfloat16};
+    std::vector<ncclRedOp_t>    const redOps         = {ncclMin, ncclMax, ncclAvg};
+    std::vector<int>            const roots          = {0};
+    std::vector<int>            const numElements    = {1048576, 53327, 1024};
+    std::vector<bool>           const inPlaceList    = {false};
+    std::vector<bool>           const managedMemList = {false};
+
+    testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
+    testBed.Finalize();
+  }
+}
@@ -0,0 +1,26 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "TestBed.hpp"
+
+namespace RcclUnitTesting
+{
+  TEST(Reduce, InPlace)
+  {
+    TestBed testBed;
+
+    // Configuration
+    std::vector<ncclFunc_t>     const funcTypes      = {ncclCollReduce};
+    std::vector<ncclDataType_t> const dataTypes      = {ncclInt8, ncclInt32, ncclInt64};
+    std::vector<ncclRedOp_t>    const redOps         = {ncclSum, ncclProd};
+    std::vector<int>            const roots          = {0};
+    std::vector<int>            const numElements    = {1048576, 53327, 1024};
+    std::vector<bool>           const inPlaceList    = {true};
+    std::vector<bool>           const managedMemList = {false};
+
+    testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
+    testBed.Finalize();
+  }
+}
@@ -0,0 +1,26 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "TestBed.hpp"
+
+namespace RcclUnitTesting
+{
+  TEST(Reduce, ManagedMem)
+  {
+    TestBed testBed;
+
+    // Configuration
+    std::vector<ncclFunc_t>     const funcTypes      = {ncclCollReduce};
+    std::vector<ncclDataType_t> const dataTypes      = {ncclUint8, ncclUint32, ncclUint64};
+    std::vector<ncclRedOp_t>    const redOps         = {ncclSum};
+    std::vector<int>            const roots          = {0};
+    std::vector<int>            const numElements    = {1048576, 53327, 1024};
+    std::vector<bool>           const inPlaceList    = {false};
+    std::vector<bool>           const managedMemList = {true};
+
+    testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
+    testBed.Finalize();
+  }
+}
@@ -0,0 +1,26 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "TestBed.hpp"
+
+namespace RcclUnitTesting
+{
+  TEST(Reduce, OutOfPlace)
+  {
+    TestBed testBed;
+
+    // Configuration
+    std::vector<ncclFunc_t>     const funcTypes      = {ncclCollReduce};
+    std::vector<ncclDataType_t> const dataTypes      = {ncclFloat32, ncclFloat64, ncclBfloat16};
+    std::vector<ncclRedOp_t>    const redOps         = {ncclMin, ncclMax, ncclAvg};
+    std::vector<int>            const roots          = {1};
+    std::vector<int>            const numElements    = {1048576, 53327, 1024};
+    std::vector<bool>           const inPlaceList    = {false};
+    std::vector<bool>           const managedMemList = {false};
+
+    testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
+    testBed.Finalize();
+  }
+}
@@ -0,0 +1,26 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "TestBed.hpp"
+
+namespace RcclUnitTesting
+{
+  TEST(Scatter, InPlace)
+  {
+    TestBed testBed;
+
+    // Configuration
+    std::vector<ncclFunc_t>     const funcTypes      = {ncclCollScatter};
+    std::vector<ncclDataType_t> const dataTypes      = {ncclInt8, ncclInt32, ncclInt64};
+    std::vector<ncclRedOp_t>    const redOps         = {ncclSum};
+    std::vector<int>            const roots          = {0};
+    std::vector<int>            const numElements    = {1048576, 53327, 1024};
+    std::vector<bool>           const inPlaceList    = {true};
+    std::vector<bool>           const managedMemList = {false};
+
+    testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
+    testBed.Finalize();
+  }
+}
@@ -0,0 +1,26 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "TestBed.hpp"
+
+namespace RcclUnitTesting
+{
+  TEST(Scatter, ManagedMem)
+  {
+    TestBed testBed;
+
+    // Configuration
+    std::vector<ncclFunc_t>     const funcTypes      = {ncclCollScatter};
+    std::vector<ncclDataType_t> const dataTypes      = {ncclUint8, ncclUint32, ncclUint64};
+    std::vector<ncclRedOp_t>    const redOps         = {ncclSum};
+    std::vector<int>            const roots          = {0};
+    std::vector<int>            const numElements    = {1048576, 53327, 1024};
+    std::vector<bool>           const inPlaceList    = {false};
+    std::vector<bool>           const managedMemList = {true};
+
+    testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
+    testBed.Finalize();
+  }
+}
@@ -0,0 +1,26 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "TestBed.hpp"
+
+namespace RcclUnitTesting
+{
+  TEST(Scatter, OutOfPlace)
+  {
+    TestBed testBed;
+
+    // Configuration
+    std::vector<ncclFunc_t>     const funcTypes      = {ncclCollScatter};
+    std::vector<ncclDataType_t> const dataTypes      = {ncclFloat32, ncclFloat64, ncclBfloat16};
+    std::vector<ncclRedOp_t>    const redOps         = {ncclSum};
+    std::vector<int>            const roots          = {1};
+    std::vector<int>            const numElements    = {1048576, 53327, 1024};
+    std::vector<bool>           const inPlaceList    = {false};
+    std::vector<bool>           const managedMemList = {false};
+
+    testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, inPlaceList, managedMemList);
+    testBed.Finalize();
+  }
+}
@@ -1,63 +0,0 @@
-#ifndef TESTCHECKS_HPP
-#define TESTCHECKS_HPP
-
-#define HIP_CALL(x) ASSERT_EQ(x, hipSuccess)
-#define NCCL_CALL(x) ASSERT_EQ(x, ncclSuccess)
-
-#define SYSCHECK_TEST(call, name) do { \
-  int retval; \
-  SYSCHECKVAL_TEST(call, name, retval); \
-} while (false)
-
-#define SYSCHECKVAL_TEST(call, name, retval) do { \
-  SYSCHECKSYNC_TEST(call, name, retval); \
-  if (retval == -1) { \
-    printf("Call to %s failed : %s\n", name, strerror(errno)); \
-    fflush(stdout); \
-    return ncclSystemError; \
-  } \
-} while (false)
-
-#define SYSCHECK_GOTO_TEST(call, name, label) do { \
-  int retval; \
-  SYSCHECKVAL_GOTO_TEST(call, name, retval, label); \
-} while (false)
-
-#define SYSCHECKVAL_GOTO_TEST(call, name, retval, label) do { \
-  SYSCHECKSYNC_TEST(call, name, retval); \
-  if (retval == -1) { \
-    printf("Call to %s failed : %s\n", name, strerror(errno)); \
-    fflush(stdout); \
-    goto label; \
-  } \
-} while (false)
-
-#define SYSCHECKSYNC_TEST(call, name, retval) do { \
-  retval = call; \
-  if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
-  } else { \
-    break; \
-  } \
-} while(true)
-
-#define NCCLCHECK_BARRIER_TEST(call, name, rank) do { \
-  ncclResult_t retval; \
-  retval = call; \
-  if (retval != ncclSuccess) { \
-        printf("Rank %d call to %s failed : %s\n", rank, name, strerror(errno)); \
-        fflush(stdout); \
-        return; \
-  } \
-} while (false)
-
-#define NCCLCHECK_TEST(call, name) do { \
-  ncclResult_t retval; \
-  retval = call; \
-  if (retval != ncclSuccess) { \
-        printf("Call to %s failed : %s\n", name, strerror(errno)); \
-        fflush(stdout); \
-        return retval; \
-  } \
-} while (false)
-
-#endif
@@ -0,0 +1,282 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "CollectiveArgs.hpp"
+#include "gtest/gtest.h"
+
+namespace RcclUnitTesting
+{
+  ErrCode CollectiveArgs::SetArgs(int             const  globalRank,
+                                  int             const  totalRanks,
+                                  int             const  deviceId,
+                                  ncclFunc_t      const  funcType,
+                                  ncclDataType_t  const  dataType,
+                                  ncclRedOp_t     const  redOp,
+                                  int             const  root,
+                                  size_t          const  numInputElements,
+                                  size_t          const  numOutputElements,
+                                  ScalarTransport const  scalarTransport,
+                                  int             const  scalarMode)
+  {
+    // Free scalar based on previous scalarMode
+    if (scalarMode != -1)
+    {
+      if (this->localScalar.ptr != nullptr)
+      {
+        if (this->scalarMode == 0) this->localScalar.FreeGpuMem();
+        if (this->scalarMode == 1) hipHostFree(this->localScalar.ptr);
+      }
+    }
+
+    this->globalRank        = globalRank;
+    this->totalRanks        = totalRanks;
+    this->deviceId          = deviceId;
+    this->funcType          = funcType;
+    this->dataType          = dataType;
+    this->redOp             = redOp;
+    this->root              = root;
+    this->numInputElements  = numInputElements;
+    this->numOutputElements = numOutputElements;
+    this->scalarTransport   = scalarTransport;
+    this->scalarMode        = scalarMode;
+
+    if (scalarMode != -1)
+    {
+      size_t const numBytes = DataTypeToBytes(dataType);
+      if (scalarMode == ncclScalarDevice)
+      {
+        CHECK_CALL(this->localScalar.AllocateGpuMem(numBytes));
+        CHECK_HIP(hipMemcpy(this->localScalar.ptr, scalarTransport.ptr + (globalRank * numBytes),
+                            numBytes, hipMemcpyHostToDevice));
+      }
+      else if (scalarMode == ncclScalarHostImmediate)
+      {
+        CHECK_HIP(hipHostMalloc(&this->localScalar.ptr, numBytes, 0));
+        memcpy(this->localScalar.ptr, scalarTransport.ptr + (globalRank * numBytes), numBytes);
+      }
+    }
+    return TEST_SUCCESS;
+  }
+
+  ErrCode CollectiveArgs::AllocateMem(bool   const inPlace,
+                                      bool   const useManagedMem)
+  {
+    this->numInputBytesAllocated     = this->numInputElements * DataTypeToBytes(this->dataType);
+    this->numOutputBytesAllocated    = this->numOutputElements * DataTypeToBytes(this->dataType);
+    this->numInputElementsAllocated  = this->numInputElements;
+    this->numOutputElementsAllocated = this->numOutputElements;
+    this->inPlace                    = inPlace;
+    this->useManagedMem              = useManagedMem;
+
+    if (hipSetDevice(this->deviceId) != hipSuccess)
+    {
+      ERROR("Unable to call hipSetDevice to set to GPU %d\n", this->deviceId);
+      return TEST_FAIL;
+    }
+
+    if (inPlace)
+    {
+      if (this->funcType == ncclCollScatter)
+      {
+        CHECK_CALL(this->inputGpu.AllocateGpuMem(this->numInputBytesAllocated, useManagedMem));
+        this->outputGpu.Attach(this->inputGpu.U1 + (this->globalRank  * this->numOutputBytesAllocated));
+      }
+      else if (this->funcType == ncclCollGather)
+      {
+        CHECK_CALL(this->outputGpu.AllocateGpuMem(this->numOutputBytesAllocated, useManagedMem));
+        this->inputGpu.Attach(this->outputGpu.U1 + (this->globalRank * this->numInputBytesAllocated));
+      }
+      else
+      {
+        size_t const numBytes = std::max(this->numInputBytesAllocated, this->numOutputBytesAllocated);
+        CHECK_CALL(this->inputGpu.AllocateGpuMem(numBytes, useManagedMem));
+        this->outputGpu.Attach(this->inputGpu.ptr);
+      }
+      CHECK_CALL(this->expected.AllocateCpuMem(this->numOutputBytesAllocated));
+    }
+    else
+    {
+      CHECK_CALL(this->inputGpu.AllocateGpuMem(this->numInputBytesAllocated, useManagedMem));
+      CHECK_CALL(this->outputGpu.AllocateGpuMem(this->numOutputBytesAllocated, useManagedMem));
+      CHECK_CALL(this->expected.AllocateCpuMem(this->numOutputBytesAllocated));
+    }
+    CHECK_CALL(this->outputCpu.AllocateCpuMem(this->numOutputBytesAllocated));
+    return TEST_SUCCESS;
+  }
+
+  ErrCode CollectiveArgs::PrepareData(CollFuncPtr const prepareDataFunc)
+  {
+    CollFuncPtr prepFunc = (prepareDataFunc == nullptr ? DefaultPrepareDataFunc : prepareDataFunc);
+    return prepFunc(*this);
+  }
+
+  ErrCode CollectiveArgs::ValidateResults()
+  {
+    // Ignore non-root outputs for collectives with a root
+    if (CollectiveArgs::UsesRoot(this->funcType) && this->root != this->globalRank) return TEST_SUCCESS;
+
+    size_t const numOutputBytes = (this->numOutputElements * DataTypeToBytes(this->dataType));
+
+    CHECK_HIP(hipMemcpy(this->outputCpu.ptr, this->outputGpu.ptr, numOutputBytes, hipMemcpyDeviceToHost));
+
+    bool isMatch = true;
+    CHECK_CALL(this->outputCpu.IsEqual(this->dataType,
+                                       this->numOutputElements,
+                                       this->expected,
+                                       true,
+                                       isMatch));
+    if (!isMatch) ERROR("Mismatch for %s\n", this->GetDescription().c_str());
+    return isMatch ? TEST_SUCCESS : TEST_FAIL;
+  }
+
+  ErrCode CollectiveArgs::DeallocateMem()
+  {
+    // If in-place, either only inputGpu or outputGpu was allocated
+    if (this->inPlace)
+    {
+      if (this->funcType == ncclCollGather)
+        this->outputGpu.FreeGpuMem();
+      else
+        this->inputGpu.FreeGpuMem();
+    }
+    else
+    {
+      this->inputGpu.FreeGpuMem();
+      this->outputGpu.FreeGpuMem();
+    }
+
+    this->outputCpu.FreeCpuMem();
+    this->expected.FreeCpuMem();
+
+    if (this->localScalar.ptr != nullptr)
+    {
+      if (this->scalarMode == 0) this->localScalar.FreeGpuMem();
+      if (this->scalarMode == 1) CHECK_HIP(hipHostFree(this->localScalar.ptr));
+    }
+    return TEST_SUCCESS;
+  }
+
+  std::string CollectiveArgs::GetDescription() const
+  {
+    std::stringstream ss;
+
+    ss << "(Rank " << this->globalRank << ") ";
+    switch (this->funcType)
+    {
+    case ncclCollBroadcast:     ss << "ncclBroadcast";     break;
+    case ncclCollReduce:        ss << "ncclReduce";        break;
+    case ncclCollAllGather:     ss << "ncclAllGather";     break;
+    case ncclCollReduceScatter: ss << "ncclReduceScatter"; break;
+    case ncclCollAllReduce:     ss << "ncclAllReduce";     break;
+    case ncclCollGather:        ss << "ncclGather";        break;
+    case ncclCollScatter:       ss << "ncclScatter";       break;
+    case ncclCollAllToAll:      ss << "ncclAllToAll";      break;
+    case ncclCollSend:          ss << "ncclSend";          break;
+    case ncclCollRecv:          ss << "ncclRevv";          break;
+    default:                    ss << "[Unknown]";         break;
+    }
+
+    ss << " " << ncclDataTypeNames[this->dataType] << " ";
+    if (this->funcType == ncclCollReduce ||
+        this->funcType == ncclCollReduceScatter ||
+        this->funcType == ncclCollAllReduce)
+    {
+      if (this->redOp < ncclNumOps)
+      {
+        ss << ncclRedOpNames[this->redOp] << " ";
+      }
+      else
+      {
+        ss << "CustomScalar ";
+        PtrUnion scalarsPerRank;
+        scalarsPerRank.Attach(scalarsPerRank.ptr);
+        switch (this->dataType)
+        {
+        case ncclInt8:     ss << scalarsPerRank.I1[this->globalRank]; break;
+        case ncclUint8:    ss << scalarsPerRank.U1[this->globalRank]; break;
+        case ncclInt32:    ss << scalarsPerRank.I4[this->globalRank]; break;
+        case ncclUint32:   ss << scalarsPerRank.U4[this->globalRank]; break;
+        case ncclInt64:    ss << scalarsPerRank.I8[this->globalRank]; break;
+        case ncclUint64:   ss << scalarsPerRank.U8[this->globalRank]; break;
+        case ncclFloat32:  ss << scalarsPerRank.F4[this->globalRank]; break;
+        case ncclFloat64:  ss << scalarsPerRank.F8[this->globalRank]; break;
+        case ncclBfloat16: ss << scalarsPerRank.B2[this->globalRank]; break;
+        default:           ss << "(UNKNOWN)";
+        }
+        ss << " ";
+      }
+    }
+
+    if (this->funcType == ncclCollBroadcast ||
+        this->funcType == ncclCollReduce ||
+        this->funcType == ncclCollGather ||
+        this->funcType == ncclCollScatter)
+    {
+      ss << "Root " << this->root << " ";
+    }
+
+    if (this->funcType == ncclCollSend ||
+        this->funcType == ncclCollRecv)
+    {
+      ss << "Peer " << this->root << " ";
+    }
+
+    ss << "#In: " << this->numInputElements;
+    ss << " #Out: " << this->numOutputElements;
+
+    return ss.str();
+  }
+
+  void CollectiveArgs::GetNumElementsForFuncType(ncclFunc_t const funcType,
+                                                 int        const N,
+                                                 int        const totalRanks,
+                                                 int*             numInputElements,
+                                                 int*             numOutputElements)
+  {
+    switch (funcType)
+    {
+    case ncclCollBroadcast:
+    case ncclCollReduce:
+    case ncclCollAllReduce:
+      *numInputElements  = N;
+      *numOutputElements = N;
+      break;
+    case ncclCollGather:
+    case ncclCollAllGather:
+      *numInputElements  = N;
+      *numOutputElements = totalRanks * N;
+      break;
+    case ncclCollScatter:
+    case ncclCollReduceScatter:
+      *numInputElements  = totalRanks * N;
+      *numOutputElements = N;
+      break;
+    case ncclCollAllToAll:
+      *numInputElements = totalRanks * N;
+      *numOutputElements = totalRanks * N;
+      break;
+    default:
+      *numInputElements = N;
+      *numOutputElements = N;
+      break;
+    }
+  }
+
+  bool CollectiveArgs::UsesReduce(ncclFunc_t const funcType)
+  {
+    return (funcType == ncclCollReduce    ||
+            funcType == ncclCollAllReduce ||
+            funcType == ncclCollReduceScatter);
+  }
+
+  bool CollectiveArgs::UsesRoot(ncclFunc_t const funcType)
+  {
+    return (funcType == ncclCollBroadcast ||
+            funcType == ncclCollReduce    ||
+            funcType == ncclCollGather    ||
+            funcType == ncclCollScatter);
+  }
+}
@@ -0,0 +1,151 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+b
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#pragma once
+#include "PtrUnion.hpp"
+#include "PrepDataFuncs.hpp"
+#include "rccl.h"
+
+namespace RcclUnitTesting
+{
+  // Enumeration of all collective functions currently supported
+  typedef enum
+  {
+    ncclCollBroadcast = 0,
+    ncclCollReduce,
+    ncclCollAllGather,
+    ncclCollReduceScatter,
+    ncclCollAllReduce,
+    ncclCollGather,
+    ncclCollScatter,
+    ncclCollAllToAll,
+    ncclCollSend,
+    ncclCollRecv,
+    ncclNumFuncs
+  } ncclFunc_t;
+
+  char const ncclFuncNames[ncclNumFuncs][32] =
+  {
+    "Broadcast",
+    "Reduce",
+    "AllGather",
+    "ReduceScatter",
+    "AllReduce",
+    "Gather",
+    "Scatter",
+    "AllToAll",
+    "Send",
+    "Recv"
+  };
+
+  char const ncclDataTypeNames[ncclNumTypes][32] =
+  {
+    "ncclInt8",
+    "ncclUint8",
+    "ncclInt32",
+    "ncclUint32",
+    "ncclInt64",
+    "ncclUint64",
+    "ncclFloat16",
+    "ncclFloat32",
+    "ncclFloat64",
+    "ncclBfloat16"
+  };
+
+  char const ncclRedOpNames[ncclNumOps][32] =
+  {
+    "sum",
+    "prod",
+    "max",
+    "min",
+    "avg"
+  };
+
+  class CollectiveArgs;
+
+  #define MAX_RANKS 32
+  struct ScalarTransport
+  {
+    char ptr[MAX_RANKS * sizeof(double)];
+  };
+
+  // Function pointer for functions that operate on CollectiveArgs
+  // e.g. For filling input / computing expected results
+  typedef ErrCode (*CollFuncPtr)(CollectiveArgs &);
+
+  class CollectiveArgs
+  {
+  public:
+    // Arguments to execute
+    int             globalRank;
+    int             totalRanks;
+    int             deviceId;
+    ncclFunc_t      funcType;
+    ncclDataType_t  dataType;
+    ncclRedOp_t     redOp;
+    int             root;              // Used as "peer" for Send/Recv
+    size_t          numInputElements;
+    size_t          numOutputElements;
+    ScalarTransport scalarTransport;   // Used for custom reduction operators
+    PtrUnion        localScalar;
+    int             scalarMode;        // -1 if scalar not used
+
+    // Data
+    PtrUnion       inputGpu;
+    PtrUnion       outputGpu;
+    PtrUnion       outputCpu;
+    PtrUnion       expected;
+    bool           inPlace;
+    bool           useManagedMem;
+    size_t         numInputBytesAllocated;
+    size_t         numOutputBytesAllocated;
+    size_t         numInputElementsAllocated;
+    size_t         numOutputElementsAllocated;
+
+    // Set collective arguments
+    ErrCode SetArgs(int             const globalRank,
+                    int             const totalRanks,
+                    int             const deviceId,
+                    ncclFunc_t      const funcType,
+                    ncclDataType_t  const dataType,
+                    ncclRedOp_t     const redOp,
+                    int             const root,
+                    size_t          const numInputElements,
+                    size_t          const numOutputElements,
+                    ScalarTransport const scalarsPerRank,
+                    int             const scalarMode = -1);
+
+    // Allocates GPU memory for input/output and CPU memory for expected
+    // When inPlace is true, input and output share the same memory
+    ErrCode AllocateMem(bool   const inPlace,
+                        bool   const useManagedMem);
+
+    // Execute the provided data preparation function to fill input and compute expected results
+    ErrCode PrepareData(CollFuncPtr const prepareDataFunc);
+
+    // Compare outputs to expected values
+    ErrCode ValidateResults();
+
+    // Deallocate memory
+    ErrCode DeallocateMem();
+
+    // Provide a description for the current collective arguments
+    std::string GetDescription() const;
+
+    // Returns the number of inputs/outputs based on collective function type
+    static void GetNumElementsForFuncType(ncclFunc_t const funcType,
+                                          int        const N,
+                                          int        const totalRanks,
+                                          int*             numInputElements,
+                                          int*             numOutputElements);
+
+    // Returns true if collective function performs reduction
+    static bool UsesReduce(ncclFunc_t const funcType);
+
+    // Returns true if collective function utilizes a root rank
+    static bool UsesRoot(ncclFunc_t const funcType);
+  };
+}
@@ -0,0 +1,161 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "EnvVars.hpp"
+#include "CollectiveArgs.hpp"
+#include <cstdlib>
+
+namespace RcclUnitTesting
+{
+  int const UT_SINGLE_PROCESS = (1<<0);
+  int const UT_MULTI_PROCESS  = (1<<1);
+
+  hsa_status_t CountGpus(hsa_agent_t agent, void* data)
+  {
+    int* currCount = (int*)data;
+    hsa_device_type_t device;
+    hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device);
+    if (device == HSA_DEVICE_TYPE_GPU)
+      *currCount = *currCount + 1;
+    return HSA_STATUS_SUCCESS;
+  }
+
+  EnvVars::EnvVars()
+  {
+    // Collect number of GPUs available
+    // NOTE: Cannot use HIP call prior to launching child processes via fork so use HSA
+    int numDevicesAvailable = 0;
+    hsa_init();
+    hsa_iterate_agents(CountGpus, &numDevicesAvailable);
+    hsa_shut_down();
+
+    showNames   = GetEnvVar("UT_SHOW_NAMES"  , 1);
+    minGpus     = GetEnvVar("UT_MIN_GPUS"    , 2);
+    maxGpus     = GetEnvVar("UT_MAX_GPUS"    , numDevicesAvailable);
+    processMask = GetEnvVar("UT_PROCESS_MASK", UT_SINGLE_PROCESS | UT_MULTI_PROCESS);
+    verbose     = GetEnvVar("UT_VERBOSE"     , 0);
+    printValues = GetEnvVar("UT_PRINT_VALUES", 0);
+
+    // Limit number of supported reduction operators to just ncclSum if only allReduce is built
+#ifdef BUILD_ALLREDUCE_ONLY
+    int numOps = 1;
+#else
+    int numOps = ncclNumOps;
+#endif
+    std::vector<std::string> redOpStrings = GetEnvVarsList("UT_REDOPS");
+    for (auto s : redOpStrings)
+    {
+      for (int i = 0; i < numOps; ++i)
+      {
+        if (!strcmp(s.c_str(), ncclRedOpNames[i]))
+        {
+          redOps.push_back((ncclRedOp_t)i);
+          break;
+        }
+      }
+    }
+    // Default back to all ops if no strings are found
+    if (redOps.empty())
+    {
+      for (int i = 0; i < numOps; i++)
+        redOps.push_back((ncclRedOp_t)i);
+    }
+
+    // Limit number of supported datatypes if only allReduce is built
+    std::vector<std::string> dtStrings = GetEnvVarsList("UT_DATATYPES");
+    for (auto s : dtStrings)
+    {
+      for (int i = 0; i < ncclNumTypes; ++i)
+      {
+        if (!strcmp(s.c_str(), ncclDataTypeNames[i]))
+        {
+#ifdef BUILD_ALLREDUCE_ONLY
+          if (i == ncclFloat32)
+#endif
+          {
+            dataTypes.push_back((ncclDataType_t)i);
+          }
+        }
+      }
+    }
+
+    // Default option if no valid datatypes are found in env var
+    if (dataTypes.empty())
+    {
+      dataTypes.push_back(ncclFloat32);
+      // Skip all but 32-bit floats if only AllReduce is being built
+#ifndef BUILD_ALLREDUCE_ONLY
+      dataTypes.push_back(ncclInt8);
+      dataTypes.push_back(ncclUint8);
+      dataTypes.push_back(ncclInt32);
+      dataTypes.push_back(ncclUint32);
+      dataTypes.push_back(ncclInt64);
+      dataTypes.push_back(ncclUint64);
+      // Half-precision floats disabled due to lack of host-side support
+      // dataTypes.push_back(ncclFloat16);
+      dataTypes.push_back(ncclFloat32);
+      dataTypes.push_back(ncclFloat64);
+      dataTypes.push_back(ncclBfloat16);
+#endif
+    }
+  }
+
+  std::vector<ncclRedOp_t> const& EnvVars::GetAllSupportedRedOps()
+  {
+    return redOps;
+  }
+
+  std::vector<ncclDataType_t> const& EnvVars::GetAllSupportedDataTypes()
+  {
+    return dataTypes;
+  }
+
+  int EnvVars::GetEnvVar(std::string const varname, int defaultValue)
+  {
+    if (getenv(varname.c_str()))
+      return atoi(getenv(varname.c_str()));
+    return defaultValue;
+  };
+
+  std::vector<std::string> EnvVars::GetEnvVarsList(std::string const varname)
+  {
+    std::vector<std::string> result;
+    if (getenv(varname.c_str()))
+    {
+      char* token = strtok(getenv(varname.c_str()), ",;");
+      while (token != NULL)
+      {
+        result.push_back(token);
+        token = strtok(NULL, ",;");
+      }
+    }
+    return result;
+  }
+
+  void EnvVars::ShowConfig()
+  {
+    std::vector<std::pair<std::string, std::string>> supported =
+      {
+        std::make_pair("UT_SHOW_NAMES"  , "Show test case names"),
+        std::make_pair("UT_MIN_GPUS"    , "Minimum number of GPUs to use"),
+        std::make_pair("UT_MAX_GPUS"    , "Maximum number of GPUs to use"),
+        std::make_pair("UT_PROCESS_MASK", "Whether to run single/multi process"),
+        std::make_pair("UT_VERBOSE"     , "Show verbose unit test output"),
+        std::make_pair("UT_REDOPS"      , "List of reduction ops to test"),
+        std::make_pair("UT_DATATYPES"   , "List of datatypes to test"),
+        std::make_pair("UT_PRINT_VALUES", "Print array values (# of values to print, < 0 for all)")
+      };
+
+    printf("================================================================================\n");
+    printf(" Environment variables:\n");
+    for (auto p : supported)
+    {
+      printf(" - %-20s %-40s %s\n", p.first.c_str(), p.second.c_str(),
+             getenv(p.first.c_str()) ? getenv(p.first.c_str()) : "<unset>");
+    }
+    printf("================================================================================\n");
+  }
+}
@@ -0,0 +1,44 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#pragma once
+#include <hsa/hsa.h>
+#include <vector>
+#include "rccl.h"
+
+namespace RcclUnitTesting
+{
+  // Helper function to count the number of GPUs on system
+  static hsa_status_t CountGpus(hsa_agent_t agent, void* data);
+
+  // Helper class to track environment variables that affect the unit tests
+  class EnvVars
+  {
+  public:
+    bool showNames;   // List test case names during run        [UT_SHOW_NAMES]
+    int  minGpus;     // Set the minimum number of GPUs to use  [UT_MIN_GPUS]
+    int  maxGpus;     // Set the maximum number of GPUs to use  [UT_MAX_GPUS]
+    int  processMask; // Filter single/multi process            [UT_PROCESS_MASK]
+    bool verbose;     // Show verbose TestBed output for debug  [UT_VERBOSE]
+    int  printValues; // Print out input/output/expected arrays [UT_PRINT_VALUES]
+
+    // Constructor that parses and collects environment variables
+    EnvVars();
+
+    std::vector<ncclRedOp_t>    const& GetAllSupportedRedOps();
+    std::vector<ncclDataType_t> const& GetAllSupportedDataTypes();
+
+    static void ShowConfig();
+
+  protected:
+    std::vector<ncclRedOp_t>    redOps;    // Supported reduction ops [UT_REDOPS]
+    std::vector<ncclDataType_t> dataTypes; // Support datatypes       [UT_DATATYPES]
+
+    // Helper functions to parse environment variables
+    int GetEnvVar(std::string const varname, int defaultValue);
+    std::vector<std::string> GetEnvVarsList(std::string const varname);
+  };
+}
@@ -0,0 +1,38 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#pragma once
+
+namespace RcclUnitTesting
+{
+  typedef enum
+  {
+    TEST_SUCCESS = 0,
+    TEST_FAIL    = 1
+  } ErrCode;
+
+#define ERROR(...) printf("\033[0;31m" "[ ERROR    ] " "\033[0m" __VA_ARGS__)
+#define INFO(...)  printf("[ INFO     ] " __VA_ARGS__)
+
+#define CHECK_CALL(func)                              \
+  {                                                   \
+    ErrCode status = func;                            \
+    if (status != TEST_SUCCESS)                       \
+    {                                                 \
+      ERROR("Error in call %s\n", #func);             \
+      return status;                                  \
+    }                                                 \
+  }
+
+#define CHECK_HIP(func)                                                 \
+  {                                                                     \
+    hipError_t error = (func);                                          \
+    if (error != hipSuccess)                                            \
+    {                                                                   \
+      fprintf(stderr, "\033[0;33" "[ ERROR    ] HIP error: %s\n" "\033[m", hipGetErrorString(error)); \
+      return TEST_FAIL;                                                 \
+    }                                                                   \
+  }
+}
@@ -0,0 +1,342 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "CollectiveArgs.hpp"
+#include "PrepDataFuncs.hpp"
+#include <cstdio>
+#include <hip/hip_runtime.h>
+
+namespace RcclUnitTesting
+{
+  ErrCode DefaultPrepareDataFunc(CollectiveArgs &collArgs)
+  {
+    switch (collArgs.funcType)
+    {
+    case ncclCollBroadcast:     return DefaultPrepData_Broadcast(collArgs);
+    case ncclCollReduce:        return DefaultPrepData_Reduce(collArgs, false);
+    case ncclCollAllGather:     return DefaultPrepData_Gather(collArgs, true);
+    case ncclCollReduceScatter: return DefaultPrepData_ReduceScatter(collArgs);
+    case ncclCollAllReduce:     return DefaultPrepData_Reduce(collArgs, true);
+    case ncclCollGather:        return DefaultPrepData_Gather(collArgs, false);
+    case ncclCollScatter:       return DefaultPrepData_Scatter(collArgs);
+    case ncclCollAllToAll:      return DefaultPrepData_AllToAll(collArgs);
+      //case ncclCollSendRecv:      return DefaultPrepData_SendRecv(collArgs);
+    default:
+      ERROR("Unknown func type %d\n", collArgs.funcType);
+      return TEST_FAIL;
+    }
+  }
+
+  ErrCode CheckAllocation(CollectiveArgs const& collArgs)
+  {
+    if (collArgs.numInputElements > collArgs.numInputElementsAllocated)
+    {
+      ERROR("Number of input elements (%lu) exceeds the number of allocated input elements (%lu)\n",
+            collArgs.numInputElements, collArgs.numInputElementsAllocated);
+      return TEST_FAIL;
+    }
+
+    if (collArgs.numOutputElements > collArgs.numOutputElementsAllocated)
+    {
+      ERROR("Number of output elements (%lu) exceeds the number of allocated output elements (%lu)\n",
+            collArgs.numOutputElements, collArgs.numOutputElementsAllocated);
+      return TEST_FAIL;
+    }
+    return TEST_SUCCESS;
+  }
+
+  ErrCode DefaultPrepData_Broadcast(CollectiveArgs &collArgs)
+  {
+    CHECK_CALL(CheckAllocation(collArgs));
+    if (collArgs.numInputElements != collArgs.numOutputElements)
+    {
+      ERROR("Number of input elements must match number of output elements for Broadcast\n");
+      return TEST_FAIL;
+    }
+
+    size_t const numBytes = collArgs.numInputElements * DataTypeToBytes(collArgs.dataType);
+
+    // Clear output for all ranks (done before filling input in case of in-place)
+    CHECK_CALL(collArgs.outputGpu.ClearGpuMem(numBytes));
+
+    // Only root needs input pattern
+    if (collArgs.globalRank == collArgs.root)
+      CHECK_CALL(collArgs.inputGpu.FillPattern(collArgs.dataType,
+                                               collArgs.numInputElements,
+                                               collArgs.root, true));
+
+    // Otherwise all other ranks expected output is the same as input of root
+    return collArgs.expected.FillPattern(collArgs.dataType,
+                                         collArgs.numInputElements,
+                                         collArgs.root,
+                                         false);
+  }
+
+  ErrCode DefaultPrepData_Reduce(CollectiveArgs &collArgs, bool const isAllReduce)
+  {
+    CHECK_CALL(CheckAllocation(collArgs));
+    if (collArgs.numInputElements != collArgs.numOutputElements)
+    {
+      ERROR("Number of input elements must match number of output elements for Reduce\n");
+      return TEST_FAIL;
+    }
+
+    size_t const numBytes = collArgs.numInputElements * DataTypeToBytes(collArgs.dataType);
+
+    // Clear output for all ranks (done before filling input in case of in-place)
+    CHECK_CALL(collArgs.outputGpu.ClearGpuMem(numBytes));
+
+    // Clear expected buffer for holding reduction
+    PtrUnion result;
+    CHECK_CALL(result.Attach(collArgs.expected));
+    CHECK_CALL(result.ClearCpuMem(numBytes));
+
+    // If average or custom reduction operator is used, perform a summation instead
+    ncclRedOp_t const tempOp = (collArgs.redOp >= ncclAvg ? ncclSum : collArgs.redOp);
+
+    // Loop over each rank and generate their input into a temp buffer, then reduce
+    PtrUnion scalarsPerRank;
+    scalarsPerRank.Attach(collArgs.scalarTransport.ptr);
+
+    PtrUnion tempInputCpu;
+    CHECK_CALL(tempInputCpu.Attach(collArgs.outputCpu));
+    for (int rank = 0; rank < collArgs.totalRanks; ++rank)
+    {
+      // Generate temporary input for this rank
+      CHECK_CALL(tempInputCpu.FillPattern(collArgs.dataType, collArgs.numInputElements, rank, false));
+
+      // Copy the pre-scaled input into GPU memory for the correct rank
+      if (rank == collArgs.globalRank)
+      {
+        CHECK_HIP(hipMemcpy(collArgs.inputGpu.ptr, tempInputCpu.ptr, numBytes, hipMemcpyHostToDevice));
+      }
+
+      // Scale the temporary input by local scalar for this rank
+      // (Used by custom reduction ops)
+      if (collArgs.scalarMode >= 0)
+      {
+        CHECK_CALL(tempInputCpu.Scale(collArgs.dataType, collArgs.numInputElements,
+                                      scalarsPerRank, rank));
+      }
+
+      // Any rank that requires output reduces the scaled-inputs
+      if (isAllReduce || collArgs.root == collArgs.globalRank)
+      {
+        if (rank == 0)
+        {
+          memcpy(result.ptr, tempInputCpu.ptr, numBytes);
+        }
+        else
+        {
+          CHECK_CALL(result.Reduce(collArgs.dataType, collArgs.numInputElements,
+                                   tempInputCpu, tempOp));
+        }
+      }
+    }
+
+    // Perform averaging if necessary
+    if (collArgs.redOp == ncclAvg && (isAllReduce || collArgs.root == collArgs.globalRank))
+    {
+      CHECK_CALL(result.DivideByInt(collArgs.dataType, collArgs.numInputElements, collArgs.totalRanks));
+    }
+    return TEST_SUCCESS;
+  }
+
+  ErrCode DefaultPrepData_Gather(CollectiveArgs &collArgs, bool const isAllGather)
+  {
+    CHECK_CALL(CheckAllocation(collArgs));
+    if (collArgs.totalRanks * collArgs.numInputElements != collArgs.numOutputElements)
+    {
+      ERROR("# of output elements must be total ranks * # input elements for AllGather\n");
+      return TEST_FAIL;
+    }
+
+    // Clear output for all ranks (done before filling input in case of in-place)
+    size_t const numInputBytes = collArgs.numInputElements * DataTypeToBytes(collArgs.dataType);
+    size_t const numOutputBytes = collArgs.numOutputElements * DataTypeToBytes(collArgs.dataType);
+    CHECK_CALL(collArgs.inputGpu.ClearGpuMem(numInputBytes));
+    CHECK_CALL(collArgs.outputGpu.ClearGpuMem(numOutputBytes));
+
+    PtrUnion result;
+    CHECK_CALL(result.Attach(collArgs.expected.ptr));
+    CHECK_CALL(result.ClearCpuMem(numOutputBytes));
+
+    // Use outputCpu buffer to store temporary input
+    PtrUnion tempInputCpu;
+    CHECK_CALL(tempInputCpu.Attach(collArgs.outputCpu.ptr));
+
+    for (int rank = 0; rank < collArgs.totalRanks; ++rank)
+    {
+      CHECK_CALL(tempInputCpu.FillPattern(collArgs.dataType, collArgs.numInputElements, rank, false));
+      if (rank == collArgs.globalRank)
+      {
+        CHECK_HIP(hipMemcpy(collArgs.inputGpu.ptr, tempInputCpu.ptr, numInputBytes, hipMemcpyHostToDevice));
+      }
+      if (isAllGather || collArgs.root == collArgs.globalRank)
+      {
+        memcpy(result.I1 + (rank * numInputBytes), tempInputCpu.ptr, numInputBytes);
+      }
+    }
+    return TEST_SUCCESS;
+  }
+
+  ErrCode DefaultPrepData_ReduceScatter(CollectiveArgs &collArgs)
+  {
+    CHECK_CALL(CheckAllocation(collArgs));
+    if (collArgs.numInputElements != collArgs.numOutputElements * collArgs.totalRanks)
+    {
+      ERROR("# of input elements must be total ranks * # output elements for ReduceScatter\n");
+      return TEST_FAIL;
+    }
+
+    size_t const numInputBytes  = collArgs.numInputElements * DataTypeToBytes(collArgs.dataType);
+    size_t const numOutputBytes = collArgs.numOutputElements * DataTypeToBytes(collArgs.dataType);
+
+    // Clear output for all ranks (done before filling input in case of in-place)
+    CHECK_CALL(collArgs.outputGpu.ClearGpuMem(numOutputBytes));
+
+    PtrUnion tempInputCpu;
+    PtrUnion tempResultCpu;
+
+    CHECK_CALL(tempInputCpu.AllocateCpuMem(numInputBytes));
+    CHECK_CALL(tempResultCpu.AllocateCpuMem(numInputBytes));
+    CHECK_CALL(tempResultCpu.ClearCpuMem(numInputBytes));
+
+    // If average or custom reduction operator is used, perform a summation instead
+    ncclRedOp_t const tempOp = (collArgs.redOp >= ncclAvg ? ncclSum : collArgs.redOp);
+
+    // Loop over each rank and generate the input / scale / reduce
+    PtrUnion scalarsPerRank;
+    scalarsPerRank.Attach(collArgs.scalarTransport.ptr);
+    for (int rank = 0; rank < collArgs.totalRanks; ++rank)
+    {
+      CHECK_CALL(tempInputCpu.FillPattern(collArgs.dataType, collArgs.numInputElements, rank, false));
+
+      if (rank == collArgs.globalRank)
+      {
+        if (hipMemcpy(collArgs.inputGpu.ptr, tempInputCpu.ptr, numInputBytes, hipMemcpyHostToDevice) != hipSuccess)
+        {
+          ERROR("hipMemcpy to input failed\n");
+          CHECK_CALL(tempInputCpu.FreeCpuMem());
+          CHECK_CALL(tempResultCpu.FreeCpuMem());
+          return TEST_FAIL;
+        }
+      }
+
+      // Scale the temporary input by local scalar for this rank
+      // (Used by custom reduction ops)
+      if (collArgs.scalarMode >= 0)
+      {
+        CHECK_CALL(tempInputCpu.Scale(collArgs.dataType, collArgs.numInputElements,
+                                      scalarsPerRank, rank));
+      }
+
+      if (rank == 0)
+      {
+        memcpy(tempResultCpu.ptr, tempInputCpu.ptr, numInputBytes);
+      }
+      else
+      {
+        CHECK_CALL(tempResultCpu.Reduce(collArgs.dataType, collArgs.numInputElements,
+                                        tempInputCpu, tempOp));
+      }
+    }
+
+    // Perform averaging if necessary
+    if (collArgs.redOp == ncclAvg)
+    {
+      CHECK_CALL(tempResultCpu.DivideByInt(collArgs.dataType, collArgs.numInputElements, collArgs.totalRanks));
+    }
+
+    // Copy over portion of result
+    memcpy(collArgs.expected.I1,
+           tempResultCpu.I1 + collArgs.globalRank * numOutputBytes,
+           numOutputBytes);
+    CHECK_CALL(tempInputCpu.FreeCpuMem());
+    CHECK_CALL(tempResultCpu.FreeCpuMem());
+    return TEST_SUCCESS;
+  }
+
+  ErrCode DefaultPrepData_Scatter(CollectiveArgs &collArgs)
+  {
+    CHECK_CALL(CheckAllocation(collArgs));
+    if (collArgs.numInputElements != collArgs.numOutputElements * collArgs.totalRanks)
+    {
+      ERROR("# of input elements must be total ranks * # output elements for Scatter\n");
+      return TEST_FAIL;
+    }
+
+    size_t const numInputBytes = collArgs.numInputElements * DataTypeToBytes(collArgs.dataType);
+    size_t const numOutputBytes = collArgs.numOutputElements * DataTypeToBytes(collArgs.dataType);
+
+    // Clear outputs on all ranks (prior to input in case of in-place)
+    collArgs.outputGpu.ClearGpuMem(numOutputBytes);
+
+    // Generate input as if on root rank - each rank will receive a portion
+    PtrUnion tempInput;
+    tempInput.AllocateCpuMem(numInputBytes);
+    tempInput.FillPattern(collArgs.dataType, collArgs.numInputElements, collArgs.root, false);
+
+    // Copy input to root rank
+    if (collArgs.globalRank == collArgs.root)
+    {
+      if (hipMemcpy(collArgs.inputGpu.ptr, tempInput.ptr, numInputBytes, hipMemcpyHostToDevice) != hipSuccess)
+      {
+        ERROR("hipMemcpy to input failed\n");
+        tempInput.FreeCpuMem();
+        return TEST_FAIL;
+      }
+    }
+    else
+    {
+      collArgs.inputGpu.ClearGpuMem(numInputBytes);
+    }
+
+    // Each rank receive a portion of the input
+    memcpy(collArgs.expected.U1, tempInput.U1 + (collArgs.globalRank * numOutputBytes), numOutputBytes);
+
+    tempInput.FreeCpuMem();
+    return TEST_SUCCESS;
+  }
+
+  ErrCode DefaultPrepData_AllToAll(CollectiveArgs &collArgs)
+  {
+    CHECK_CALL(CheckAllocation(collArgs));
+    if (collArgs.numInputElements != collArgs.numOutputElements)
+    {
+      ERROR("Number of input elements must match number of output elements for AllToAll\n");
+      return TEST_FAIL;
+    }
+    if (collArgs.numInputElements % collArgs.totalRanks)
+    {
+      ERROR("Input / Output size for AllToAll must be a multiple of %d\n", collArgs.totalRanks);
+      return TEST_FAIL;
+    }
+    size_t const numInputBytes = collArgs.numInputElements * DataTypeToBytes(collArgs.dataType);
+    size_t const numOutputBytes = collArgs.numOutputElements * DataTypeToBytes(collArgs.dataType);
+    size_t const numBytes = numInputBytes / collArgs.totalRanks;
+
+    // Clear outputs on all ranks (prior to input in case of in-place)
+    collArgs.outputGpu.ClearGpuMem(numOutputBytes);
+
+    // Generate input on root rank - each rank will receive a portion
+    PtrUnion tempInput;
+    tempInput.Attach(collArgs.outputCpu);
+
+    for (int rank = 0; rank < collArgs.totalRanks; ++rank)
+    {
+      tempInput.FillPattern(collArgs.dataType, collArgs.numInputElements, rank, false);
+
+      // Copy input
+      if (rank == collArgs.globalRank)
+      {
+        CHECK_HIP(hipMemcpy(collArgs.inputGpu.ptr, tempInput.ptr, numInputBytes, hipMemcpyHostToDevice));
+      }
+      memcpy(collArgs.expected.U1 + (numBytes * rank), tempInput.U1 + (numBytes * collArgs.globalRank), numBytes);
+    }
+    return TEST_SUCCESS;
+  }
+}
@@ -0,0 +1,26 @@
+ /*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#pragma once
+#include "ErrCode.hpp"
+
+namespace RcclUnitTesting
+{
+  class CollectiveArgs;
+
+  // Checks that enough memory has been allocated
+  ErrCode CheckAllocation(CollectiveArgs const& collArgs);
+
+  // Default PrepareData functions
+  // PrepareData functions are responsible for setting up input / expected for the given collArgs
+  ErrCode DefaultPrepareDataFunc(CollectiveArgs &collArgs);
+  ErrCode DefaultPrepData_Broadcast(CollectiveArgs &collArgs);
+  ErrCode DefaultPrepData_Reduce(CollectiveArgs &collArgs, bool const isAllReduce);
+  ErrCode DefaultPrepData_Gather(CollectiveArgs &collArgs, bool const isAllGather);
+  ErrCode DefaultPrepData_ReduceScatter(CollectiveArgs &collArgs);
+  ErrCode DefaultPrepData_Scatter(CollectiveArgs &collArgs);
+  ErrCode DefaultPrepData_AllToAll(CollectiveArgs &collArgs);
+  ErrCode DefaultPrepData_SendRecv(CollectiveArgs &collArgs);
+}
@@ -0,0 +1,354 @@
+#include "PtrUnion.hpp"
+
+namespace RcclUnitTesting
+{
+  size_t DataTypeToBytes(ncclDataType_t const dataType)
+  {
+    switch (dataType)
+    {
+    case ncclInt8:   return 1;
+    case ncclUint8:  return 1;
+    case ncclInt32:  return 4;
+    case ncclUint32: return 4;
+    case ncclInt64:  return 8;
+    case ncclUint64: return 8;
+    case ncclFloat16: return 2;
+    case ncclFloat32: return 4;
+    case ncclFloat64: return 8;
+    case ncclBfloat16: return 2;
+    default:
+      ERROR("Unsupported datatype (%d)\n", dataType);
+      exit(0);
+    }
+  }
+
+  ErrCode PtrUnion::Attach(void *ptr)
+  {
+    this->ptr = ptr;
+    return TEST_SUCCESS;
+  }
+
+  ErrCode PtrUnion::Attach(PtrUnion ptrUnion)
+  {
+    this->ptr = ptrUnion.ptr;
+    return TEST_SUCCESS;
+  }
+
+  ErrCode PtrUnion::AllocateGpuMem(size_t const numBytes, bool const useManagedMem)
+  {
+    if (numBytes)
+    {
+      if (useManagedMem)
+      {
+        if (hipMallocManaged(&I1, numBytes) != hipSuccess)
+        {
+          ERROR("Unable to allocate managed memory of GPU memory (%lu bytes)\n", numBytes);
+          return TEST_FAIL;
+        }
+      }
+      else
+      {
+        if (hipMalloc(&I1, numBytes) != hipSuccess)
+        {
+          ERROR("Unable to allocate memory of GPU memory (%lu bytes)\n", numBytes);
+          return TEST_FAIL;
+        }
+      }
+    }
+    return TEST_SUCCESS;
+  }
+
+  ErrCode PtrUnion::AllocateCpuMem(size_t const numBytes)
+  {
+    if (numBytes)
+    {
+      this->ptr = calloc(numBytes, 1);
+      if (!ptr)
+      {
+        ERROR("Unable to allocate memory (%lu bytes)\n", numBytes);
+        return TEST_FAIL;
+      }
+    }
+    return TEST_SUCCESS;
+  }
+
+  ErrCode PtrUnion::FreeGpuMem()
+  {
+    if (this->ptr != nullptr)
+    {
+      hipFree(this->ptr);
+      this->ptr = nullptr;
+    }
+    return TEST_SUCCESS;
+  }
+
+  ErrCode PtrUnion::FreeCpuMem()
+  {
+    if (this->ptr != nullptr)
+    {
+      free(this->ptr);
+      this->ptr = nullptr;
+    }
+    return TEST_SUCCESS;
+  }
+
+  ErrCode PtrUnion::ClearGpuMem(size_t const numBytes)
+  {
+    if (hipMemset(this->ptr, 0, numBytes) != hipSuccess)
+    {
+      ERROR("Unable to call hipMemset\n");
+      return TEST_FAIL;
+    }
+    return TEST_SUCCESS;
+  }
+
+  ErrCode PtrUnion::ClearCpuMem(size_t const numBytes)
+  {
+    memset(this->ptr, 0, numBytes);
+    return TEST_SUCCESS;
+  }
+
+  ErrCode PtrUnion::FillPattern(ncclDataType_t const dataType,
+                                size_t         const numElements,
+                                int            const globalRank,
+                                bool           const isGpuMem)
+  {
+    PtrUnion temp;
+    size_t const numBytes = numElements * DataTypeToBytes(dataType);
+
+    // If this is GPU memory, create a CPU temp buffer otherwise fill CPU memory directly
+    if (isGpuMem)
+      temp.AllocateCpuMem(numBytes);
+    else
+      temp.Attach(this->ptr);
+
+    for (int i = 0; i < numElements; i++)
+    {
+      int    valueI = (globalRank + i) % 256;
+      double valueF = 1.0L/((double)valueI+1.0L);
+      temp.Set(dataType, i, valueI, valueF);
+    }
+
+    // If this is GPU memory, copy from CPU temp buffer
+    if (isGpuMem)
+    {
+      if (hipMemcpy(this->ptr, temp.ptr, numBytes, hipMemcpyHostToDevice) != hipSuccess)
+      {
+        ERROR("Unable to fill input with pattern for rank %d\n", globalRank);
+        return TEST_FAIL;
+      }
+      temp.FreeCpuMem();
+    }
+
+    return TEST_SUCCESS;
+  }
+
+  ErrCode PtrUnion::Set(ncclDataType_t const dataType, int const idx, int valueI, double valueF)
+  {
+    switch (dataType)
+    {
+    case ncclInt8:     I1[idx] = valueI; break;
+    case ncclUint8:    U1[idx] = valueI; break;
+    case ncclInt32:    I4[idx] = valueI; break;
+    case ncclUint32:   U4[idx] = valueI; break;
+    case ncclInt64:    I8[idx] = valueI; break;
+    case ncclUint64:   U8[idx] = valueI; break;
+    case ncclFloat32:  F4[idx] = valueF; break;
+    case ncclFloat64:  F8[idx] = valueF; break;
+    case ncclBfloat16: B2[idx] = rccl_bfloat16(static_cast<float>(valueF)); break;
+    default:
+      ERROR("Unsupported datatype\n");
+      return TEST_FAIL;
+    }
+    return TEST_SUCCESS;
+  }
+
+  ErrCode PtrUnion::Get(ncclDataType_t const dataType, int const idx, int& valueI, double& valueF) const
+  {
+    switch (dataType)
+    {
+    case ncclInt8:     valueI = I1[idx]; break;
+    case ncclUint8:    valueI = I1[idx]; break;
+    case ncclInt32:    valueI = I4[idx]; break;
+    case ncclUint32:   valueI = U4[idx]; break;
+    case ncclInt64:    valueI = I8[idx]; break;
+    case ncclUint64:   valueI = U8[idx]; break;
+    case ncclFloat32:  valueF = F4[idx]; break;
+    case ncclFloat64:  valueF = F8[idx]; break;
+    case ncclBfloat16: valueF = B2[idx]; break;
+    default:
+      ERROR("Unsupported datatype\n");
+      return TEST_FAIL;
+    }
+    return TEST_SUCCESS;
+  }
+
+  // Multiplies in-place each element by scalarsPerRank[rank]
+  ErrCode PtrUnion::Scale(ncclDataType_t const  dataType,
+                          size_t         const  numElements,
+                          PtrUnion       const& scalarsPerRank,
+                          int            const  rank)
+  {
+    // If no scalars are provided do nothing
+    if (scalarsPerRank.ptr == nullptr) return TEST_SUCCESS;
+
+    for (size_t idx = 0; idx < numElements; ++idx)
+    {
+      switch (dataType)
+      {
+      case ncclInt8:     I1[idx] *= scalarsPerRank.I1[rank]; break;
+      case ncclUint8:    U1[idx] *= scalarsPerRank.U1[rank]; break;
+      case ncclInt32:    I4[idx] *= scalarsPerRank.I4[rank]; break;
+      case ncclUint32:   U4[idx] *= scalarsPerRank.U4[rank]; break;
+      case ncclInt64:    I8[idx] *= scalarsPerRank.I8[rank]; break;
+      case ncclUint64:   U8[idx] *= scalarsPerRank.U8[rank]; break;
+      case ncclFloat32:  F4[idx] *= scalarsPerRank.F4[rank]; break;
+      case ncclFloat64:  F8[idx] *= scalarsPerRank.F8[rank]; break;
+      case ncclBfloat16: B2[idx] *= scalarsPerRank.B2[rank]; break;
+      default:
+        ERROR("Unsupported datatype\n");
+        return TEST_FAIL;
+      }
+    }
+    return TEST_SUCCESS;
+  }
+
+  ErrCode PtrUnion::Reduce(ncclDataType_t const  dataType,
+                           size_t         const  numElements,
+                           PtrUnion       const& inputCpu,
+                           ncclRedOp_t    const  op)
+  {
+    if (inputCpu.ptr == nullptr)
+    {
+      ERROR("Input pointer to Reduce should not be nullptr\n");
+      return TEST_FAIL;
+    }
+
+    for (size_t idx = 0; idx < numElements; ++idx)
+    {
+      switch (dataType)
+      {
+      case ncclInt8:     I1[idx] = ReduceOp(op, I1[idx], inputCpu.I1[idx]); break;
+      case ncclUint8:    U1[idx] = ReduceOp(op, U1[idx], inputCpu.U1[idx]); break;
+      case ncclInt32:    I4[idx] = ReduceOp(op, I4[idx], inputCpu.I4[idx]); break;
+      case ncclUint32:   U4[idx] = ReduceOp(op, U4[idx], inputCpu.U4[idx]); break;
+      case ncclInt64:    I8[idx] = ReduceOp(op, I8[idx], inputCpu.I8[idx]); break;
+      case ncclUint64:   U8[idx] = ReduceOp(op, U8[idx], inputCpu.U8[idx]); break;
+      case ncclFloat32:  F4[idx] = ReduceOp(op, F4[idx], inputCpu.F4[idx]); break;
+      case ncclFloat64:  F8[idx] = ReduceOp(op, F8[idx], inputCpu.F8[idx]); break;
+      case ncclBfloat16: B2[idx] = ReduceOp(op, B2[idx], inputCpu.B2[idx]); break;
+      default:
+        ERROR("Unsupported datatype\n");
+        return TEST_FAIL;
+      }
+    }
+    return TEST_SUCCESS;
+  }
+
+
+  ErrCode PtrUnion::DivideByInt(ncclDataType_t const dataType,
+                                size_t         const numElements,
+                                int            const divisor)
+  {
+    for (size_t idx = 0; idx < numElements; ++idx)
+    {
+      switch (dataType)
+      {
+      case ncclInt8:     I1[idx] /= divisor; break;
+      case ncclUint8:    U1[idx] /= divisor; break;
+      case ncclInt32:    I4[idx] /= divisor; break;
+      case ncclUint32:   U4[idx] /= divisor; break;
+      case ncclInt64:    I8[idx] /= divisor; break;
+      case ncclUint64:   U8[idx] /= divisor; break;
+      case ncclFloat32:  F4[idx] /= divisor; break;
+      case ncclFloat64:  F8[idx] /= divisor; break;
+      case ncclBfloat16: B2[idx] = (rccl_bfloat16((float)(B2[idx]) / divisor)); break;
+      default:
+        ERROR("Unsupported datatype\n");
+        return TEST_FAIL;
+      }
+    }
+    return TEST_SUCCESS;
+  }
+
+  ErrCode PtrUnion::IsEqual(ncclDataType_t const  dataType,
+                            size_t         const  numElements,
+                            PtrUnion       const& expected,
+                            bool           const  verbose,
+                            bool&                 isMatch)
+  {
+    isMatch = true;
+    size_t idx = 0;
+    for (idx = 0; idx < numElements; ++idx)
+    {
+      switch (dataType)
+      {
+      case ncclInt8:    isMatch = (I1[idx] == expected.I1[idx]); break;
+      case ncclUint8:   isMatch = (U1[idx] == expected.U1[idx]); break;
+      case ncclInt32:   isMatch = (I4[idx] == expected.I4[idx]); break;
+      case ncclUint32:  isMatch = (U4[idx] == expected.U4[idx]); break;
+      case ncclInt64:   isMatch = (I8[idx] == expected.I8[idx]); break;
+      case ncclUint64:  isMatch = (U8[idx] == expected.U8[idx]); break;
+      case ncclFloat32: isMatch = (fabs(F4[idx] - expected.F4[idx]) < 1e-5); break;
+      case ncclFloat64: isMatch = (fabs(F8[idx] - expected.F8[idx]) < 1e-12); break;
+      case ncclBfloat16: isMatch = (fabs((float)B2[idx] - (float)expected.B2[idx]) < 9e-2); break;
+      default:
+        ERROR("Unsupported datatype\n");
+        return TEST_FAIL;
+      }
+      if (!isMatch) break;
+    }
+
+    if (verbose && !isMatch)
+    {
+      switch (dataType)
+      {
+      case ncclInt8:
+        ERROR("Expected output: %d.  Actual output: %d at index %lu\n", expected.I1[idx], I1[idx], idx); break;
+      case ncclUint8:
+        ERROR("Expected output: %u.  Actual output: %u at index %lu\n", expected.U1[idx], U1[idx], idx); break;
+      case ncclInt32:
+        ERROR("Expected output: %d.  Actual output: %d at index %lu\n", expected.I4[idx], I4[idx], idx); break;
+      case ncclUint32:
+        ERROR("Expected output: %u.  Actual output: %u at index %lu\n", expected.U4[idx], U4[idx], idx); break;
+      case ncclInt64:
+        ERROR("Expected output: %ld.  Actual output: %ld at index %lu\n", expected.I8[idx], I8[idx], idx); break;
+      case ncclUint64:
+        ERROR("Expected output: %lu.  Actual output: %lu at index %lu\n", expected.U8[idx], U8[idx], idx); break;
+      case ncclFloat32:
+        ERROR("Expected output: %f.  Actual output: %f at index %lu\n", expected.F4[idx], F4[idx], idx); break;
+      case ncclFloat64:
+        ERROR("Expected output: %lf.  Actual output: %lf at index %lu\n", expected.F8[idx], F8[idx], idx); break;
+      case ncclBfloat16:
+        ERROR("Expected output: %f.  Actual output: %f at index %lu\n", (float)expected.B2[idx], (float)B2[idx], idx); break;
+      default:
+        break;
+      }
+    }
+    return TEST_SUCCESS;
+  }
+
+  std::string PtrUnion::ToString(ncclDataType_t const  dataType,
+                                 size_t         const  numElements) const
+  {
+    std::stringstream ss;
+    for (int i = 0; i < numElements; i++)
+    {
+      if (i) ss <<  " ";
+      switch (dataType)
+      {
+      case ncclInt8:     ss << I1[i]; break;
+      case ncclUint8:    ss << U1[i]; break;
+      case ncclInt32:    ss << I4[i]; break;
+      case ncclUint32:   ss << U4[i]; break;
+      case ncclInt64:    ss << I8[i]; break;
+      case ncclUint64:   ss << U8[i]; break;
+      case ncclFloat32:  ss << F4[i]; break;
+      case ncclFloat64:  ss << F8[i]; break;
+      case ncclBfloat16: ss << (float)B2[i]; break;
+      default: break;
+      }
+    }
+    return ss.str();
+  }
+}
@@ -0,0 +1,90 @@
+#pragma once
+#include "ErrCode.hpp"
+#include "rccl.h"
+#include "rccl_bfloat16.h"
+
+namespace RcclUnitTesting
+{
+  // Performs the various basic reduction operations
+  template <typename T>
+  T ReduceOp(ncclRedOp_t const op, T const A, T const B)
+  {
+    switch (op)
+    {
+    case ncclSum:  return A + B;
+    case ncclProd: return A * B;
+    case ncclMax:  return std::max(A, B);
+    case ncclMin:  return std::min(A, B);
+    default:
+      ERROR("Unsupported reduction operator (%d)\n", op);
+      exit(0);
+    }
+  }
+
+  size_t DataTypeToBytes(ncclDataType_t const dataType);
+
+  // PtrUnion encapsulates a pointer of all the different supported datatypes
+  // NOTE: Currently half-precision float tests are unsupported due to half
+  //       being supported on GPU only and not host
+  union PtrUnion
+  {
+    void*          ptr;
+    int8_t*        I1; // ncclInt8
+    uint8_t*       U1; // ncclUint8
+    int32_t*       I4; // ncclInt32
+    uint32_t*      U4; // ncclUint32
+    int64_t*       I8; // ncclInt64
+    uint64_t*      U8; // ncclUint64
+    float*         F4; // ncclFloat32
+    double*        F8; // ncclFloat64
+    rccl_bfloat16* B2; // ncclBfloat16
+
+    ErrCode Attach(void *ptr);
+    ErrCode Attach(PtrUnion ptrUnion);
+
+    ErrCode AllocateGpuMem(size_t const numBytes, bool const useManagedMem = false);
+    ErrCode AllocateCpuMem(size_t const numBytes);
+
+    ErrCode FreeGpuMem();
+    ErrCode FreeCpuMem();
+
+    ErrCode ClearGpuMem(size_t const numBytes);
+    ErrCode ClearCpuMem(size_t const numBytes);
+
+    ErrCode FillPattern(ncclDataType_t const dataType,
+                        size_t         const numElements,
+                        int            const globalRank,
+                        bool           const isGpuMem);
+
+    ErrCode Set(ncclDataType_t const dataType, int const idx, int valueI, double valueF);
+    ErrCode Get(ncclDataType_t const dataType, int const idx, int& valueI, double& valueF) const;
+
+    // Multiplies in-place each element by scalarsPerRank[rank]
+    ErrCode Scale(ncclDataType_t const  dataType,
+                  size_t         const  numElements,
+                  PtrUnion       const& scalarsPerRank,
+                  int            const  rank);
+
+    // Reduces input into this PtrUnion
+    ErrCode Reduce(ncclDataType_t const  dataType,
+                   size_t         const  numElements,
+                   PtrUnion       const& inputCpu,
+                   ncclRedOp_t    const  op);
+
+    // Divide each element by a integer value
+    ErrCode DivideByInt(ncclDataType_t const dataType,
+                        size_t         const numElements,
+                        int            const divisor);
+
+    // Compares for equality (fuzzy comparision for floating point types)
+    ErrCode IsEqual(ncclDataType_t const  dataType,
+                    size_t         const  numElements,
+                    PtrUnion       const& expected,
+                    bool           const  verbose,
+                    bool&                 isMatch);
+
+    // Output to string (for debug)
+    std::string ToString(ncclDataType_t const  dataType,
+                         size_t         const  numElements) const;
+  };
+}
@@ -0,0 +1,485 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include <unistd.h>
+#include "TestBed.hpp"
+#include <rccl.h>
+
+#define PIPE_WRITE(childId, val)                                        \
+  ASSERT_EQ(write(childList[childId]->parentWriteFd, &val, sizeof(val)), sizeof(val))
+
+#define PIPE_CHECK(childId)                                             \
+  {                                                                     \
+    int response = 0;                                                   \
+    ASSERT_EQ(read(childList[childId]->parentReadFd, &response, sizeof(int)), sizeof(int)); \
+    ASSERT_EQ(response, TEST_SUCCESS);                                  \
+  }
+
+namespace RcclUnitTesting
+{
+  TestBed::TestBed() :
+    numDevicesAvailable(0),
+    numActiveChildren(0),
+    numActiveRanks(0)
+  {
+    // Set NCCL_COMM_ID to use a local port to avoid passing ncclCommId
+    // Calling ncclGetUniqueId would initialize HIP, which should not be done prior to fork
+    std::string localPort = "55513";
+    if (!getenv("NCCL_COMM_ID"))
+    {
+      char hostname[HOST_NAME_MAX+1];
+      gethostname(hostname, HOST_NAME_MAX+1);
+      std::string hostnameString(hostname);
+      hostnameString.append(":55513");
+      setenv("NCCL_COMM_ID", hostnameString.c_str(), 0);
+      if (ev.verbose) INFO("NCCL_COMM_ID set to %s\n", hostnameString.c_str());
+    }
+
+    // Collect the number of GPUs
+    this->numDevicesAvailable = ev.maxGpus;
+    if (ev.verbose) INFO("Detected %d GPUs\n", this->numDevicesAvailable);
+
+    // Create the maximum number of possible child processes (1 per GPU)
+    // Parent and child communicate via pipes
+    childList.resize(this->numDevicesAvailable);
+    for (int childId = 0; childId < this->numDevicesAvailable; ++childId)
+    {
+      childList[childId] = new TestBedChild(childId, ev.verbose, ev.printValues);
+      if (childList[childId]->InitPipes() != TEST_SUCCESS)
+      {
+        ERROR("Unable to create pipes to child process\n");
+        return;
+      }
+
+      pid_t pid = fork();
+      if (pid == 0)
+      {
+        // Child process enters execution loop
+        childList[childId]->StartExecutionLoop();
+        return;
+      }
+      else
+      {
+        // Parent records child process ID and closes unused ends of pipe
+        childList[childId]->pid = pid;
+        close(childList[childId]->childWriteFd);
+        close(childList[childId]->childReadFd);
+      }
+    }
+  }
+
+  void TestBed::InitComms(std::vector<std::vector<int>> const& deviceIdsPerProcess,
+                          int const numCollectivesInGroup)
+  {
+    // Count up the total number of GPUs to use and track child/deviceId per rank
+    this->numActiveChildren = deviceIdsPerProcess.size();
+    this->numActiveRanks = 0;
+    this->numCollectivesInGroup = numCollectivesInGroup;
+    this->rankToChildMap.clear();
+    this->rankToDeviceMap.clear();
+    if (ev.verbose) INFO("Setting up %d active child processes\n", this->numActiveChildren);
+    for (int childId = 0; childId < this->numActiveChildren; ++childId)
+    {
+      for (auto i = 0; i < deviceIdsPerProcess[childId].size(); ++i)
+      {
+        this->rankToChildMap.push_back(childId);
+        this->rankToDeviceMap.push_back(deviceIdsPerProcess[childId][i]);
+        ++this->numActiveRanks;
+      }
+    }
+
+    // Send InitComms command to each active child process
+    int const cmd = TestBedChild::CHILD_INIT_COMMS;
+    int rankOffset = 0;
+    for (int childId = 0; childId < this->numActiveChildren; ++childId)
+    {
+      PIPE_WRITE(childId, cmd);
+
+      // Send total number of ranks to child process
+      PIPE_WRITE(childId, this->numActiveRanks);
+
+      // Send the rank offset for this child process
+      PIPE_WRITE(childId, rankOffset);
+
+      // Send the number of collectives to be run per group call
+      PIPE_WRITE(childId, numCollectivesInGroup);
+
+      // Send the GPUs this child uses
+      int const numGpus = deviceIdsPerProcess[childId].size();
+      PIPE_WRITE(childId, numGpus);
+      for (int i = 0; i < numGpus; i++)
+        PIPE_WRITE(childId, deviceIdsPerProcess[childId][i]);
+
+      rankOffset += numGpus;
+    }
+
+    // Wait for child acknowledgement
+    // This is done after previous loop to avoid deadlock as every rank needs to enter ncclInitCommRank
+    for (int childId = 0; childId < this->numActiveChildren; ++childId)
+    {
+      PIPE_CHECK(childId);
+    }
+  }
+
+  void TestBed::InitComms(int const numGpus, int const numCollectivesInGroup)
+  {
+    InitComms(TestBed::GetDeviceIdsList(1, numGpus), numCollectivesInGroup);
+  }
+
+  void TestBed::SetCollectiveArgs(ncclFunc_t     const funcType,
+                                  ncclDataType_t const dataType,
+                                  ncclRedOp_t    const redOp,
+                                  int            const root,
+                                  size_t         const numInputElements,
+                                  size_t         const numOutputElements,
+                                  int            const collId,
+                                  int            const rank,
+                                  PtrUnion       const scalarsPerRank,
+                                  int            const scalarMode)
+  {
+    // Build list of ranks this applies to (-1 for rank means to set for all)
+    std::vector<int> rankList;
+    for (int i = 0; i < this->numActiveRanks; ++i)
+      if (rank == -1 || rank == i) rankList.push_back(i);
+
+    ScalarTransport scalarTransport;
+    if (scalarMode >= 0)
+    {
+      ASSERT_TRUE(scalarsPerRank.ptr != NULL);
+
+      // Capture scalars per rank in format to share with child processes
+      int const numBytes = this->numActiveRanks * DataTypeToBytes(dataType);
+      memcpy(scalarTransport.ptr, scalarsPerRank.ptr, numBytes);
+    }
+
+    // Loop over all ranks and send CollectiveArgs to appropriate child process
+    int const cmd = TestBedChild::CHILD_SET_COLL_ARGS;
+    for (auto currRank : rankList)
+    {
+      int const childId = rankToChildMap[currRank];
+      PIPE_WRITE(childId, cmd);
+      PIPE_WRITE(childId, currRank);
+      PIPE_WRITE(childId, collId);
+      PIPE_WRITE(childId, funcType);
+      PIPE_WRITE(childId, dataType);
+      PIPE_WRITE(childId, redOp);
+      PIPE_WRITE(childId, root);
+      PIPE_WRITE(childId, numInputElements);
+      PIPE_WRITE(childId, numOutputElements);
+      PIPE_WRITE(childId, scalarMode);
+      PIPE_WRITE(childId, scalarTransport);
+      PIPE_CHECK(childId);
+    }
+  }
+
+  void TestBed::AllocateMem(bool   const inPlace,
+                            bool   const useManagedMem,
+                            int    const collId,
+                            int    const rank)
+  {
+    // Build list of ranks this applies to (-1 for rank means to set for all)
+    std::vector<int> rankList;
+    for (int i = 0; i < this->numActiveRanks; ++i)
+      if (rank == -1 || rank == i) rankList.push_back(i);
+
+    // Loop over all ranks and send allocation command to appropriate child process
+    int const cmd = TestBedChild::CHILD_ALLOCATE_MEM;
+    for (auto currRank : rankList)
+    {
+      int const childId = rankToChildMap[currRank];
+      PIPE_WRITE(childId, cmd);
+      PIPE_WRITE(childId, currRank);
+      PIPE_WRITE(childId, collId);
+      PIPE_WRITE(childId, inPlace);
+      PIPE_WRITE(childId, useManagedMem);
+      PIPE_CHECK(childId);
+    }
+  }
+
+  void TestBed::PrepareData(int         const collId,
+                            int         const rank,
+                            CollFuncPtr const prepDataFunc)
+  {
+    // Build list of ranks this applies to (-1 for rank means to set for all)
+    std::vector<int> rankList;
+    for (int i = 0; i < this->numActiveRanks; ++i)
+      if (rank == -1 || rank == i) rankList.push_back(i);
+
+    // Loop over all ranks and send prepare data command to appropriate child process
+    int const cmd = TestBedChild::CHILD_PREPARE_DATA;
+    for (auto currRank : rankList)
+    {
+      int const childId = rankToChildMap[currRank];
+      PIPE_WRITE(childId, cmd);
+      PIPE_WRITE(childId, currRank);
+      PIPE_WRITE(childId, collId);
+      PIPE_WRITE(childId, prepDataFunc);
+      PIPE_CHECK(childId);
+    }
+  }
+
+  void TestBed::ExecuteCollectives()
+  {
+    int const cmd = TestBedChild::CHILD_EXECUTE_COLL;
+    ++TestBed::NumTestsRun();
+
+    // Send ExecuteColl command to each active child process
+    for (int childId = 0; childId < this->numActiveChildren; ++childId)
+    {
+      PIPE_WRITE(childId, cmd);
+    }
+
+    // Wait for child acknowledgement
+    for (int childId = 0; childId < this->numActiveChildren; ++childId)
+    {
+      PIPE_CHECK(childId);
+    }
+  }
+
+  void TestBed::ValidateResults(bool& isCorrect, int const collId, int const rank)
+  {
+    // Build list of ranks this applies to (-1 for rank means to set for all)
+    std::vector<int> rankList;
+    for (int i = 0; i < this->numActiveRanks; ++i)
+      if (rank == -1 || rank == i) rankList.push_back(i);
+
+    int const cmd = TestBedChild::CHILD_VALIDATE_RESULTS;
+
+    isCorrect = true;
+    // Send ValidateResults command to each active child process
+    for (auto currRank : rankList)
+    {
+      int const childId = rankToChildMap[currRank];
+      PIPE_WRITE(childId, cmd);
+      PIPE_WRITE(childId, currRank);
+      PIPE_WRITE(childId, collId);
+
+      int response = 0;
+      ASSERT_EQ(read(childList[childId]->parentReadFd, &response, sizeof(int)), sizeof(int));
+      isCorrect &= (response == TEST_SUCCESS);
+    }
+
+    ASSERT_EQ(isCorrect, true) << "Output does not match expected";
+  }
+
+  void TestBed::DeallocateMem(int const collId, int const rank)
+  {
+    // Build list of ranks this applies to (-1 for rank means to set for all)
+    std::vector<int> rankList;
+    for (int i = 0; i < this->numActiveRanks; ++i)
+      if (rank == -1 || rank == i) rankList.push_back(i);
+
+    int const cmd = TestBedChild::CHILD_DEALLOCATE_MEM;
+
+    for (auto currRank : rankList)
+    {
+      int const childId = rankToChildMap[currRank];
+      PIPE_WRITE(childId, cmd);
+      PIPE_WRITE(childId, currRank);
+      PIPE_WRITE(childId, collId);
+      PIPE_CHECK(childId);
+    }
+  }
+
+  void TestBed::DestroyComms()
+  {
+    int const cmd = TestBedChild::CHILD_DESTROY_COMMS;
+    for (int childId = 0; childId < this->numActiveChildren; ++childId)
+    {
+      // Send DestroyComms command to each active child process
+      PIPE_WRITE(childId, cmd);
+
+      // Wait for child acknowledgement
+      PIPE_CHECK(childId);
+    }
+
+    // Reset bookkeeping
+    this->numActiveChildren = 0;
+    this->numActiveRanks = 0;
+    this->numCollectivesInGroup = 0;
+  }
+
+  void TestBed::Finalize()
+  {
+    // Send Stop to all child processes
+    int const cmd = TestBedChild::CHILD_STOP;
+    for (int childId = 0; childId < this->numDevicesAvailable; ++childId)
+    {
+      PIPE_WRITE(childId, cmd);
+
+      // Close pipes to child process
+      close(childList[childId]->parentWriteFd);
+      close(childList[childId]->parentReadFd);
+    }
+    this->numDevicesAvailable = 0;
+  }
+
+  TestBed::~TestBed()
+  {
+    Finalize();
+  }
+
+  std::vector<ncclRedOp_t> const& TestBed::GetAllSupportedRedOps()
+  {
+    return ev.GetAllSupportedRedOps();
+  }
+
+  std::vector<ncclDataType_t> const& TestBed::GetAllSupportedDataTypes()
+  {
+    return ev.GetAllSupportedDataTypes();
+  }
+
+  std::vector<std::vector<int>> TestBed::GetDeviceIdsList(int const numProcesses,
+                                                 int const numGpus)
+  {
+    std::vector<std::vector<int>> result(numProcesses);
+    for (int i = 0; i < numGpus; i++)
+      result[i % numProcesses].push_back(i);
+    return result;
+  }
+
+  std::string TestBed::GetTestCaseName(int            const totalRanks,
+                                       bool           const isMultiProcess,
+                                       ncclFunc_t     const funcType,
+                                       ncclDataType_t const dataType,
+                                       ncclRedOp_t    const redOp,
+                                       int            const root,
+                                       bool           const inPlace,
+                                       bool           const managedMem)
+  {
+    std::stringstream ss;
+    ss << (isMultiProcess ? "MP" : "SP") <<  " ";
+    ss << totalRanks << " ranks ";
+    ss << ncclFuncNames[funcType] << " ";
+    ss << "(" << (inPlace ? "IP" : "OP") << "," << (managedMem ? "MM" : "GM") << ") ";
+    ss << ncclDataTypeNames[dataType] << " ";
+    if (CollectiveArgs::UsesReduce(funcType)) ss << ncclRedOpNames[redOp] << " ";
+    if (CollectiveArgs::UsesRoot(funcType)) ss << "Root " << root << " ";
+    return ss.str();
+  }
+
+  void TestBed::RunSimpleSweep(std::vector<ncclFunc_t>     const& funcTypes,
+                               std::vector<ncclDataType_t> const& tmpDataTypes,
+                               std::vector<ncclRedOp_t>    const& tmpRedOps,
+                               std::vector<int>            const& roots,
+                               std::vector<int>            const& numElements,
+                               std::vector<bool>           const& inPlaceList,
+                               std::vector<bool>           const& managedMemList)
+  {
+    // Sort numElements in descending order to cut down on # of allocations
+    std::vector<int> sortedN = numElements;
+    std::sort(sortedN.rbegin(), sortedN.rend());
+
+    // Filter out any unsupported datatypes, in case only subset has been compiled for
+    std::vector<ncclDataType_t> const& supportedDataTypes = this->GetAllSupportedDataTypes();
+    std::vector<ncclDataType_t> dataTypes;
+    for (auto dt : tmpDataTypes)
+    {
+      for (int i = 0; i < supportedDataTypes.size(); ++i)
+      {
+        if (supportedDataTypes[i] == dt)
+        {
+          dataTypes.push_back(dt);
+          break;
+        }
+      }
+    }
+
+    // Filter out any unsupported reduction ops, in case only subset has been compiled for
+    std::vector<ncclRedOp_t> const& supportedOps = this->GetAllSupportedRedOps();
+    std::vector<ncclRedOp_t> redOps;
+    for (auto redop : tmpRedOps)
+    {
+      for (int i = 0; i < supportedOps.size(); ++i)
+      {
+        if (supportedOps[i] == redop)
+        {
+          redOps.push_back(redop);
+          break;
+        }
+      }
+    }
+
+    bool isCorrect = true;
+
+    // Sweep over the number of ranks
+    for (int totalRanks = ev.minGpus; totalRanks <= ev.maxGpus && isCorrect; ++totalRanks)
+    for (int isMultiProcess = 0; isMultiProcess <= 1 && isCorrect; ++isMultiProcess)
+    {
+      if (!(ev.processMask & (1 << isMultiProcess))) continue;
+
+      // Test either single process all GPUs, or 1 process per GPU
+      int const numProcesses = isMultiProcess ? totalRanks : 1;
+      this->InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks));
+
+      for (int ftIdx = 0; ftIdx < funcTypes.size()      && isCorrect; ++ftIdx)
+      for (int dtIdx = 0; dtIdx < dataTypes.size()      && isCorrect; ++dtIdx)
+      for (int rdIdx = 0; rdIdx < redOps.size()         && isCorrect; ++rdIdx)
+      for (int rtIdx = 0; rtIdx < roots.size()          && isCorrect; ++rtIdx)
+      for (int ipIdx = 0; ipIdx < inPlaceList.size()    && isCorrect; ++ipIdx)
+      for (int mmIdx = 0; mmIdx < managedMemList.size() && isCorrect; ++mmIdx)
+      {
+        if (ev.showNames)
+        {
+          std::string name = this->GetTestCaseName(totalRanks, isMultiProcess,
+                                                   funcTypes[ftIdx], dataTypes[dtIdx],
+                                                   redOps[rdIdx], roots[rtIdx],
+                                                   inPlaceList[ipIdx], managedMemList[mmIdx]);
+          INFO("%s\n", name.c_str());
+        }
+
+        for (int neIdx = 0; neIdx < numElements.size() && isCorrect; ++neIdx)
+        {
+          int numInputElements, numOutputElements;
+          CollectiveArgs::GetNumElementsForFuncType(funcTypes[ftIdx],
+                                                    sortedN[neIdx],
+                                                    totalRanks,
+                                                    &numInputElements,
+                                                    &numOutputElements);
+
+          this->SetCollectiveArgs(funcTypes[ftIdx],
+                                  dataTypes[dtIdx],
+                                  redOps[rdIdx],
+                                  roots[rtIdx],
+                                  numInputElements,
+                                  numOutputElements);
+
+          // Only allocate once for largest size
+          if (neIdx == 0) this->AllocateMem(inPlaceList[ipIdx], managedMemList[mmIdx]);
+
+          // There are some cases when data does not need to be re-prepared
+          // e.g. AllReduce subarray expected results are still valid
+          bool canSkip = (neIdx != 0 && !inPlaceList[ipIdx] &&
+                          (funcTypes[ftIdx] == ncclCollBroadcast ||
+                           funcTypes[ftIdx] == ncclCollReduce    ||
+                           funcTypes[ftIdx] == ncclCollAllReduce));
+          if (!canSkip) this->PrepareData();
+
+          this->ExecuteCollectives();
+          this->ValidateResults(isCorrect);
+          if (!isCorrect)
+          {
+            std::string name = this->GetTestCaseName(totalRanks, isMultiProcess,
+                                                     funcTypes[ftIdx], dataTypes[dtIdx],
+                                                     redOps[rdIdx], roots[rtIdx],
+                                                     inPlaceList[ipIdx], managedMemList[mmIdx]);
+            ERROR("Incorrect output for %s\n", name.c_str());
+          }
+        }
+        this->DeallocateMem();
+      }
+      this->DestroyComms();
+    }
+  }
+
+  int& TestBed::NumTestsRun()
+  {
+    static int numTestsRun = 0;
+    return numTestsRun;
+  }
+}
+
+#undef PIPE_WRITE
+#undef PIPE_CHECK
@@ -0,0 +1,129 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#pragma once
+#include <map>
+#include "CollectiveArgs.hpp"
+#include "TestBedChild.hpp"
+#include "EnvVars.hpp"
+#include <gtest/gtest.h>
+
+namespace RcclUnitTesting
+{
+  // This class facilitates testing RCCL collectives across various process / device configurations
+  //
+  class TestBed
+  {
+  public:
+    int                        numDevicesAvailable;   // # of devices detected on node
+    std::vector<TestBedChild*> childList;             // List of child processes
+    std::vector<int>           rankToChildMap;        // Tracks which child process each rank is assigned to
+    std::vector<int>           rankToDeviceMap;       // Tracks which device each rank is assigned to
+    int                        numActiveChildren;     // List of active children (with usable RCCL comms)
+    int                        numActiveRanks;        // Current # of ranks in use
+    int                        numCollectivesInGroup; // # of collectives to execute per group call
+
+    EnvVars ev;                                       // Environment variables
+
+    // Constructor - Creates one child process per detected GPU device that waits for further commands
+    TestBed();
+
+    // Prepare TestBed for use with GPUs across multiple child processes
+    void InitComms(std::vector<std::vector<int>> const& deviceIdsPerChild, int const numCollectivesInGroup = 1);
+    // Prepare TestBed for use with GPUs on a single child process
+    void InitComms(int const numGpus, int const numCollectivesInGroup = 1);
+
+    // Set collectives arguments for specified collective / rank
+    // Setting scalarsPerRank to non-null will create custom reduction operator
+    // Using collId = -1 (default) applies settings to all collectives in group
+    // Using rank = -1 (default) applies settings to all ranks
+
+    void SetCollectiveArgs(ncclFunc_t     const funcType,
+                           ncclDataType_t const dataType,
+                           ncclRedOp_t    const redOp,
+                           int            const root,
+                           size_t         const numInputElements,
+                           size_t         const numOutputElements,
+                           int            const collId = -1,
+                           int            const rank = -1,
+                           PtrUnion       const scalarsPerRank = {nullptr},
+                           int            const scalarMode = -1);
+
+    // Allocate memory for specified collective / rank
+    // - Requires SetCollectiveArgs to have been called already
+    // Using collId = -1 (default) applies settings to all collectives in group
+    // Using rank = -1 (default) applies settings to all ranks
+    void AllocateMem(bool   const inPlace = false,
+                     bool   const useManagedMem = false,
+                     int    const collId = -1,
+                     int    const rank = -1);
+
+    // Initialize input and compute expected results
+    // - requires that SetCollectiveArgs and AllocateMemory have already been called
+    // Setting collId to -1 applies settings to all collectives in group
+    // Setting rank to -1 applies settings to all ranks
+    // Setting prepDataFunc to nullptr uses the default fill pattern routine
+    void PrepareData(int const collId = -1,
+                     int const rank = -1,
+                     CollFuncPtr const prepDataFunc = nullptr);
+
+    // Execute all collectives on all test children
+    // Blocks until collective is completed
+    void ExecuteCollectives();
+
+    // Perform results validation - compare output to expected
+    void ValidateResults(bool& isCorrect, int collId = -1, int const rank = -1);
+
+    // Release allocated memory
+    void DeallocateMem(int collId = -1, int const rank = -1);
+
+    // Release the RCCL comms
+    void DestroyComms();
+
+    // Explicit TestBed destructor that releases all child processes
+    // No further calls to TestBed should be performed after this call
+    void Finalize();
+
+    // Destructor - Calls Finalize() to release all child processes
+    ~TestBed();
+
+    // Returns all the supported reduction operations based on build settings
+    std::vector<ncclRedOp_t> const& GetAllSupportedRedOps();
+
+    // Return all the supported data types based on build settings
+    std::vector<ncclDataType_t> const& GetAllSupportedDataTypes();
+
+    // Helper function that splits up GPUs to the given number of processes
+    static std::vector<std::vector<int>> GetDeviceIdsList(int const numProcesses,
+                                                          int const numGpus);
+
+    // Generate a test case name
+    static std::string GetTestCaseName(int            const totalRanks,
+                                       bool           const isMultiProcess,
+                                       ncclFunc_t     const funcType,
+                                       ncclDataType_t const dataType,
+                                       ncclRedOp_t    const redOp,
+                                       int            const root,
+                                       bool           const inPlace,
+                                       bool           const managedMem);
+
+    // Run a simple sweep
+    void RunSimpleSweep(std::vector<ncclFunc_t>     const& funcTypes,
+                        std::vector<ncclDataType_t> const& dataTypes,
+                        std::vector<ncclRedOp_t>    const& redOps,
+                        std::vector<int>            const& roots,
+                        std::vector<int>            const& numElements,
+                        std::vector<bool>           const& inPlaceList,
+                        std::vector<bool>           const& managedMemList);
+
+    // Used to track total number of calls to ExecuteCollectives()
+    static int& NumTestsRun();
+
+  protected:
+    // Ends the specified child process
+    void StopChild(int const childId);
+  };
+}
@@ -0,0 +1,589 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "TestBedChild.hpp"
+#include <thread>
+
+#define CHILD_NCCL_CALL(cmd, msg)                                       \
+  {                                                                     \
+    if (this->verbose) printf("[ NCCL CALL] " #cmd "\n");               \
+    ncclResult_t status = cmd;                                          \
+    if (status != ncclSuccess)                                          \
+    {                                                                   \
+      ERROR("Child process %d fails NCCL call %s with code %d\n", this->childId, msg, status); \
+      return TEST_FAIL;                                                 \
+    }                                                                   \
+  }
+
+#define PIPE_READ(val) \
+  if (read(childReadFd, &val, sizeof(val)) != sizeof(val)) return TEST_FAIL;
+
+namespace RcclUnitTesting
+{
+  TestBedChild::TestBedChild(int const childId, bool const verbose, int const printValues)
+  {
+    this->childId = childId;
+    this->verbose = verbose;
+    this->printValues = printValues;
+  }
+
+  int TestBedChild::InitPipes()
+  {
+    // Prepare parent->child pipe
+    int pipefd[2];
+    if (pipe(pipefd) == -1)
+    {
+      ERROR("Unable to create parent->child pipe for child %d\n", this->childId);
+      return TEST_FAIL;
+    }
+    this->childReadFd   = pipefd[0];
+    this->parentWriteFd = pipefd[1];
+
+    // Prepare child->parent pipe
+    this->parentReadFd = -1;
+    if (pipe(pipefd) == -1)
+    {
+      ERROR("Unable to create parent->child pipe for child %d\n", this->childId);
+      return TEST_FAIL;
+    }
+    this->parentReadFd = pipefd[0];
+    this->childWriteFd = pipefd[1];
+    return TEST_SUCCESS;
+  }
+
+  void TestBedChild::StartExecutionLoop()
+  {
+    // Close unused ends of pipes
+    close(this->parentWriteFd);
+    close(this->parentReadFd);
+
+    // Wait for commands from parent process
+    if (verbose) INFO("Child %d enters execution loop\n", this->childId);
+    int command;
+    while (read(childReadFd, &command, sizeof(command)) > 0)
+    {
+      if (verbose) INFO("Child %d received command [%s]:\n", this->childId, ChildCommandNames[command]);;
+      ErrCode status = TEST_SUCCESS;
+      switch(command)
+      {
+      case CHILD_INIT_COMMS      : status = InitComms();          break;
+      case CHILD_SET_COLL_ARGS   : status = SetCollectiveArgs();  break;
+      case CHILD_ALLOCATE_MEM    : status = AllocateMem();        break;
+      case CHILD_PREPARE_DATA    : status = PrepareData();        break;
+      case CHILD_EXECUTE_COLL    : status = ExecuteCollectives(); break;
+      case CHILD_VALIDATE_RESULTS: status = ValidateResults();    break;
+      case CHILD_DEALLOCATE_MEM  : status = DeallocateMem();      break;
+      case CHILD_DESTROY_COMMS   : status = DestroyComms();       break;
+      case CHILD_STOP            : status = Stop();               break;
+      default: exit(0);
+      }
+
+      // Send back acknowledgement to parent
+      if (status == TEST_FAIL)
+        ERROR("Child %d failed on command [%s]:\n", this->childId, ChildCommandNames[command]);
+      write(childWriteFd, &status, sizeof(status));
+    }
+
+    // Close child ends of pipe
+    close(this->childReadFd);
+    close(this->childWriteFd);
+
+    exit(0);
+  }
+
+  ErrCode TestBedChild::InitComms()
+  {
+    if (this->verbose) INFO("Child %d begins InitComms()\n", this->childId);
+
+    // Read values sent by parent [see TestBed::InitComms()]
+    PIPE_READ(this->totalRanks);
+    PIPE_READ(this->rankOffset);
+    PIPE_READ(this->numCollectivesInGroup);
+
+    // Read the GPUs this child uses and prepare storage for collective args / datasets
+    int numGpus;
+    PIPE_READ(numGpus);
+    this->deviceIds.resize(numGpus);
+    this->streams.resize(numGpus);
+    this->collArgs.resize(numGpus);
+    for (int i = 0; i < numGpus; i++)
+    {
+      PIPE_READ(this->deviceIds[i]);
+      this->collArgs[i].clear();
+      this->collArgs[i].resize(numCollectivesInGroup);
+    }
+
+    // Collect uniqueId (specified by NCCL_COMM_ID env var)
+    ncclUniqueId id;
+    CHILD_NCCL_CALL(ncclGetUniqueId(&id), "ncclGetUniqueId");
+
+    // Initialize communicators
+    comms.clear();
+    comms.resize(numGpus);
+
+    // Initialize within a group call to avoid deadlock when using multiple ranks per child
+    ErrCode status = TEST_SUCCESS;
+    CHILD_NCCL_CALL(ncclGroupStart(), "ncclGroupStart");
+    for (int localRank = 0; localRank < numGpus; ++localRank)
+    {
+      int const globalRank = this->rankOffset + localRank;
+      int const currGpu = this->deviceIds[localRank];
+
+      if (hipSetDevice(currGpu) != hipSuccess)
+      {
+        ERROR("Rank %d on child %d unable to switch to GPU %d\n", globalRank, this->childId, currGpu);
+        status = TEST_FAIL;
+        break;
+      }
+
+      if (hipStreamCreate(&this->streams[localRank]) != hipSuccess)
+      {
+        ERROR("Rank %d on child %d unable to create stream for GPU %d\n", globalRank, this->childId, currGpu);
+        status = TEST_FAIL;
+        break;
+      }
+
+      if (ncclCommInitRank(&this->comms[localRank], this->totalRanks, id, globalRank) != ncclSuccess)
+      {
+        ERROR("Rank %d on child %d unable to call ncclCommInitRank\n", globalRank, this->childId);
+        status = TEST_FAIL;
+        break;
+      }
+    }
+    if (status == TEST_SUCCESS)
+    {
+      CHILD_NCCL_CALL(ncclGroupEnd(), "ncclGroupStart");
+    }
+    if (this->verbose) INFO("Child %d finishes InitComms() [%s]\n",
+                            this->childId, status == TEST_SUCCESS ? "SUCCESS" : "FAIL");
+    return status;
+  }
+
+  ErrCode TestBedChild::SetCollectiveArgs()
+  {
+    if (this->verbose) INFO("Child %d begins SetCollectiveArgs()\n", this->childId);
+
+    // Read values sent by parent [see TestBed::SetCollectiveArgs()]
+    int             globalRank;
+    int             collId;
+    ncclFunc_t      funcType;
+    ncclDataType_t  dataType;
+    ncclRedOp_t     redOp;
+    int             root;
+    size_t          numInputElements;
+    size_t          numOutputElements;
+    ScalarTransport scalarTransport;
+    int             scalarMode;
+
+    PIPE_READ(globalRank);
+    PIPE_READ(collId);
+    PIPE_READ(funcType);
+    PIPE_READ(dataType);
+    PIPE_READ(redOp);
+    PIPE_READ(root);
+    PIPE_READ(numInputElements);
+    PIPE_READ(numOutputElements);
+    PIPE_READ(scalarMode);
+    PIPE_READ(scalarTransport);
+
+    for (int i = 0; i < this->totalRanks; i++)
+    {
+      PtrUnion scalarsPerRank;
+      scalarsPerRank.Attach(scalarTransport.ptr);
+    }
+
+    if (globalRank < this->rankOffset || (this->rankOffset + comms.size() <= globalRank))
+    {
+      ERROR("Child %d does not contain rank %d\n", this->childId, globalRank);
+      return TEST_FAIL;
+    }
+    int const localRank = globalRank - rankOffset;
+    CHECK_HIP(hipSetDevice(this->deviceIds[localRank]));
+
+    for (int collIdx = 0; collIdx < collArgs[localRank].size(); ++collIdx)
+    {
+      if (collId == -1 || collId == collIdx)
+      {
+        CollectiveArgs& collArg = this->collArgs[localRank][collIdx];
+        CHECK_CALL(collArg.SetArgs(globalRank, this->totalRanks,
+                                   this->deviceIds[localRank],
+                                   funcType, dataType, redOp, root,
+                                   numInputElements, numOutputElements,
+                                   scalarTransport, scalarMode));
+        if (this->verbose) INFO("Rank %d on child %d sets collective %d [%s]\n",
+                                globalRank, this->childId, collIdx,
+                                collArg.GetDescription().c_str());
+
+        // If pre-mult scalars are provided, then create a custom reduction operator
+        if (scalarMode >= 0)
+        {
+          CHILD_NCCL_CALL(ncclRedOpCreatePreMulSum(&collArg.redOp,
+                                                   collArg.localScalar.ptr,
+                                                   dataType,
+                                                   (ncclScalarResidence_t)scalarMode,
+                                                   this->comms[localRank]),
+                          "ncclRedOpCreatePreMulSum");
+          if (verbose) INFO("Child %d created custom redop %d for collective %d\n",
+                            this->childId, collArg.redOp, collIdx);
+        }
+      }
+    }
+    if (this->verbose) INFO("Child %d finishes SetCollectiveArgs()\n", this->childId);
+    return TEST_SUCCESS;
+  }
+
+  ErrCode TestBedChild::AllocateMem()
+  {
+    if (this->verbose) INFO("Child %d begins AllocateMem()\n", this->childId);
+
+    // Read values sent by parent [see TestBed::AllocateMem()]
+    int    globalRank;
+    int    collId;
+    bool   inPlace;
+    bool   useManagedMem;
+
+    PIPE_READ(globalRank);
+    PIPE_READ(collId);
+    PIPE_READ(inPlace);
+    PIPE_READ(useManagedMem);
+
+    if (globalRank < this->rankOffset || (this->rankOffset + comms.size() <= globalRank))
+    {
+      ERROR("Child %d does not contain rank %d\n", this->childId, globalRank);
+      return TEST_FAIL;
+    }
+    int const localRank = globalRank - rankOffset;
+    CHECK_HIP(hipSetDevice(this->deviceIds[localRank]));
+
+    for (int collIdx = 0; collIdx < collArgs[localRank].size(); ++collIdx)
+    {
+      if (collId == -1 || collId == collIdx)
+      {
+	CollectiveArgs& collArg = this->collArgs[localRank][collIdx];
+        CHECK_CALL(collArg.AllocateMem(inPlace, useManagedMem));
+        if (this->verbose) INFO("Rank %d on child %d allocates memory for collective %d on device %d (%s,%s) Input: %p Output %p\n",
+                                globalRank, this->childId, collIdx, this->deviceIds[localRank],
+                                inPlace ? "in-place" : "out-of-place",
+                                useManagedMem ? "managed" : "unmanaged",
+                                collArg.inputGpu.ptr,
+                                collArg.outputGpu.ptr);
+      }
+    }
+
+    if (this->verbose) INFO("Child %d finishes AllocateMem()\n", this->childId);
+    return TEST_SUCCESS;
+  }
+
+  // Fill input memory with pre-known patterned based on rank
+  ErrCode TestBedChild::PrepareData()
+  {
+    if (this->verbose) INFO("Child %d begins PrepareData()\n", this->childId);
+
+    // Read values sent by parent [see TestBed::PrepareData()]
+    int globalRank;
+    int collId;
+    CollFuncPtr prepDataFunc;
+
+    PIPE_READ(globalRank);
+    PIPE_READ(collId);
+    PIPE_READ(prepDataFunc);
+
+    if (globalRank < this->rankOffset || (this->rankOffset + comms.size() <= globalRank))
+    {
+      ERROR("Child %d does not contain rank %d\n", this->childId, globalRank);
+      return TEST_FAIL;
+    }
+
+    int const localRank = globalRank - rankOffset;
+    CHECK_HIP(hipSetDevice(this->deviceIds[localRank]));
+
+    for (int collIdx = 0; collIdx < collArgs[localRank].size(); ++collIdx)
+    {
+      if (collId == -1 || collId == collIdx)
+      {
+        if (this->verbose) INFO("Rank %d on child %d prepares data for collective %d\n",
+                                globalRank, this->childId, collIdx);
+        CHECK_CALL(this->collArgs[localRank][collIdx].PrepareData(prepDataFunc));
+      }
+    }
+    if (this->verbose) INFO("Child %d finishes PrepareData()\n", this->childId);
+    return TEST_SUCCESS;
+  }
+
+  ErrCode TestBedChild::ExecuteCollectives()
+  {
+    if (this->verbose) INFO("Child %d begins ExecuteCollectives()\n", this->childId);
+
+    // Start group call
+    CHILD_NCCL_CALL(ncclGroupStart(), "ncclGroupStart");
+
+    // Loop over all collectives to be executed in group call
+    for (int collId = 0; collId < this->numCollectivesInGroup; ++collId)
+    {
+      // Loop over all local ranks
+      for (int localRank = 0; localRank < this->deviceIds.size(); ++localRank)
+      {
+        CHECK_HIP(hipSetDevice(this->deviceIds[localRank]));
+
+        CollectiveArgs const& collArg = this->collArgs[localRank][collId];
+
+        if (this->printValues)
+        {
+          int const numInputElementsToPrint = (this->printValues < 0 ? collArg.numInputElements : this->printValues);
+          PtrUnion inputCpu;
+          size_t const numInputBytes = numInputElementsToPrint * DataTypeToBytes(collArg.dataType);
+          inputCpu.AllocateCpuMem(numInputBytes);
+          CHECK_HIP(hipMemcpy(inputCpu.ptr, collArg.inputGpu.ptr, numInputBytes, hipMemcpyDeviceToHost));
+          printf("[ DEBUG    ] Rank %02d Coll %d %-10s: %s\n", collArg.globalRank, collId, "Input",
+                 inputCpu.ToString(collArg.dataType, numInputElementsToPrint).c_str());
+          inputCpu.FreeCpuMem();
+
+          int const numOutputElementsToPrint = (this->printValues < 0 ? collArg.numOutputElements : this->printValues);
+          size_t const numOutputBytes = numOutputElementsToPrint * DataTypeToBytes(collArg.dataType);
+          CHECK_HIP(hipMemcpy(collArg.outputCpu.ptr, collArg.outputGpu.ptr, numOutputBytes, hipMemcpyDeviceToHost));
+          printf("[ DEBUG    ] Rank %02d Coll %d %-10s: %s\n", collArg.globalRank, collId, "Pre-Output",
+                 collArg.outputCpu.ToString(collArg.dataType, numOutputElementsToPrint).c_str());
+        }
+
+        switch (collArg.funcType)
+        {
+        case ncclCollBroadcast:
+          CHILD_NCCL_CALL(ncclBroadcast(collArg.inputGpu.ptr,
+                                        collArg.outputGpu.ptr,
+                                        collArg.numInputElements,
+                                        collArg.dataType,
+                                        collArg.root,
+                                        this->comms[localRank],
+                                        this->streams[localRank]),
+                          "ncclBroadcast");
+          break;
+        case ncclCollReduce:
+          CHILD_NCCL_CALL(ncclReduce(collArg.inputGpu.ptr,
+                                     collArg.outputGpu.ptr,
+                                     collArg.numInputElements,
+                                     collArg.dataType,
+                                     collArg.redOp,
+                                     collArg.root,
+                                     this->comms[localRank],
+                                     this->streams[localRank]),
+                          "ncclReduce");
+          break;
+        case ncclCollAllGather:
+          CHILD_NCCL_CALL(ncclAllGather(collArg.inputGpu.ptr,
+                                        collArg.outputGpu.ptr,
+                                        collArg.numInputElements,
+                                        collArg.dataType,
+                                        this->comms[localRank],
+                                        this->streams[localRank]),
+                          "ncclAllGather");
+          break;
+        case ncclCollReduceScatter:
+          CHILD_NCCL_CALL(ncclReduceScatter(collArg.inputGpu.ptr,
+                                            collArg.outputGpu.ptr,
+                                            collArg.numOutputElements,
+                                            collArg.dataType,
+                                            collArg.redOp,
+                                            this->comms[localRank],
+                                            this->streams[localRank]),
+                          "ncclReduceScatter");
+          break;
+        case ncclCollAllReduce:
+          CHILD_NCCL_CALL(ncclAllReduce(collArg.inputGpu.ptr,
+                                        collArg.outputGpu.ptr,
+                                        collArg.numInputElements,
+                                        collArg.dataType,
+                                        collArg.redOp,
+                                        this->comms[localRank],
+                                        this->streams[localRank]),
+                          "ncclAllReduce");
+          break;
+        case ncclCollGather:
+          CHILD_NCCL_CALL(ncclGather(collArg.inputGpu.ptr,
+                                     collArg.outputGpu.ptr,
+                                     collArg.numInputElements,
+                                     collArg.dataType,
+                                     collArg.root,
+                                     this->comms[localRank],
+                                     this->streams[localRank]),
+                          "ncclGather");
+          break;
+        case ncclCollScatter:
+          CHILD_NCCL_CALL(ncclScatter(collArg.inputGpu.ptr,
+                                      collArg.outputGpu.ptr,
+                                      collArg.numOutputElements,
+                                      collArg.dataType,
+                                      collArg.root,
+                                      this->comms[localRank],
+                                      this->streams[localRank]),
+                          "ncclScatter");
+          break;
+        case ncclCollAllToAll:
+          CHILD_NCCL_CALL(ncclAllToAll(collArg.inputGpu.ptr,
+                                       collArg.outputGpu.ptr,
+                                       collArg.numInputElements / collArg.totalRanks,
+                                       collArg.dataType,
+                                       this->comms[localRank],
+                                       this->streams[localRank]),
+                          "ncclAllToAll");
+          break;
+        case ncclCollSend:
+          CHILD_NCCL_CALL(ncclSend(collArg.inputGpu.ptr,
+                                   collArg.numInputElements,
+                                   collArg.dataType,
+                                   collArg.root,
+                                   this->comms[localRank],
+                                   this->streams[localRank]),
+                          "ncclSend");
+          break;
+        case ncclCollRecv:
+          CHILD_NCCL_CALL(ncclRecv(collArg.outputGpu.ptr,
+                                   collArg.numOutputElements,
+                                   collArg.dataType,
+                                   collArg.root,
+                                   this->comms[localRank],
+                                   this->streams[localRank]),
+                          "ncclRecv");
+          break;
+        default:
+          ERROR("Unknown func type %d\n", collArg.funcType);
+          return TEST_FAIL;
+        }
+      }
+    }
+
+    // End group call
+    CHILD_NCCL_CALL(ncclGroupEnd(), "ncclGroupEnd");
+
+    // Synchronize
+    if (this->verbose) INFO("Child %d submits group call.  Waiting for completion\n", this->childId);
+    for (int localRank = 0; localRank < this->streams.size(); ++localRank)
+    {
+      CHECK_HIP(hipStreamSynchronize(this->streams[localRank]));
+    }
+
+    if (this->printValues)
+    {
+      for (int collId = 0; collId < this->numCollectivesInGroup; ++collId)
+        for (int localRank = 0; localRank < this->deviceIds.size(); ++localRank)
+        {
+          CollectiveArgs const& collArg = this->collArgs[localRank][collId];
+
+          int numOutputElementsToPrint = (this->printValues < 0 ? collArg.numOutputElements : this->printValues);
+          size_t const numOutputBytes = numOutputElementsToPrint * DataTypeToBytes(collArg.dataType);
+          CHECK_HIP(hipMemcpy(collArg.outputCpu.ptr, collArg.outputGpu.ptr, numOutputBytes, hipMemcpyDeviceToHost));
+          printf("[ DEBUG    ] Rank %02d Coll %d %-10s: %s\n", collArg.globalRank, collId, "Output",
+                 collArg.outputCpu.ToString(collArg.dataType, numOutputElementsToPrint).c_str());
+
+          printf("[ DEBUG    ] Rank %02d Coll %d %-10s: %s\n", collArg.globalRank, collId, "Expected",
+                 collArg.expected.ToString(collArg.dataType, numOutputElementsToPrint).c_str());
+        }
+    }
+    if (this->verbose) INFO("Child %d finishes ExecuteCollectives()\n", this->childId);
+    return TEST_SUCCESS;
+  }
+
+  ErrCode TestBedChild::ValidateResults()
+  {
+    // Read values sent by parent [see TestBed::ValidateResults()]
+    int globalRank, collId;
+    PIPE_READ(globalRank);
+    PIPE_READ(collId);
+
+    if (this->verbose) INFO("Child %d begins ValidateResults()\n", this->childId);
+
+    if (globalRank < this->rankOffset || (this->rankOffset + comms.size() <= globalRank))
+    {
+      ERROR("Child %d does not contain rank %d\n", this->childId, globalRank);
+      return TEST_FAIL;
+    }
+    int const localRank = globalRank - rankOffset;
+    CHECK_HIP(hipSetDevice(this->deviceIds[localRank]));
+
+    ErrCode status = TEST_SUCCESS;
+    for (int collIdx = 0; collIdx < collArgs[localRank].size(); ++collIdx)
+    {
+      if (collId == -1 || collId == collIdx)
+      {
+        if (this->verbose) INFO("Rank %d on child %d validating collective %d results\n",
+                                globalRank, this->childId, collIdx);
+        if (this->collArgs[localRank][collIdx].ValidateResults() != TEST_SUCCESS)
+        {
+          ERROR("Rank %d Collective %d output does not match expected\n", globalRank, collIdx);
+          status = TEST_FAIL;
+        }
+      }
+    }
+    if (this->verbose) INFO("Child %d finishes ValidateResults() with status %s\n", this->childId,
+                            status == TEST_SUCCESS ? "SUCCESS" : "FAIL");
+    return status;
+  }
+
+  ErrCode TestBedChild::DeallocateMem()
+  {
+    if (this->verbose) INFO("Child %d begins DeallocateMem\n", this->childId);
+
+    // Read values sent by parent [see TestBed::DeallocateMem()]
+    int globalRank, collId;
+    PIPE_READ(globalRank);
+    PIPE_READ(collId);
+
+    if (globalRank < this->rankOffset || (this->rankOffset + comms.size() <= globalRank))
+    {
+      ERROR("Child %d does not contain rank %d\n", this->childId, globalRank);
+      return TEST_FAIL;
+    }
+    int const localRank = globalRank - rankOffset;
+    CHECK_HIP(hipSetDevice(this->deviceIds[localRank]));
+
+    for (int collIdx = 0; collIdx < collArgs[localRank].size(); ++collIdx)
+    {
+      CollectiveArgs& collArg = this->collArgs[localRank][collIdx];
+      if (collId == -1 || collId == collIdx)
+      {
+        if (this->verbose)
+        {
+          INFO("Child %d release memory for collective %d (Input: %p Output %p\n",
+               this->childId, collIdx, collArg.inputGpu.ptr, collArg.outputGpu.ptr);
+        }
+
+        CHECK_CALL(collArg.DeallocateMem());
+      }
+      if (collArg.scalarMode != -1)
+      {
+        CHILD_NCCL_CALL(ncclRedOpDestroy(collArg.redOp, this->comms[localRank]),
+                        "ncclRedOpDestroy");
+        if (verbose) INFO("Child %d destroys custom redop %d for collective %d\n",
+                          this->childId, collArg.redOp, collIdx);
+      }
+    }
+    if (this->verbose) INFO("Child %d finishes DeallocateMem\n", this->childId);
+    return TEST_SUCCESS;
+  }
+
+  ErrCode TestBedChild::DestroyComms()
+  {
+    if (this->verbose) INFO("Child %d begins DestroyComms\n", this->childId);
+
+    // Release comms
+    for (int i = 0; i < this->comms.size(); ++i)
+    {
+      CHILD_NCCL_CALL(ncclCommDestroy(this->comms[i]), "ncclCommDestroy");
+    }
+    for (int i = 0; i < this->streams.size(); ++i)
+    {
+      CHECK_HIP(hipStreamDestroy(this->streams[i]));
+    }
+    this->comms.clear();
+    this->streams.clear();
+    if (this->verbose) INFO("Child %d finishes DestroyComms\n", this->childId);
+    return TEST_SUCCESS;
+  }
+
+  ErrCode TestBedChild::Stop()
+  {
+    return TEST_SUCCESS;
+  }
+}
@@ -0,0 +1,106 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#pragma once
+
+#include <vector>
+#include <unistd.h>
+#include "CollectiveArgs.hpp"
+#include "rccl.h"
+
+#define MAX_RANKS 32
+namespace RcclUnitTesting
+{
+  class TestBedChild
+  {
+  public:
+    // These are commands that can be given to the child process
+    enum
+    {
+      CHILD_INIT_COMMS       = 0,  // InitComms()
+      CHILD_SET_COLL_ARGS    = 1,  // SetCollectiveArgs()
+      CHILD_ALLOCATE_MEM     = 2,  // AllocateMem()
+      CHILD_PREPARE_DATA     = 3,  // PrepareData()
+      CHILD_EXECUTE_COLL     = 4,  // ExecuteCollectives()
+      CHILD_VALIDATE_RESULTS = 5,  // ValidateResults()
+      CHILD_DEALLOCATE_MEM   = 6,  // DeallocateMem()
+      CHILD_DESTROY_COMMS    = 7,  // DestroyComms()
+      CHILD_STOP             = 8,  // Stop()
+      NUM_CHILD_COMMANDS     = 9
+    };
+
+    char const ChildCommandNames[NUM_CHILD_COMMANDS][20] =
+    {
+      "INIT_COMMS",
+      "SET_COLL_ARGS",
+      "ALLOCATE_MEM",
+      "PREPARE_DATA",
+      "EXECUTE_COLL",
+      "VALIDATE_RESULTS",
+      "DEALLOCATE_MEM",
+      "DESTROY_COMMS",
+      "STOP"
+    };
+
+    // These variables remain constant for life of TestBedChild
+    int   childId;
+    pid_t pid;
+    bool  verbose;
+    int   printValues;
+
+    // Pipes used to communicate between parent process
+    int parentWriteFd;
+    int parentReadFd;
+    int childWriteFd;
+    int childReadFd;
+
+    // These varibles may change based on commands issued by parent
+    int totalRanks;                                     // Total ranks
+    int rankOffset;                                     // Global rank offset for this child
+    int numCollectivesInGroup;                          // # of collectives to run per group call
+    std::vector<ncclComm_t> comms;                      // RCCL communicators for each rank
+    std::vector<int> deviceIds;                         // Device IDs for each rank
+    std::vector<hipStream_t> streams;                   // Streams for executing collectives
+    std::vector<std::vector<CollectiveArgs>> collArgs;  // Info for each collective for each rank
+
+    // Constructor
+    TestBedChild(int const childId, bool const verbose, int const printValues);
+
+    // Prepare parent/child communication pipes - to be executed by parent process
+    int InitPipes();
+
+    // Execution
+    void StartExecutionLoop();
+
+  protected:
+    // Initialize RCCL communicators
+    ErrCode InitComms();
+
+    // Set CollectiveArgs
+    ErrCode SetCollectiveArgs();
+
+    // Allocate memory (input (GPU) / output (GPU) / expected (CPU))
+    ErrCode AllocateMem();
+
+    // Prepare input and expected data
+    ErrCode PrepareData();
+
+    // Execute a group of collectives
+    ErrCode ExecuteCollectives();
+
+    // Validate that output matches expected
+    ErrCode ValidateResults();
+
+    // Release allocated memory
+    ErrCode DeallocateMem();
+
+    // Destroys RCCL communicators
+    ErrCode DestroyComms();
+
+    // Stops this child process
+    ErrCode Stop();
+  };
+}
@@ -0,0 +1,11 @@
+#include <gtest/gtest.h>
+#include "EnvVars.hpp"
+#include "TestBed.hpp"
+int main(int argc, char **argv)
+{
+  ::testing::InitGoogleTest(&argc, argv);
+  RcclUnitTesting::EnvVars::ShowConfig();
+  int retCode = RUN_ALL_TESTS();
+  printf("[ INFO     ] Total executed cases: %d\n", RcclUnitTesting::TestBed::NumTestsRun());
+  return retCode;
+}
@@ -1,117 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-#include "test_AllGather.hpp"
-
-namespace CorrectnessTests
-{
-    TEST_P(AllGatherCorrectnessTest, Correctness)
-    {
-        // Adjust numElements to be multiple of numDevices
-        numElements = (numElements/numDevices)*numDevices;
-        if (numDevices > numDevicesAvailable) return;
-        if (numElements % numDevices != 0) return;
-
-        // Prepare input / output / expected results
-        Dataset dataset;
-        dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllGather);
-        FillDatasetWithPattern(dataset);
-        ComputeExpectedResults(dataset);
-
-        size_t const byteCount = dataset.NumBytes() / dataset.numDevices;
-        size_t const sendCount = dataset.numElements / dataset.numDevices;
-
-        // Launch the reduction (1 thread per GPU)
-        ncclGroupStart();
-        for (int i = 0; i < numDevices; i++)
-        {
-            ncclAllGather((int8_t *)dataset.inputs[i] + (i * byteCount),
-                          dataset.outputs[i], sendCount,
-                          dataType, comms[i], streams[i]);
-        }
-        ncclGroupEnd();
-
-        // Wait for reduction to complete
-        Synchronize();
-
-        // Check results
-        ValidateResults(dataset);
-        dataset.Release();
-    }
-
-    TEST_P(AllGatherCorrectnessTest, Alignment)
-    {
-        if (numDevices > numDevicesAvailable) return;
-        if (numElements % numDevices != 0) return;
-
-        // Allocate dataset
-        Dataset dataset;
-        dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllGather);
-
-        // Loop over several offsets (so that device pointers are not aligned)
-        for (int firstElement = 1; firstElement <= 11; firstElement += 2)
-        {
-            if (firstElement < numElements)
-            {
-                // Select last element so that total number of elements is multiple of numDevices
-                int const lastElement = firstElement + ((numElements - firstElement) / numDevices) * numDevices - 1;
-                if (lastElement >= numElements) break;
-
-                Dataset subDataset;
-                dataset.ExtractSubDataset(firstElement, lastElement, subDataset);
-
-                // Compute reference results for sub-dataset
-                FillDatasetWithPattern(subDataset);
-                ComputeExpectedResults(subDataset);
-
-                size_t const byteCount = subDataset.NumBytes() / subDataset.numDevices;
-                size_t const sendCount = subDataset.numElements / subDataset.numDevices;
-
-                // Launch the reduction (1 thread per GPU)
-                ncclGroupStart();
-                for (int i = 0; i < numDevices; i++)
-                {
-                    ncclAllGather((int8_t *)subDataset.inputs[i] + (i * byteCount),
-                                  subDataset.outputs[i], sendCount,
-                                  dataType, comms[i], streams[i]);
-                }
-                ncclGroupEnd();
-
-                // Wait for reduction to complete
-                Synchronize();
-
-                // Check results
-                ValidateResults(subDataset);
-            }
-        }
-        dataset.Release();
-    }
-
-
-    INSTANTIATE_TEST_SUITE_P(AllGatherCorrectnessSweep,
-                            AllGatherCorrectnessTest,
-                            testing::Combine(
-                                // Reduction operator (not used)
-                                testing::Values(ncclSum),
-                                // Data types
-                                testing::Values(ncclInt8,
-                                                ncclUint8,
-                                                ncclInt32,
-                                                ncclUint32,
-                                                ncclInt64,
-                                                ncclUint64,
-                                                //ncclFloat16,
-                                                ncclFloat32,
-                                                ncclFloat64,
-                                                ncclBfloat16),
-                                // Number of elements
-                                testing::Values(2520, 3026520),
-                                // Number of devices
-                                testing::Range(2,(GTESTS_NUM_GPUS+1)),
-                                // In-place or not
-                                testing::Values(false, true),
-                                testing::Values("")),
-                            CorrectnessTest::PrintToStringParamName());
-} // namespace
@@ -1,34 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-#ifndef TEST_ALLGATHER_HPP
-#define TEST_ALLGATHER_HPP
-
-#include "CorrectnessTest.hpp"
-
-namespace CorrectnessTests
-{
-    class AllGatherCorrectnessTest : public CorrectnessTest
-    {
-    public:
-        static void ComputeExpectedResults(Dataset& dataset)
-        {
-            size_t const byteCount = dataset.NumBytes() / dataset.numDevices;
-
-            int8_t* result = (int8_t *)malloc(dataset.NumBytes());
-
-            for (int i = 0; i < dataset.numDevices; i++)
-                HIP_CALL(hipMemcpy(result + i * byteCount, (int8_t *)dataset.inputs[i] + (i * byteCount),
-                                   byteCount, hipMemcpyDeviceToHost));
-
-            for (int i = 0; i < dataset.numDevices; i++)
-                memcpy(dataset.expected[i], result, dataset.NumBytes());
-
-            free(result);
-        }
-    };
-}
-
-#endif
@@ -1,60 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-#include "test_AllGatherMultiProcess.hpp"
-
-namespace CorrectnessTests
-{
-    TEST_P(AllGatherMultiProcessCorrectnessTest, Correctness)
-    {
-        dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollAllGather);
-        std::vector<int> pids(numDevices);
-
-        int gpu = -1;
-        for (int i = 0; i < numDevices; i++)
-        {
-            gpu++;
-            int pid = fork();
-            if (pid == 0)
-            {
-                bool pass;
-                TestAllGather(gpu, *dataset, pass);
-                TerminateChildProcess(pass);
-            }
-            else
-            {
-                pids[gpu] = pid;
-            }
-        }
-
-        ValidateProcesses(pids);
-        dataset->ReleaseRootProcess();
-    }
-
-    INSTANTIATE_TEST_SUITE_P(AllGatherMultiProcessCorrectnessSweep,
-                            AllGatherMultiProcessCorrectnessTest,
-                            testing::Combine(
-                                // Reduction operator (not used)
-                                testing::Values(ncclSum),
-                                // Data types
-                                testing::Values(ncclInt8,
-                                                ncclUint8,
-                                                ncclInt32,
-                                                ncclUint32,
-                                                ncclInt64,
-                                                ncclUint64,
-                                                //ncclFloat16,
-                                                ncclFloat32,
-                                                ncclFloat64,
-                                                ncclBfloat16),
-                                // Number of elements
-                                testing::Values(3072, 3145728),
-                                // Number of devices
-                                testing::Values(2,3,4,8),
-                                // In-place or not
-                                testing::Values(false, true),
-                                testing::Values("")),
-                            CorrectnessTest::PrintToStringParamName());
-} // namespace
@@ -1,81 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-#ifndef TEST_ALLGATHER_MULTI_PROCESS_HPP
-#define TEST_ALLGATHER_MULTI_PROCESS_HPP
-
-#include "CorrectnessTest.hpp"
-
-namespace CorrectnessTests
-{
-    class AllGatherMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
-    {
-    public:
-        static void ComputeExpectedResults(Dataset& dataset, Barrier& barrier, int const numDevices, std::vector<int> const& ranks)
-        {
-            size_t const byteCount = dataset.NumBytes() / dataset.numDevices;
-
-            for (int i = 0; i < ranks.size(); i++)
-            {
-                int rank = ranks[i];
-                HIP_CALL(hipMemcpy(static_cast<char*>(dataset.expected[0]) + rank * byteCount, (int8_t *)dataset.inputs[rank] + (rank * byteCount),
-                                   byteCount, hipMemcpyDeviceToHost));
-            }
-            barrier.Wait();
-
-            // Rank 0 sends answer to other ranks
-            for (int i = 0; i < ranks.size(); i++)
-            {
-                int rank = ranks[i];
-                if (rank == 0)
-                {
-                    for (int i = 0; i < dataset.numDevices; i++)
-                    {
-                        if (i == rank) continue;
-                        memcpy(dataset.expected[i], dataset.expected[0], dataset.NumBytes());
-                    }
-                }
-            }
-        }
-
-        void TestAllGather(int rank, Dataset& dataset, bool& pass)
-        {
-            // Prepare input / output / expected results
-            SetUpPerProcess(rank, ncclCollAllGather, comms[rank], streams[rank], dataset);
-
-            if (numDevices > numDevicesAvailable || numElements % numDevices != 0)
-            {
-                pass = true;
-                return;
-            }
-
-            Barrier barrier(rank, numDevices, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID"))));
-
-            // Prepare input / output / expected results
-            FillDatasetWithPattern(dataset, rank);
-
-            ComputeExpectedResults(dataset, barrier, numDevices, std::vector<int>(1, rank));
-
-            size_t const byteCount = dataset.NumBytes() / numDevices;
-            size_t const sendCount = dataset.numElements / numDevices;
-
-            // Launch the reduction (1 process per GPU)
-            ncclAllGather((int8_t *)dataset.inputs[rank] + (rank * byteCount),
-                          dataset.outputs[rank], sendCount,
-                          dataType, comms[rank], streams[rank]);
-
-            // Wait for reduction to complete
-            HIP_CALL(hipStreamSynchronize(streams[rank]));
-
-            // Check results
-            pass = ValidateResults(dataset, rank);
-
-            TearDownPerProcess(comms[rank], streams[rank]);
-            dataset.Release(rank);
-        }
-    };
-}
-
-#endif
@@ -1,80 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "test_AllReduce.hpp"
-
-namespace CorrectnessTests
-{
-  TEST_P(AllReduceCorrectnessTest, Correctness)
-  {
-    if (numDevices > numDevicesAvailable) return;
-
-    // Prepare input / output / expected results
-    Dataset dataset;
-    dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllReduce);
-    FillDatasetWithPattern(dataset);
-    ComputeExpectedResults(dataset, op);
-
-    // Launch the reduction (1 thread per GPU)
-    ncclGroupStart();
-    for (int i = 0; i < numDevices; i++)
-    {
-      ncclAllReduce(dataset.inputs[i], dataset.outputs[i],
-                    numElements, dataType, op, comms[i], streams[i]);
-    }
-    ncclGroupEnd();
-
-    // Wait for reduction to complete
-    Synchronize();
-
-    // Check results
-    ValidateResults(dataset);
-
-    dataset.Release();
-  }
-#if defined(BUILD_ALLREDUCE_ONLY)
-  INSTANTIATE_TEST_SUITE_P(AllReduceCorrectnessSweep,
-                           AllReduceCorrectnessTest,
-                           testing::Combine(
-                             // Reduction operator
-                             testing::Values(ncclSum),
-                             // Data types
-                             testing::Values(ncclFloat32),
-                             // Number of elements
-                             testing::Values(1024, 1048576),
-                             // Number of devices
-                             testing::Range(2,(GTESTS_NUM_GPUS+1)),
-                             // In-place or not
-                             testing::Values(false, true),
-                             testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
-                           CorrectnessTest::PrintToStringParamName());
-#else
-  INSTANTIATE_TEST_SUITE_P(AllReduceCorrectnessSweep,
-                           AllReduceCorrectnessTest,
-                           testing::Combine(
-                             // Reduction operator
-                             testing::Values(ncclSum, ncclProd, ncclMax, ncclMin, ncclAvg),
-                             // Data types
-                             testing::Values(ncclInt8,
-                                             ncclUint8,
-                                             ncclInt32,
-                                             ncclUint32,
-                                             ncclInt64,
-                                             ncclUint64,
-                                             //ncclFloat16,
-                                             ncclFloat32,
-                                             ncclFloat64,
-                                             ncclBfloat16),
-                             // Number of elements
-                             testing::Values(1024, 1048576),
-                             // Number of devices
-                             testing::Range(2,(GTESTS_NUM_GPUS+1)),
-                             // In-place or not
-                             testing::Values(false, true),
-                             testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
-                           CorrectnessTest::PrintToStringParamName());
-#endif
-} // namespace
@@ -1,83 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-#ifndef TEST_ALLREDUCE_HPP
-#define TEST_ALLREDUCE_HPP
-
-#include "CorrectnessTest.hpp"
-
-namespace CorrectnessTests
-{
-    class AllReduceCorrectnessTest : public CorrectnessTest
-    {
-    public:
-        static void ComputeExpectedResults(Dataset& dataset, ncclRedOp_t const op)
-        {
-            // Copy all inputs to expected arrays temporarily to perform reduction on host
-            for (int i = 0; i < dataset.numDevices; i++)
-                HIP_CALL(hipMemcpy(dataset.expected[i], dataset.inputs[i],
-                                   dataset.NumBytes(), hipMemcpyDeviceToHost));
-
-            // Allocate temporary host array to accumulate results
-            int8_t*   resultI1 = (int8_t   *)malloc(dataset.NumBytes());
-            uint8_t*  resultU1 = (uint8_t  *)resultI1;
-            int32_t*  resultI4 = (int32_t  *)resultI1;
-            uint32_t* resultU4 = (uint32_t *)resultI1;
-            int64_t*  resultI8 = (int64_t  *)resultI1;
-            uint64_t* resultU8 = (uint64_t *)resultI1;
-            float*    resultF4 = (float    *)resultI1;
-            double*   resultF8 = (double   *)resultI1;
-            rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1;
-
-            // Initialize the result with the first device's array
-            memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
-            ncclRedOp_t red_op = ((op == ncclAvg) ? ncclSum : op);
-
-            // Perform reduction on the other device arrays
-            for (int i = 1; i < dataset.numDevices; i++)
-            {
-                int8_t*   arrayI1 = (int8_t   *)dataset.expected[i];
-                uint8_t*  arrayU1 = (uint8_t  *)arrayI1;
-                int32_t*  arrayI4 = (int32_t  *)arrayI1;
-                uint32_t* arrayU4 = (uint32_t *)arrayI1;
-                int64_t*  arrayI8 = (int64_t  *)arrayI1;
-                uint64_t* arrayU8 = (uint64_t *)arrayI1;
-                float*    arrayF4 = (float    *)arrayI1;
-                double*   arrayF8 = (double   *)arrayI1;
-                rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1;
-
-                for (int j = 0; j < dataset.numElements; j++)
-                {
-                    switch (dataset.dataType)
-                    {
-                    case ncclInt8:    resultI1[j] = ReduceOp(red_op, resultI1[j], arrayI1[j]); break;
-                    case ncclUint8:   resultU1[j] = ReduceOp(red_op, resultU1[j], arrayU1[j]); break;
-                    case ncclInt32:   resultI4[j] = ReduceOp(red_op, resultI4[j], arrayI4[j]); break;
-                    case ncclUint32:  resultU4[j] = ReduceOp(red_op, resultU4[j], arrayU4[j]); break;
-                    case ncclInt64:   resultI8[j] = ReduceOp(red_op, resultI8[j], arrayI8[j]); break;
-                    case ncclUint64:  resultU8[j] = ReduceOp(red_op, resultU8[j], arrayU8[j]); break;
-                    case ncclFloat32: resultF4[j] = ReduceOp(red_op, resultF4[j], arrayF4[j]); break;
-                    case ncclFloat64: resultF8[j] = ReduceOp(red_op, resultF8[j], arrayF8[j]); break;
-                    case ncclBfloat16: resultB2[j] = ReduceOp(red_op, resultB2[j], arrayB2[j]); break;
-                    default:
-                        fprintf(stderr, "[ERROR] Unsupported datatype\n");
-                        exit(0);
-                    }
-                }
-            }
-
-            if (op == ncclAvg)
-                Average(dataset, resultI1);
-
-            // Copy results into expected arrays
-            for (int i = 0; i < dataset.numDevices; i++)
-                memcpy(dataset.expected[i], resultI1, dataset.NumBytes());
-
-            free(resultI1);
-        }
-    };
-}
-
-#endif
@@ -1,138 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "test_AllReduceAbort.hpp"
-#include "../include/comm.h"
-
-#define NUM_ITER 8
-#define FAKE_OP_COUNT NUM_ITER+1
-
-namespace CorrectnessTests
-{
-    #define HIPCHECK(cmd)                                                          \
-    do {                                                                           \
-      hipError_t error = (cmd);                                                    \
-      if (error != hipSuccess) {                                                   \
-        std::cerr << "Encountered HIP error (" << error << ") at line "            \
-                  << __LINE__ << " in file " << __FILE__ << "\n";                  \
-        exit(-1);                                                                  \
-      }                                                                            \
-    } while (0)
-
-    #define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
-    #define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
-
-    TEST_P(AllReduceAbortTest, Correctness) {
-        if (numDevices > numDevicesAvailable) return;
-
-        // Prepare input / output / expected results
-        Dataset dataset;
-        dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllReduce);
-        FillDatasetWithPattern(dataset);
-
-        int gpu = 0; // GPU number to trigger abort
-        ncclComm_t comm = comms[gpu];
-
-        HIPCHECK(hipSetDevice(gpu));
-        hipStream_t stream;
-        HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
-        struct ncclChannel* channel = comm->channels;
-        uint64_t **p_dev_head = (uint64_t **)((uint8_t*)(channel->devPeers + channel->ring.next) + offsetof(struct ncclPeer, send[0].conn.head));
-        uint64_t *real_head, *fake_head, *fake_h;
-
-        // get original head
-        HIPCHECK(hipMemcpy(&real_head, p_dev_head, sizeof(uint64_t*), hipMemcpyDefault));
-        // allocate and install fakes
-        HIPCHECK(hipHostMalloc(&fake_head, sizeof(uint64_t*), hipHostMallocMapped));
-        HIPCHECK(hipMemcpy(p_dev_head, &fake_head, sizeof(uint64_t*), hipMemcpyDefault));
-        *fake_head = 0;
-        // read back fakes to confirm
-        HIPCHECK(hipMemcpy(&fake_h, p_dev_head, sizeof(uint64_t*), hipMemcpyDefault));
-        //std::cerr << "[          ] replaced gpu " << gpu << " real_opCount = " << real_opCount << " to fake_opCount = " << fake_o << std::endl;
-        //std::cerr << "[          ] replaced gpu " << gpu << " real_head = " << real_head << " to fake_head = " << fake_h << std::endl;
-
-        // Perform a number of iterations and introduce abort
-        for (int j = 0; j < NUM_ITER; j++) {
-            //std::cerr << "[          ] iter = " << j << std::endl;
-            // Start a group call
-            ncclGroupStart();
-            for (int i = 0; i < numDevices; i++) {
-                ncclAllReduce(dataset.inputs[i], dataset.outputs[i],
-                              numElements, dataType, op, comms[i], streams[i]);
-            }
-            // Signal end of group call
-            ncclGroupEnd();
-        }
-
-        // Wait for reduction to complete
-        auto start = std::chrono::high_resolution_clock::now();
-        hipError_t hipErr;
-        int remaining = numDevices;
-        int* done = (int*)malloc(sizeof(int)*numDevices);
-        memset(done, 0, sizeof(int)*numDevices);
-        bool timeout = false, abort_called = false;
-        while (remaining) {
-            int idle = 1;
-            for (int i=0; i<numDevices; i++) {
-                if (done[i]) continue;
-
-                hipErr = hipStreamQuery(streams[i]);
-                if (hipErr == hipSuccess) {
-                    done[i] = 1;
-                    remaining--;
-                    idle = 0;
-                    continue;
-                }
-
- #if NCCL_MAJOR >= 2
- #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0)
-                auto delta = std::chrono::high_resolution_clock::now() - start;
-                double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();
-                if (deltaSec > 10.0 && !timeout) {
-                    std::cerr << "[          ] timeout condition, calling ncclCommAbort ... " << std::endl;
-                    timeout = true;
-                }
-                ncclResult_t ncclAsyncErr;
-                ncclCommGetAsyncError(comms[i], &ncclAsyncErr);
-                if ((ncclAsyncErr != ncclSuccess || timeout) && !abort_called) {
-                    // An asynchronous error happened. Stop the operation and destroy
-                    // the communicator
-                    std::cerr << "[          ] ncclAsyncErr = " << ncclAsyncErr << std::endl;
-                    for (int i=0; i<numDevices; i++)
-                      ncclCommAbort(comms[i]);
-                    // Abort the perf test
-                    abort_called = true;
-                    break;
-                }
-#endif
-#endif
-            }
-            // We might want to let other threads (including NCCL threads) use the CPU.
-            if (idle) pthread_yield();
-        }
-
-        free(done);
-        HIPCHECK(hipHostFree(fake_head));
-        HIPCHECK(hipStreamDestroy(stream));
-        dataset.Release();
-    }
-
-    INSTANTIATE_TEST_SUITE_P(AllReduceAbortSweep,
-                            AllReduceAbortTest,
-                            testing::Combine(
-                                // Reduction operator
-                                testing::Values(ncclSum),
-                                // Data types
-                                testing::Values(ncclFloat32),
-                                // Number of elements
-                                testing::Values(1024, 1048576),
-                                // Number of devices
-                                testing::Values(2, 4),
-                                // In-place or not
-                                testing::Values(false),
-                                testing::Values("")),
-                            CorrectnessTest::PrintToStringParamName());
-} // namespace
@@ -1,20 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-#ifndef TEST_ALLREDUCE_HPP
-#define TEST_ALLREDUCE_HPP
-
-#include "CorrectnessTest.hpp"
-
-namespace CorrectnessTests
-{
-    class AllReduceAbortTest : public CorrectnessTest
-    {
-    protected:
-    public:
-    };
-}
-
-#endif
@@ -1,83 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "test_AllReduceGroup.hpp"
-
-namespace CorrectnessTests
-{
-  // This tests aggregated AllReduce calls within a group
-  TEST_P(AllReduceGroupCorrectnessTest, Correctness)
-  {
-    if (numDevices > numDevicesAvailable) return;
-
-    // Prepare input / output / expected results
-    Dataset dataset1, dataset2, dataset3;
-    dataset1.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllReduce);
-    dataset2.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllReduce);
-    dataset3.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllReduce);
-    FillDatasetWithPattern(dataset1);
-    FillDatasetWithPattern(dataset2);
-    FillDatasetWithPattern(dataset3);
-    ComputeExpectedResults(dataset1, op);
-    ComputeExpectedResults(dataset2, op);
-    ComputeExpectedResults(dataset3, op);
-
-    // Launch the reduction (1 thread per GPU)
-    ncclGroupStart();
-    for (int i = 0; i < numDevices; i++)
-    {
-      ncclAllReduce(dataset1.inputs[i], dataset1.outputs[i], numElements, dataType, op, comms[i], streams[i]);
-      ncclAllReduce(dataset2.inputs[i], dataset2.outputs[i], numElements, dataType, op, comms[i], streams[i]);
-      ncclAllReduce(dataset3.inputs[i], dataset3.outputs[i], numElements, dataType, op, comms[i], streams[i]);
-    }
-    ncclGroupEnd();
-
-    // Wait for reduction to complete
-    Synchronize();
-
-    // Check results
-    ValidateResults(dataset1);
-    ValidateResults(dataset2);
-    ValidateResults(dataset3);
-
-        dataset1.Release();
-        dataset2.Release();
-        dataset3.Release();
-  }
-#if defined(BUILD_ALLREDUCE_ONLY)
-  INSTANTIATE_TEST_SUITE_P(AllReduceGroupCorrectnessSweep,
-                           AllReduceGroupCorrectnessTest,
-                           testing::Combine(
-                             // Reduction operator
-                             testing::Values(ncclSum),
-                             // Data types
-                             testing::Values(ncclFloat32),
-                             // Number of elements
-                             testing::Values(1024, 1048576),
-                             // Number of devices
-                             testing::Range(2,(GTESTS_NUM_GPUS+1)),
-                             // In-place or not
-                             testing::Values(false, true),
-                             testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
-                           CorrectnessTest::PrintToStringParamName());
-#else
-  INSTANTIATE_TEST_SUITE_P(AllReduceGroupCorrectnessSweep,
-                           AllReduceGroupCorrectnessTest,
-                           testing::Combine(
-                             // Reduction operator
-                             testing::Values(ncclSum),
-                             // Data types
-                             testing::Values(ncclFloat32, ncclFloat64),
-                             // Number of elements
-                             testing::Values(1024, 1048576),
-                             // Number of devices
-                             testing::Range(2,(GTESTS_NUM_GPUS+1)),
-                             // In-place or not
-                             testing::Values(false, true),
-                             testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
-                           CorrectnessTest::PrintToStringParamName());
-#endif
-} // namespace
@@ -1,79 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-#ifndef TEST_ALLREDUCEGROUP_HPP
-#define TEST_ALLREDUCEGROUP_HPP
-
-#include "CorrectnessTest.hpp"
-
-namespace CorrectnessTests
-{
-    class AllReduceGroupCorrectnessTest : public CorrectnessTest
-    {
-    public:
-        static void ComputeExpectedResults(Dataset& dataset, ncclRedOp_t const op)
-        {
-            // Copy all inputs to expected arrays temporarily to perform reduction on host
-            for (int i = 0; i < dataset.numDevices; i++)
-                HIP_CALL(hipMemcpy(dataset.expected[i], dataset.inputs[i],
-                                   dataset.NumBytes(), hipMemcpyDeviceToHost));
-
-            // Allocate temporary host array to accumulate results
-            int8_t*   resultI1 = (int8_t   *)malloc(dataset.NumBytes());
-            uint8_t*  resultU1 = (uint8_t  *)resultI1;
-            int32_t*  resultI4 = (int32_t  *)resultI1;
-            uint32_t* resultU4 = (uint32_t *)resultI1;
-            int64_t*  resultI8 = (int64_t  *)resultI1;
-            uint64_t* resultU8 = (uint64_t *)resultI1;
-            float*    resultF4 = (float    *)resultI1;
-            double*   resultF8 = (double   *)resultI1;
-            rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1;
-
-            // Initialize the result with the first device's array
-            memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
-
-            // Perform reduction on the other device arrays
-            for (int i = 1; i < dataset.numDevices; i++)
-            {
-                int8_t*   arrayI1 = (int8_t   *)dataset.expected[i];
-                uint8_t*  arrayU1 = (uint8_t  *)arrayI1;
-                int32_t*  arrayI4 = (int32_t  *)arrayI1;
-                uint32_t* arrayU4 = (uint32_t *)arrayI1;
-                int64_t*  arrayI8 = (int64_t  *)arrayI1;
-                uint64_t* arrayU8 = (uint64_t *)arrayI1;
-                float*    arrayF4 = (float    *)arrayI1;
-                double*   arrayF8 = (double   *)arrayI1;
-                rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1;
-
-                for (int j = 0; j < dataset.numElements; j++)
-                {
-                    switch (dataset.dataType)
-                    {
-                    case ncclInt8:    resultI1[j] = ReduceOp(op, resultI1[j], arrayI1[j]); break;
-                    case ncclUint8:   resultU1[j] = ReduceOp(op, resultU1[j], arrayU1[j]); break;
-                    case ncclInt32:   resultI4[j] = ReduceOp(op, resultI4[j], arrayI4[j]); break;
-                    case ncclUint32:  resultU4[j] = ReduceOp(op, resultU4[j], arrayU4[j]); break;
-                    case ncclInt64:   resultI8[j] = ReduceOp(op, resultI8[j], arrayI8[j]); break;
-                    case ncclUint64:  resultU8[j] = ReduceOp(op, resultU8[j], arrayU8[j]); break;
-                    case ncclFloat32: resultF4[j] = ReduceOp(op, resultF4[j], arrayF4[j]); break;
-                    case ncclFloat64: resultF8[j] = ReduceOp(op, resultF8[j], arrayF8[j]); break;
-                    case ncclBfloat16: resultB2[j] = ReduceOp(op, resultB2[j], arrayB2[j]); break;
-                    default:
-                        fprintf(stderr, "[ERROR] Unsupported datatype\n");
-                        exit(0);
-                    }
-                }
-            }
-
-            // Copy results into expected arrays
-            for (int i = 0; i < dataset.numDevices; i++)
-                memcpy(dataset.expected[i], resultI1, dataset.NumBytes());
-
-            free(resultI1);
-        }
-    };
-}
-
-#endif
@@ -1,82 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-#include "test_AllReduceGroupMultiProcess.hpp"
-
-namespace CorrectnessTests
-{
-    TEST_P(AllReduceGroupMultiProcessCorrectnessTest, Correctness)
-    {
-        // Important: Make sure the order of ncclFunc_t's here match the order of ncclFunc_ts
-        // as they appear in TestGroupCalls()
-        std::vector<ncclFunc_t> ncclFuncs;
-        ncclFuncs.push_back(ncclCollAllReduce);
-        ncclFuncs.push_back(ncclCollAllReduce);
-        ncclFuncs.push_back(ncclCollAllReduce);
-
-        // Create multiple datasets for combined operation
-        std::vector<Dataset*> datasets(ncclFuncs.size());
-        for (int i = 0; i < datasets.size(); i++)
-        {
-            datasets[i] = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
-            datasets[i]->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclFuncs[i]);
-        }
-
-        int const numGpusPerProcess = 2;
-        int const numProcesses = numDevices / numGpusPerProcess;
-        std::vector<int> pids(numProcesses);
-        int process = -1;
-
-        for (int i = 0; i < numDevices; i+= numGpusPerProcess)
-        {
-            process++;
-            int pid = fork();
-            if (pid == 0)
-            {
-                int gpuIdx = i;
-                int maxIdx = gpuIdx + (numGpusPerProcess - 1) >= numDevices ? numDevices : gpuIdx + numGpusPerProcess;
-
-                std::vector<int> ranks;
-                for (; gpuIdx < maxIdx; gpuIdx++)
-                {
-                    ranks.push_back(gpuIdx);
-                }
-
-                bool pass;
-                TestGroupCalls(process, ranks, datasets, ncclFuncs, pass);
-                TerminateChildProcess(pass);
-            }
-            else
-            {
-                pids[process] = pid;
-            }
-        }
-
-        ValidateProcesses(pids);
-
-        for (int i = 0; i < datasets.size(); i++)
-        {
-            datasets[i]->ReleaseRootProcess();
-            munmap(datasets[i], sizeof(Dataset));
-        }
-    }
-
-    INSTANTIATE_TEST_SUITE_P(AllReduceGroupMultiProcessCorrectnessSweep,
-                            AllReduceGroupMultiProcessCorrectnessTest,
-                            testing::Combine(
-                                // Reduction operator (not used)
-                                testing::Values(ncclSum),
-                                // Data types
-                                testing::Values(ncclFloat32,
-                                                ncclFloat64),
-                                // Number of elements
-                                testing::Values(3072, 3145728),
-                                // Number of devices
-                                testing::Values(4,8),
-                                // In-place or not
-                                testing::Values(false, true),
-                                testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
-                            CorrectnessTest::PrintToStringParamName());
-} // namespace
@@ -1,105 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef TEST_ALLREDUCEGROUP_MULTI_PROCESS_HPP
-#define TEST_ALLREDUCEGROUP_MULTI_PROCESS_HPP
-
-#include "CorrectnessTest.hpp"
-#include "test_AllReduceMultiProcess.hpp"
-#include <string>
-
-namespace CorrectnessTests
-{
-    class AllReduceGroupMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
-    {
-    public:
-        void TestGroupCalls(int process, std::vector<int> const& ranks, std::vector<Dataset*>& datasets, std::vector<ncclFunc_t> const& funcs, bool& pass)
-        {
-            ncclGroupStart();
-            for (int i = 0; i < ranks.size(); i++)
-            {
-                SetUpPerProcess(ranks[i], funcs, comms[ranks[i]], streams[ranks[i]], datasets);
-                if (numDevices > numDevicesAvailable)
-                {
-                    break;
-                }
-            }
-            ncclGroupEnd();
-
-            if (numDevices > numDevicesAvailable)
-            {
-                pass = true;
-                return;
-            }
-
-            int numProcesses = numDevices / ranks.size();
-            Barrier barrier(process, numProcesses, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID"))));
-
-            for (int i = 0; i < ranks.size(); i++)
-            {
-                for (int j = 0; j < datasets.size(); j++)
-                {
-                    FillDatasetWithPattern(*datasets[j], ranks[i]);
-                }
-            }
-
-            int const root = 0;
-
-            for (int i = 0; i < 3; i++)
-            {
-              AllReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[i], barrier, op, ranks);
-            }
-            barrier.Wait();
-
-            size_t const byteCount = datasets[0]->NumBytes() / numDevices;
-            size_t const elemCount = numElements / numDevices;
-
-            ncclGroupStart();
-            // AllReduce
-            for (int i = 0; i < ranks.size(); i++)
-            {
-                int rank = ranks[i];
-                for (int j = 0; j < 3; j++)
-                {
-                  ncclAllReduce(datasets[j]->inputs[rank], datasets[j]->outputs[rank],
-                                numElements, dataType, op, comms[rank], streams[rank]);
-                }
-            }
-            // Signal end of group call
-            ncclGroupEnd();
-
-            for (int i = 0; i < ranks.size(); i++)
-            {
-                HIP_CALL(hipSetDevice(ranks[i]));
-                HIP_CALL(hipStreamSynchronize(streams[ranks[i]]));
-            }
-
-            for (int i = 0; i < funcs.size(); i++)
-            {
-                for (int j = 0; j < ranks.size(); j++)
-                {
-                    pass = ValidateResults(*datasets[i], ranks[j], root);
-                    if (!pass)
-                    {
-                        break;
-                    }
-                }
-                barrier.Wait();
-                for (int j = 0; j < ranks.size(); j++)
-                {
-                    datasets[i]->Release(ranks[j]);
-                }
-            }
-
-            for (int i = 0; i < ranks.size(); i++)
-            {
-                TearDownPerProcess(comms[ranks[i]], streams[ranks[i]]);
-            }
-        }
-    };
-}
-
-#endif
@@ -1,61 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "test_AllReduceMultiProcess.hpp"
-
-namespace CorrectnessTests
-{
-  TEST_P(AllReduceMultiProcessCorrectnessTest, Correctness)
-  {
-    dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollAllReduce);
-    std::vector<int> pids(numDevices);
-
-    int gpu = -1;
-    for (int i = 0; i < numDevices; i++)
-    {
-      gpu++;
-      int pid = fork();
-      if (pid == 0)
-      {
-        bool pass;
-        TestAllReduce(gpu, *dataset, pass);
-        TerminateChildProcess(pass);
-      }
-      else
-      {
-        pids[gpu] = pid;
-      }
-    }
-
-    ValidateProcesses(pids);
-    dataset->ReleaseRootProcess();
-  }
-
-  INSTANTIATE_TEST_SUITE_P(AllReduceMultiProcessCorrectnessSweep,
-                           AllReduceMultiProcessCorrectnessTest,
-                           testing::Combine(
-                             // Reduction operator
-                             testing::Values(ncclSum, ncclProd, ncclMax, ncclMin),
-                             // Data types
-                             testing::Values(ncclInt8,
-                                             ncclUint8,
-                                             ncclInt32,
-                                             ncclUint32,
-                                             ncclInt64,
-                                             ncclUint64,
-                                             //ncclFloat16,
-                                             ncclFloat32,
-                                             ncclFloat64,
-                                             ncclBfloat16),
-                             // Number of elements
-                             testing::Values(1024, 1048576),
-                             // Number of devices
-                             testing::Values(2,3,4,8),
-                             // In-place or not
-                             testing::Values(false, true),
-                             testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
-                           CorrectnessTest::PrintToStringParamName());
-} // namespace
@@ -1,117 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-#ifndef TEST_ALLREDUCE_MULTI_PROCESS_HPP
-#define TEST_ALLREDUCE_MULTI_PROCESS_HPP
-
-#include "CorrectnessTest.hpp"
-
-namespace CorrectnessTests
-{
-    class AllReduceMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
-    {
-    public:
-        static void ComputeExpectedResults(Dataset& dataset, Barrier& barrier, ncclRedOp_t const op, std::vector<int> const& ranks)
-        {
-            // Copy all inputs to expected arrays temporarily to perform reduction on host
-            for (int i = 0; i < ranks.size(); i++)
-            {
-                int rank = ranks[i];
-                HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.inputs[rank],
-                                   dataset.NumBytes(), hipMemcpyDeviceToHost));
-            }
-            barrier.Wait();
-
-            // Allocate temporary host array to accumulate results
-            int8_t*   resultI1 = (int8_t   *)malloc(dataset.NumBytes());
-            uint8_t*  resultU1 = (uint8_t  *)resultI1;
-            int32_t*  resultI4 = (int32_t  *)resultI1;
-            uint32_t* resultU4 = (uint32_t *)resultI1;
-            int64_t*  resultI8 = (int64_t  *)resultI1;
-            uint64_t* resultU8 = (uint64_t *)resultI1;
-            float*    resultF4 = (float    *)resultI1;
-            double*   resultF8 = (double   *)resultI1;
-            rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1;
-
-            // Initialize the result with the first device's array
-            memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
-            barrier.Wait();
-
-            // Perform reduction
-            for (int i = 1; i < dataset.numDevices; i++)
-            {
-                int8_t*   arrayI1 = (int8_t   *)dataset.expected[i];
-                uint8_t*  arrayU1 = (uint8_t  *)arrayI1;
-                int32_t*  arrayI4 = (int32_t  *)arrayI1;
-                uint32_t* arrayU4 = (uint32_t *)arrayI1;
-                int64_t*  arrayI8 = (int64_t  *)arrayI1;
-                uint64_t* arrayU8 = (uint64_t *)arrayI1;
-                float*    arrayF4 = (float    *)arrayI1;
-                double*   arrayF8 = (double   *)arrayI1;
-                rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1;
-
-                for (int j = 0; j < dataset.numElements; j++)
-                {
-                    switch (dataset.dataType)
-                    {
-                    case ncclInt8:    resultI1[j] = ReduceOp(op, resultI1[j], arrayI1[j]); break;
-                    case ncclUint8:   resultU1[j] = ReduceOp(op, resultU1[j], arrayU1[j]); break;
-                    case ncclInt32:   resultI4[j] = ReduceOp(op, resultI4[j], arrayI4[j]); break;
-                    case ncclUint32:  resultU4[j] = ReduceOp(op, resultU4[j], arrayU4[j]); break;
-                    case ncclInt64:   resultI8[j] = ReduceOp(op, resultI8[j], arrayI8[j]); break;
-                    case ncclUint64:  resultU8[j] = ReduceOp(op, resultU8[j], arrayU8[j]); break;
-                    case ncclFloat32: resultF4[j] = ReduceOp(op, resultF4[j], arrayF4[j]); break;
-                    case ncclFloat64: resultF8[j] = ReduceOp(op, resultF8[j], arrayF8[j]); break;
-                    case ncclBfloat16: resultB2[j] = ReduceOp(op, resultB2[j], arrayB2[j]); break;
-                    default:
-                        fprintf(stderr, "[ERROR] Unsupported datatype\n");
-                        exit(0);
-                    }
-                }
-            }
-            barrier.Wait();
-
-            // Copy results into expected array
-            for (int i = 0; i < ranks.size(); i++)
-            {
-                int rank = ranks[i];
-                memcpy(dataset.expected[rank], resultI1, dataset.NumBytes());
-            }
-            free(resultI1);
-        }
-
-        void TestAllReduce(int rank, Dataset& dataset, bool& pass)
-        {
-            SetUpPerProcess(rank, ncclCollAllReduce, comms[rank], streams[rank], dataset);
-
-            if (numDevices > numDevicesAvailable)
-            {
-                pass = true;
-                return;
-            }
-
-            Barrier barrier(rank, numDevices, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID"))));
-
-            // Prepare input / output / expected results
-            FillDatasetWithPattern(dataset, rank);
-            ComputeExpectedResults(dataset, barrier, op, std::vector<int>(1, rank));
-
-            // Launch the reduction
-            ncclAllReduce(dataset.inputs[rank], dataset.outputs[rank],
-                          numElements, dataType, op, comms[rank], streams[rank]);
-
-            // Wait for reduction to complete
-            HIP_CALL(hipStreamSynchronize(streams[rank]));
-
-            // Check results
-            pass = ValidateResults(dataset, rank);
-
-            TearDownPerProcess(comms[rank], streams[rank]);
-            dataset.Release(rank);
-        }
-    };
-}
-
-#endif
@@ -1,67 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "test_AllToAll.hpp"
-
-namespace CorrectnessTests
-{
-    TEST_P(AllToAllCorrectnessTest, Correctness)
-    {
-        if (numDevices > numDevicesAvailable) return;
-
-        // Allocate data
-        Dataset dataset;
-        dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllToAll);
-
-        // Prepare input / output / expected results
-        FillDatasetWithPattern(dataset);
-        ComputeExpectedResults(dataset);
-
-        // Launch the reduction (1 thread per GPU)
-        ncclGroupStart();
-        for (int i = 0; i < numDevices; i++)
-        {
-            ncclAllToAll(dataset.inputs[i],
-                          dataset.outputs[i],
-                          numElements, dataType,
-                          comms[i], streams[i]);
-        }
-        ncclGroupEnd();
-
-        // Wait for reduction to complete
-        Synchronize();
-
-        // Check results
-        ValidateResults(dataset);
-
-        dataset.Release();
-    }
-
-    INSTANTIATE_TEST_SUITE_P(AllToAllCorrectnessSweep,
-                            AllToAllCorrectnessTest,
-                            testing::Combine(
-                                // Reduction operator is not used
-                                testing::Values(ncclSum),
-                                // Data types
-                                testing::Values(ncclInt8,
-                                                ncclUint8,
-                                                ncclInt32,
-                                                ncclUint32,
-                                                ncclInt64,
-                                                ncclUint64,
-                                                //ncclFloat16,
-                                                ncclFloat32,
-                                                ncclFloat64,
-                                                ncclBfloat16),
-                                // Number of elements
-                                testing::Values(1024, 1048576),
-                                // Number of devices
-                                testing::Range(2,(GTESTS_NUM_GPUS+1)),
-                                // In-place or not
-                                testing::Values(false),
-                                testing::Values("")),
-                            CorrectnessTest::PrintToStringParamName());
-} // namespace
@@ -1,26 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-#ifndef TEST_ALLTOALL_HPP
-#define TEST_ALLTOALL_HPP
-
-#include "CorrectnessTest.hpp"
-
-namespace CorrectnessTests
-{
-    class AllToAllCorrectnessTest : public CorrectnessTest
-    {
-    public:
-        static void ComputeExpectedResults(Dataset& dataset)
-        {
-            for (int i = 0; i < dataset.numDevices; i++)
-                for (int j = 0; j < dataset.numDevices; j++)
-                    HIP_CALL(hipMemcpy((int8_t *)dataset.expected[i]+dataset.NumBytes()*j, (int8_t *)dataset.inputs[j]+dataset.NumBytes()*i,
-                                   dataset.NumBytes(), hipMemcpyDeviceToHost));
-        }
-    };
-}
-
-#endif
@@ -1,61 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "test_AllToAllMultiProcess.hpp"
-
-namespace CorrectnessTests
-{
-    TEST_P(AllToAllMultiProcessCorrectnessTest, Correctness)
-    {
-        dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollAllToAll);
-        std::vector<int> pids(numDevices);
-
-        int gpu = -1;
-        for (int i = 0; i < numDevices; i++)
-        {
-            gpu++;
-            int pid = fork();
-            if (pid == 0)
-            {
-                bool pass;
-                TestAllToAll(gpu, *dataset, pass);
-                TerminateChildProcess(pass);
-            }
-            else
-            {
-                pids[gpu] = pid;
-            }
-        }
-
-        ValidateProcesses(pids);
-        dataset->ReleaseRootProcess();
-    }
-
-    INSTANTIATE_TEST_SUITE_P(AllToAllMultiProcessCorrectnessSweep,
-                            AllToAllMultiProcessCorrectnessTest,
-                            testing::Combine(
-                                // Reduction operator is not used
-                                testing::Values(ncclSum),
-                                // Data types
-                                testing::Values(ncclInt8,
-                                                ncclUint8,
-                                                ncclInt32,
-                                                ncclUint32,
-                                                ncclInt64,
-                                                ncclUint64,
-                                                //ncclFloat16,
-                                                ncclFloat32,
-                                                ncclFloat64,
-                                                ncclBfloat16),
-                                // Number of elements
-                                testing::Values(1024, 1048576),
-                                // Number of devices
-                                testing::Values(2,3,4,8),
-                                // In-place or not
-                                testing::Values(false),
-                                testing::Values("")),
-                            CorrectnessTest::PrintToStringParamName());
-} // namespace
@@ -1,61 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-#ifndef TEST_ALLTOALL_MULTI_PROCESS_HPP
-#define TEST_ALLTOALL_MULTI_PROCESS_HPP
-
-#include "CorrectnessTest.hpp"
-
-namespace CorrectnessTests
-{
-    class AllToAllMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
-    {
-    public:
-        static void ComputeExpectedResults(Dataset& dataset, std::vector<int> const& ranks)
-        {
-            for (int i = 0; i < ranks.size(); i++)
-            {
-                int rank = ranks[i];
-                for (int j = 0; j < dataset.numDevices; j++)
-                {
-                    HIP_CALL(hipMemcpy((int8_t *)dataset.expected[j]+dataset.NumBytes()*rank, (int8_t *)dataset.inputs[rank]+dataset.NumBytes()*j,
-                                   dataset.NumBytes(), hipMemcpyDeviceToHost));
-                }
-            }
-        }
-
-        void TestAllToAll(int rank, Dataset& dataset, bool& pass)
-        {
-            SetUpPerProcess(rank, ncclCollAllToAll, comms[rank], streams[rank], dataset);
-
-            if (numDevices > numDevicesAvailable)
-            {
-                pass = true;
-                return;
-            }
-
-            // Prepare input / output / expected results
-            FillDatasetWithPattern(dataset, rank);
-            ComputeExpectedResults(dataset, std::vector<int>(1, rank));
-
-            // Launch the reduction
-            ncclAllToAll(dataset.inputs[rank],
-                         dataset.outputs[rank],
-                         numElements, dataType,
-                         comms[rank], streams[rank]);
-
-            // Wait for reduction to complete
-            HIP_CALL(hipStreamSynchronize(streams[rank]));
-
-            // Check results
-            pass = ValidateResults(dataset, rank);
-
-            TearDownPerProcess(comms[rank], streams[rank]);
-            dataset.Release(rank);
-        }
-    };
-}
-
-#endif
@@ -1,75 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "test_AllToAllv.hpp"
-
-namespace CorrectnessTests
-{
-    TEST_P(AllToAllvCorrectnessTest, Correctness)
-    {
-        if (numDevices > numDevicesAvailable) return;
-
-        // Allocate data
-        Dataset dataset;
-        dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllToAll);
-
-        // Prepare input / output / expected results
-        FillDatasetWithPattern(dataset);
-        ComputeExpectedResults(dataset);
-
-        size_t chunksize = numElements*2/numDevices;
-        #define MAX_ALLTOALLV_RANKS 16
-        static size_t sendcounts[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS], recvcounts[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS], sdispls[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS], rdispls[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS];
-        // Launch the reduction (1 thread per GPU)
-        ncclGroupStart();
-        for (int r = 0; r < numDevices; r++) {
-            size_t disp = 0;
-            for (int i = 0; i < numDevices; i++) {
-                size_t scount = ((i+r)%numDevices)*chunksize;
-                if (i+r == numDevices-1)
-                  scount += (numElements*numDevices-chunksize*(numDevices-1)*numDevices/2);
-                sendcounts[i+r*MAX_ALLTOALLV_RANKS] = recvcounts[i+r*MAX_ALLTOALLV_RANKS] = scount;
-                sdispls[i+r*MAX_ALLTOALLV_RANKS] = rdispls[i+r*MAX_ALLTOALLV_RANKS] = disp;
-                disp += scount;
-            }
-            ncclAllToAllv((char*)dataset.inputs[r], sendcounts+r*MAX_ALLTOALLV_RANKS, sdispls+r*MAX_ALLTOALLV_RANKS,
-              (char*)dataset.outputs[r], recvcounts+r*MAX_ALLTOALLV_RANKS, rdispls+r*MAX_ALLTOALLV_RANKS, dataType, comms[r], streams[r]);
-        }
-        ncclGroupEnd();
-        // Wait for reduction to complete
-        Synchronize();
-
-        // Check results
-        ValidateResults(dataset);
-
-        dataset.Release();
-    }
-
-    INSTANTIATE_TEST_SUITE_P(AllToAllvCorrectnessSweep,
-                            AllToAllvCorrectnessTest,
-                            testing::Combine(
-                                // Reduction operator is not used
-                                testing::Values(ncclSum),
-                                // Data types
-                                testing::Values(ncclInt8,
-                                                ncclUint8,
-                                                ncclInt32,
-                                                ncclUint32,
-                                                ncclInt64,
-                                                ncclUint64,
-                                                //ncclFloat16,
-                                                ncclFloat32,
-                                                ncclFloat64,
-                                                ncclBfloat16),
-                                // Number of elements
-                                testing::Values(2520, 3026520),
-                                // Number of devices
-                                testing::Range(2,(GTESTS_NUM_GPUS+1)),
-                                // In-place or not
-                                testing::Values(false),
-                                testing::Values("")),
-                            CorrectnessTest::PrintToStringParamName());
-} // namespace
@@ -1,44 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-#ifndef TEST_ALLTOALLV_HPP
-#define TEST_ALLTOALLV_HPP
-
-#include "CorrectnessTest.hpp"
-
-namespace CorrectnessTests
-{
-    class AllToAllvCorrectnessTest : public CorrectnessTest
-    {
-    public:
-        static void ComputeExpectedResults(Dataset& dataset)
-        {
-            for (int i = 0; i < dataset.numDevices; i++) {
-                size_t rdisp = 0;
-                size_t chunksize = dataset.numElements*2/dataset.numDevices;
-                for (int j = 0; j < dataset.numDevices; j++) {
-                    size_t scount = 0, rcount = ((j+i)%dataset.numDevices)*chunksize;
-                    if (j+i == dataset.numDevices-1)
-                        rcount += (dataset.numElements*dataset.numDevices-chunksize*(dataset.numDevices-1)*dataset.numDevices/2);
-                    size_t sdisp = 0;
-                    for (int k=0; k<dataset.numDevices; k++) {
-                        scount = ((k+j)%dataset.numDevices)*chunksize;
-                        if (k+j == dataset.numDevices-1)
-                          scount += (dataset.numElements*dataset.numDevices-chunksize*(dataset.numDevices-1)*dataset.numDevices/2);
-                        if (k == i)
-                          break;
-                        sdisp += scount;
-                    }
-                    HIP_CALL(hipMemcpy((int8_t *)dataset.expected[i]+rdisp*DataTypeToBytes(dataset.dataType),
-                        (int8_t *)dataset.inputs[j]+sdisp*DataTypeToBytes(dataset.dataType),
-                        rcount*DataTypeToBytes(dataset.dataType), hipMemcpyDeviceToHost));
-                    rdisp += rcount;
-                }
-            }
-        }
-    };
-}
-
-#endif
@@ -1,71 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "test_Broadcast.hpp"
-
-namespace CorrectnessTests
-{
-    TEST_P(BroadcastCorrectnessTest, Correctness)
-    {
-        if (numDevices > numDevicesAvailable) return;
-
-        // Allocate data
-        Dataset dataset;
-        dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollBroadcast);
-
-        // Test each possible root
-        for (int root = 0; root < numDevices; root++)
-        {
-            // Prepare input / output / expected results
-            FillDatasetWithPattern(dataset);
-            ComputeExpectedResults(dataset, root);
-
-            // Launch the reduction (1 thread per GPU)
-            ncclGroupStart();
-            for (int i = 0; i < numDevices; i++)
-            {
-                ncclBroadcast(dataset.inputs[i],
-                              dataset.outputs[i],
-                              numElements, dataType,
-                              root, comms[i], streams[i]);
-            }
-            ncclGroupEnd();
-
-            // Wait for reduction to complete
-            Synchronize();
-
-            // Check results
-            ValidateResults(dataset);
-        }
-
-        dataset.Release();
-    }
-
-    INSTANTIATE_TEST_SUITE_P(BroadcastCorrectnessSweep,
-                            BroadcastCorrectnessTest,
-                            testing::Combine(
-                                // Reduction operator is not used
-                                testing::Values(ncclSum),
-                                // Data types
-                                testing::Values(ncclInt8,
-                                                ncclUint8,
-                                                ncclInt32,
-                                                ncclUint32,
-                                                ncclInt64,
-                                                ncclUint64,
-                                                //ncclFloat16,
-                                                ncclFloat32,
-                                                ncclFloat64,
-                                                ncclBfloat16),
-                                // Number of elements
-                                testing::Values(1024, 1048576),
-                                // Number of devices
-                                testing::Range(2,(GTESTS_NUM_GPUS+1)),
-                                // In-place or not
-                                testing::Values(false, true),
-                                testing::Values("")),
-                            CorrectnessTest::PrintToStringParamName());
-} // namespace
@@ -1,25 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-#ifndef TEST_BROADCAST_HPP
-#define TEST_BROADCAST_HPP
-
-#include "CorrectnessTest.hpp"
-
-namespace CorrectnessTests
-{
-    class BroadcastCorrectnessTest : public CorrectnessTest
-    {
-    public:
-        static void ComputeExpectedResults(Dataset& dataset, int const root)
-        {
-            for (int i = 0; i < dataset.numDevices; i++)
-                HIP_CALL(hipMemcpy(dataset.expected[i], dataset.inputs[root],
-                                   dataset.NumBytes(), hipMemcpyDeviceToHost));
-        }
-    };
-}
-
-#endif
@@ -1,140 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "test_BroadcastAbort.hpp"
-#include "../include/comm.h"
-
-#define NUM_ITER 8
-#define FAKE_OP_COUNT NUM_ITER+1
-
-namespace CorrectnessTests
-{
-    #define HIPCHECK(cmd)                                                          \
-    do {                                                                           \
-      hipError_t error = (cmd);                                                    \
-      if (error != hipSuccess) {                                                   \
-        std::cerr << "Encountered HIP error (" << error << ") at line "            \
-                  << __LINE__ << " in file " << __FILE__ << "\n";                  \
-        exit(-1);                                                                  \
-      }                                                                            \
-    } while (0)
-
-    #define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
-    #define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
-
-    TEST_P(BroadcastAbortTest, Correctness) {
-        if (numDevices > numDevicesAvailable) return;
-
-        // Prepare input / output / expected results
-        Dataset dataset;
-        dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollBroadcast);
-        FillDatasetWithPattern(dataset);
-
-        int root = 0;
-        int gpu = 0; // GPU number to trigger abort
-        ncclComm_t comm = comms[gpu];
-
-        HIPCHECK(hipSetDevice(gpu));
-        hipStream_t stream;
-        HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
-        struct ncclChannel* channel = comm->channels;
-        uint64_t **p_dev_head = (uint64_t **)((uint8_t*)(channel->devPeers + channel->ring.next) + offsetof(struct ncclPeer, send[0].conn.head));
-        uint64_t *real_head, *fake_head, *fake_h;
-
-        // get original head
-        HIPCHECK(hipMemcpy(&real_head, p_dev_head, sizeof(uint64_t*), hipMemcpyDefault));
-        // allocate and install fakes
-        HIPCHECK(hipHostMalloc(&fake_head, sizeof(uint64_t*), hipHostMallocMapped));
-        HIPCHECK(hipMemcpy(p_dev_head, &fake_head, sizeof(uint64_t*), hipMemcpyDefault));
-        *fake_head = 0;
-        // read back fakes to confirm
-        HIPCHECK(hipMemcpy(&fake_h, p_dev_head, sizeof(uint64_t*), hipMemcpyDefault));
-        //std::cerr << "[          ] replaced gpu " << gpu << " real_head = " << real_head << " to fake_head = " << fake_h << std::endl;
-
-        // Perform a number of iterations and introduce abort
-        for (int j = 0; j < NUM_ITER; j++) {
-            //std::cerr << "[          ] iter = " << j << std::endl;
-            // Start a group call
-            ncclGroupStart();
-            for (int i = 0; i < numDevices; i++) {
-                ncclBroadcast(dataset.inputs[i],
-                              dataset.outputs[i],
-                              numElements, dataType,
-                              root, comms[i], streams[i]);
-            }
-            // Signal end of group call
-            ncclGroupEnd();
-        }
-
-        // Wait for reduction to complete
-        auto start = std::chrono::high_resolution_clock::now();
-        hipError_t hipErr;
-        int remaining = numDevices;
-        int* done = (int*)malloc(sizeof(int)*numDevices);
-        memset(done, 0, sizeof(int)*numDevices);
-        bool timeout = false, abort_called = false;
-        while (remaining) {
-            int idle = 1;
-            for (int i=0; i<numDevices; i++) {
-                if (done[i]) continue;
-
-                hipErr = hipStreamQuery(streams[i]);
-                if (hipErr == hipSuccess) {
-                    done[i] = 1;
-                    remaining--;
-                    idle = 0;
-                    continue;
-                }
-
- #if NCCL_MAJOR >= 2
- #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0)
-                auto delta = std::chrono::high_resolution_clock::now() - start;
-                double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();
-                if (deltaSec > 10.0 && !timeout) {
-                    std::cerr << "[          ] timeout condition, calling ncclCommAbort ... " << std::endl;
-                    timeout = true;
-                }
-                ncclResult_t ncclAsyncErr;
-                ncclCommGetAsyncError(comms[i], &ncclAsyncErr);
-                if ((ncclAsyncErr != ncclSuccess || timeout) && !abort_called) {
-                    // An asynchronous error happened. Stop the operation and destroy
-                    // the communicator
-                    std::cerr << "[          ] ncclAsyncErr = " << ncclAsyncErr << std::endl;
-                    for (int i=0; i<numDevices; i++)
-                      ncclCommAbort(comms[i]);
-                    // Abort the perf test
-                    abort_called = true;
-                    break;
-                }
-#endif
-#endif
-            }
-            // We might want to let other threads (including NCCL threads) use the CPU.
-            if (idle) pthread_yield();
-        }
-
-        free(done);
-        HIPCHECK(hipHostFree(fake_head));
-        HIPCHECK(hipStreamDestroy(stream));
-        dataset.Release();
-    }
-
-    INSTANTIATE_TEST_SUITE_P(BroadcastAbortSweep,
-                            BroadcastAbortTest,
-                            testing::Combine(
-                                // Reduction operator
-                                testing::Values(ncclSum),
-                                // Data types
-                                testing::Values(ncclFloat32),
-                                // Number of elements
-                                testing::Values(1048576),
-                                // Number of devices
-                                testing::Values(2, 4),
-                                // In-place or not
-                                testing::Values(false),
-                                testing::Values("")),
-                            CorrectnessTest::PrintToStringParamName());
-} // namespace
@@ -1,20 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-#ifndef TEST_ALLREDUCE_HPP
-#define TEST_ALLREDUCE_HPP
-
-#include "CorrectnessTest.hpp"
-
-namespace CorrectnessTests
-{
-    class BroadcastAbortTest : public CorrectnessTest
-    {
-    protected:
-    public:
-    };
-}
-
-#endif
@@ -1,68 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "test_BroadcastMultiProcess.hpp"
-
-#include <stdio.h>
-#include <unistd.h>
-#include <sys/mman.h>
-#include <sys/types.h>
-#include <sys/wait.h>
-#include <iostream>
-
-namespace CorrectnessTests
-{
-    TEST_P(BroadcastMultiProcessCorrectnessTest, Correctness)
-    {
-        dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollBroadcast);
-        std::vector<int> pids(numDevices);
-
-        int gpu = -1;
-        for (int i = 0; i < numDevices; i++)
-        {
-            gpu++;
-            int pid = fork();
-            if (pid == 0)
-            {
-                bool pass;
-                TestBroadcast(gpu, *dataset, pass);
-                TerminateChildProcess(pass);
-            }
-            else
-            {
-                pids[gpu] = pid;
-            }
-        }
-
-        ValidateProcesses(pids);
-        dataset->ReleaseRootProcess();
-    }
-
-    INSTANTIATE_TEST_SUITE_P(BroadcastMultiProcessCorrectnessSweep,
-                            BroadcastMultiProcessCorrectnessTest,
-                            testing::Combine(
-                                // Reduction operator is not used
-                                testing::Values(ncclSum),
-                                // Data types
-                                testing::Values(ncclInt8,
-                                                ncclUint8,
-                                                ncclInt32,
-                                                ncclUint32,
-                                                ncclInt64,
-                                                ncclUint64,
-                                                //ncclFloat16,
-                                                ncclFloat32,
-                                                ncclFloat64,
-                                                ncclBfloat16),
-                                // Number of elements
-                                testing::Values(1024, 1048576),
-                                // Number of devices
-                                testing::Values(2,3,4,8),
-                                // In-place or not
-                                testing::Values(false, true),
-                                testing::Values("")),
-                            CorrectnessTest::PrintToStringParamName());
-} // namespace
@@ -1,77 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-#ifndef TEST_BROADCAST_MULTI_PROCESS_HPP
-#define TEST_BROADCAST_MULTI_PROCESS_HPP
-
-#include "CorrectnessTest.hpp"
-
-namespace CorrectnessTests
-{
-    class BroadcastMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
-    {
-    public:
-        static void ComputeExpectedResults(Dataset& dataset, int const root, std::vector<int> const& ranks)
-        {
-            for (int h = 0; h < ranks.size(); h++)
-            {
-                int rank = ranks[h];
-                // Root has the answer; share it via host memcpy's
-                if (rank == root)
-                {
-                    HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.inputs[rank],
-                                       dataset.NumBytes(), hipMemcpyDeviceToHost));
-                    for (int i = 0; i < dataset.numDevices; i++)
-                    {
-                        if (i == rank) continue;
-                        memcpy(dataset.expected[i], dataset.expected[root], dataset.NumBytes());
-                    }
-                    break;
-                }
-            }
-        }
-
-        void TestBroadcast(int rank, Dataset& dataset, bool& pass)
-        {
-            SetUpPerProcess(rank, ncclCollBroadcast, comms[rank], streams[rank], dataset);
-
-            if (numDevices > numDevicesAvailable)
-            {
-                pass = true;
-                return;
-            }
-
-            Barrier barrier(rank, numDevices, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID"))));
-
-            // Test each possible root
-            for (int root = 0; root < numDevices; root++)
-            {
-                // Prepare input / output / expected results
-                FillDatasetWithPattern(dataset, rank);
-                ComputeExpectedResults(dataset, root, std::vector<int>(1, rank));
-
-                // Launch the reduction (1 process per GPU)
-                ncclResult_t res = ncclBroadcast(dataset.inputs[rank],
-                              dataset.outputs[rank],
-                              numElements, dataType,
-                              root, comms[rank], streams[rank]);
-
-                // Wait for reduction to complete
-                HIP_CALL(hipStreamSynchronize(streams[rank]));
-
-                // Check results
-                pass = ValidateResults(dataset, rank);
-
-                // Ensure all processes have finished current iteration before proceeding
-                barrier.Wait();
-            }
-
-            TearDownPerProcess(comms[rank], streams[rank]);
-            dataset.Release(rank);
-        }
-    };
-}
-
-#endif
@@ -1,129 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-#include "test_CombinedCalls.hpp"
-
-#include "test_AllGather.hpp"
-#include "test_AllReduce.hpp"
-#include "test_Broadcast.hpp"
-#include "test_Reduce.hpp"
-#include "test_ReduceScatter.hpp"
-#include "test_Scatter.hpp"
-
-namespace CorrectnessTests
-{
-    TEST_P(CombinedCallsCorrectnessTest, Correctness)
-    {
-        if (numDevices > numDevicesAvailable) return;
-
-        // Create multiple datasets for combined operation
-        std::vector<Dataset> datasets(5);
-        std::vector<ncclFunc_t> ncclFuncs(5);
-        ncclFuncs.push_back(ncclCollAllGather);
-        ncclFuncs.push_back(ncclCollAllReduce);
-        ncclFuncs.push_back(ncclCollBroadcast);
-        ncclFuncs.push_back(ncclCollReduce);
-        ncclFuncs.push_back(ncclCollReduceScatter);
-
-        // Adjust numElements to be multiple of numDevices
-        numElements = (numElements/numDevices)*numDevices;
-        for (int i = 0; i < datasets.size(); i++)
-        {
-            datasets[i].Initialize(numDevices, numElements, dataType, inPlace, ncclFuncs[i]);
-            FillDatasetWithPattern(datasets[i]);
-        }
-
-        Dataset scatter_dataset;
-        scatter_dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollScatter);
-        FillDatasetWithPattern(scatter_dataset);
-
-        // Compute expected results for each dataset in combined
-        int const root = 0;
-        AllGatherCorrectnessTest::ComputeExpectedResults(datasets[0]);
-        AllReduceCorrectnessTest::ComputeExpectedResults(datasets[1], op);
-        BroadcastCorrectnessTest::ComputeExpectedResults(datasets[2], root);
-        ReduceCorrectnessTest::ComputeExpectedResults(datasets[3], op, root);
-        ReduceScatterCorrectnessTest::ComputeExpectedResults(datasets[4], op);
-        ScatterCorrectnessTest::ComputeExpectedResults(scatter_dataset, root);
-
-        size_t const byteCount = datasets[0].NumBytes() / numDevices;
-        size_t const elemCount = numElements / numDevices;
-
-        for (int j = 0; j < 10; j++)
-        {
-            ncclGroupStart();
-            for (int i = 0; i < numDevices; i++)
-            {
-                ncclScatter(scatter_dataset.inputs[i],
-                              scatter_dataset.outputs[i],
-                              numElements, dataType,
-                              root, comms[i], streams[i]);
-            }
-            ncclGroupEnd();
-            ncclGroupStart();
-            for (int i = 0; i < numDevices; i++)
-            {
-                ncclAllGather((int8_t *)datasets[0].inputs[i] + (i * byteCount),
-                              datasets[0].outputs[i], elemCount,
-                              dataType, comms[i], streams[i]);
-
-                ncclAllReduce(datasets[1].inputs[i], datasets[1].outputs[i],
-                              numElements, dataType, op, comms[i], streams[i]);
-
-                ncclBroadcast(datasets[2].inputs[i],
-                              datasets[2].outputs[i],
-                              numElements, dataType,
-                              root, comms[i], streams[i]);
-
-                ncclReduce(datasets[3].inputs[i],
-                           datasets[3].outputs[i],
-                           numElements, dataType, op,
-                           root, comms[i], streams[i]);
-
-                ncclReduceScatter(datasets[4].inputs[i],
-                                  (int8_t *)datasets[4].outputs[i] + (i * byteCount),
-                                  elemCount, dataType, op,
-                                  comms[i], streams[i]);
-            }
-            ncclGroupEnd();
-            // Wait for reduction to complete
-            Synchronize();
-            // Check results for each collective in the combined
-            for (int i = 0; i < 5; i++)
-                ValidateResults(datasets[i]);
-
-            ValidateResults(scatter_dataset);
-        }
-
-        for (int i = 0; i < 5; i++)
-            datasets[i].Release();
-        scatter_dataset.Release();
-    }
-
-    INSTANTIATE_TEST_SUITE_P(CombinedCallsCorrectnessSweep,
-                            CombinedCallsCorrectnessTest,
-                            testing::Combine(
-                                // Reduction operator (not used)
-                                testing::Values(ncclSum),
-                                // Data types
-                                testing::Values(ncclInt8,
-                                                ncclUint8,
-                                                ncclInt32,
-                                                ncclUint32,
-                                                ncclInt64,
-                                                ncclUint64,
-                                                //ncclFloat16,
-                                                ncclFloat32,
-                                                ncclFloat64,
-                                                ncclBfloat16),
-                                // Number of elements
-                                testing::Values(2520, 3026520),
-                                // Number of devices
-                                testing::Range(2,(GTESTS_NUM_GPUS+1)),
-                                // In-place or not
-                                testing::Values(false),
-                                testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1", "RCCL_P2P_NET_DISABLE=0", "RCCL_P2P_NET_DISABLE=1")),
-                            CorrectnessTest::PrintToStringParamName());
-} // namespace
@@ -1,17 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef TEST_COMBINEDCALLS_HPP
-#define TEST_COMBINEDCALLS_HPP
-
-#include "CorrectnessTest.hpp"
-
-namespace CorrectnessTests
-{
-    class CombinedCallsCorrectnessTest : public CorrectnessTest {};
-}
-
-#endif
@@ -1,81 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-#include "test_CombinedCallsMultiProcess.hpp"
-
-namespace CorrectnessTests
-{
-    TEST_P(CombinedCallsMultiProcessCorrectnessTest, Correctness)
-    {
-        // Important: Make sure the order of ncclFunc_t's here match the order of ncclFunc_ts
-        // as they appear in TestCombinedCalls()
-        std::vector<ncclFunc_t> ncclFuncs;
-        ncclFuncs.push_back(ncclCollAllGather);
-        ncclFuncs.push_back(ncclCollAllReduce);
-        ncclFuncs.push_back(ncclCollBroadcast);
-        ncclFuncs.push_back(ncclCollReduce);
-        ncclFuncs.push_back(ncclCollReduceScatter);
-
-        // Create multiple datasets for combined operation
-        std::vector<Dataset*> datasets(ncclFuncs.size());
-        for (int i = 0; i < datasets.size(); i++)
-        {
-            datasets[i] = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
-            datasets[i]->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclFuncs[i]);
-        }
-
-        std::vector<int> pids(numDevices);
-
-        int gpu = -1;
-        for (int i = 0; i < numDevices; i++)
-        {
-            gpu++;
-            int pid = fork();
-            if (pid == 0)
-            {
-                bool pass;
-                TestCombinedCalls(gpu, datasets, ncclFuncs, pass);
-                TerminateChildProcess(pass);
-            }
-            else
-            {
-                pids[gpu] = pid;
-            }
-        }
-
-        ValidateProcesses(pids);
-
-        for (int i = 0; i < datasets.size(); i++)
-        {
-            datasets[i]->ReleaseRootProcess();
-            munmap(datasets[i], sizeof(Dataset));
-        }
-    }
-
-    INSTANTIATE_TEST_SUITE_P(CombinedCallsMultiProcessCorrectnessSweep,
-                            CombinedCallsMultiProcessCorrectnessTest,
-                            testing::Combine(
-                                // Reduction operator (not used)
-                                testing::Values(ncclSum),
-                                // Data types
-                                testing::Values(ncclInt8,
-                                                ncclUint8,
-                                                ncclInt32,
-                                                ncclUint32,
-                                                ncclInt64,
-                                                ncclUint64,
-                                                //ncclFloat16,
-                                                ncclFloat32,
-                                                ncclFloat64,
-                                                ncclBfloat16),
-                                // Number of elements
-                                testing::Values(3072, 3145728),
-                                // Number of devices
-                                testing::Values(2,3,4,8),
-                                // In-place or not
-                                testing::Values(false, true),
-                                testing::Values("")),
-                            CorrectnessTest::PrintToStringParamName());
-} // namespace
@@ -1,97 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef TEST_COMBINEDCALLS_MULTI_PROCESS_HPP
-#define TEST_COMBINEDCALLS_MULTI_PROCESS_HPP
-
-#include "CorrectnessTest.hpp"
-
-#include "test_AllGatherMultiProcess.hpp"
-#include "test_AllReduceMultiProcess.hpp"
-#include "test_BroadcastMultiProcess.hpp"
-#include "test_ReduceMultiProcess.hpp"
-#include "test_ReduceScatterMultiProcess.hpp"
-
-namespace CorrectnessTests
-{
-    class CombinedCallsMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
-    {
-    public:
-        void TestCombinedCalls(int rank, std::vector<Dataset*>& datasets, std::vector<ncclFunc_t> const& funcs, bool& pass)
-        {
-            SetUpPerProcess(rank, funcs, comms[rank], streams[rank], datasets);
-
-            if (numDevices > numDevicesAvailable)
-            {
-                pass = true;
-                return;
-            }
-
-            Barrier barrier(rank, numDevices, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID"))));
-
-            // Compute expected results for each dataset in combined
-            int const root = 0;
-            std::vector<int> ranks(1, rank);
-            AllGatherMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[0], barrier, numDevices, ranks);
-            AllReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[1], barrier, op, ranks);
-            BroadcastMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[2], root, ranks);
-            ReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[3], barrier, op, root, ranks);
-            ReduceScatterMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[4], barrier, op, ranks);
-
-            size_t const byteCount = datasets[0]->NumBytes() / numDevices;
-            size_t const elemCount = numElements / numDevices;
-
-            ncclAllGather((int8_t *)datasets[0]->inputs[rank] + (rank * byteCount),
-                          datasets[0]->outputs[rank], elemCount,
-                          dataType, comms[rank], streams[rank]);
-
-            ncclAllReduce(datasets[1]->inputs[rank], datasets[1]->outputs[rank],
-                          numElements, dataType, op, comms[rank], streams[rank]);
-
-            ncclBroadcast(datasets[2]->inputs[rank],
-                          datasets[2]->outputs[rank],
-                          numElements, dataType,
-                          root, comms[rank], streams[rank]);
-
-            ncclReduce(datasets[3]->inputs[rank],
-                       datasets[3]->outputs[rank],
-                       numElements, dataType, op,
-                       root, comms[rank], streams[rank]);
-
-            ncclReduceScatter(datasets[4]->inputs[rank],
-                              (int8_t *)datasets[4]->outputs[rank] + (rank * byteCount),
-                              elemCount, dataType, op,
-                              comms[rank], streams[rank]);
-
-            // Wait for reduction to complete
-            HIP_CALL(hipStreamSynchronize(streams[rank]));
-
-            for (int i = 0; i < funcs.size(); i++)
-            {
-                for (int j = 0; j < ranks.size(); j++)
-                {
-                    pass = ValidateResults(*datasets[i], ranks[j], root);
-                    if (!pass)
-                    {
-                        break;
-                    }
-                }
-                barrier.Wait();
-                for (int j = 0; j < ranks.size(); j++)
-                {
-                    datasets[i]->Release(ranks[j]);
-                }
-            }
-
-            for (int i = 0; i < ranks.size(); i++)
-            {
-                TearDownPerProcess(comms[ranks[i]], streams[ranks[i]]);
-            }
-        }
-    };
-}
-
-#endif
@@ -1,71 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "test_Gather.hpp"
-
-namespace CorrectnessTests
-{
-    TEST_P(GatherCorrectnessTest, Correctness)
-    {
-        if (numDevices > numDevicesAvailable) return;
-
-        // Allocate data
-        Dataset dataset;
-        dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollGather);
-
-        // Test each possible root
-        for (int root = 0; root < numDevices; root++)
-        {
-            // Prepare input / output / expected results
-            FillDatasetWithPattern(dataset);
-            ComputeExpectedResults(dataset, root);
-
-            // Launch the reduction (1 thread per GPU)
-            ncclGroupStart();
-            for (int i = 0; i < numDevices; i++)
-            {
-                ncclGather(dataset.inputs[i],
-                              dataset.outputs[i],
-                              numElements, dataType,
-                              root, comms[i], streams[i]);
-            }
-            ncclGroupEnd();
-
-            // Wait for reduction to complete
-            Synchronize();
-
-            // Check results
-            ValidateResults(dataset, root);
-        }
-
-        dataset.Release();
-    }
-
-    INSTANTIATE_TEST_SUITE_P(GatherCorrectnessSweep,
-                            GatherCorrectnessTest,
-                            testing::Combine(
-                                // Reduction operator is not used
-                                testing::Values(ncclSum),
-                                // Data types
-                                testing::Values(ncclInt8,
-                                                ncclUint8,
-                                                ncclInt32,
-                                                ncclUint32,
-                                                ncclInt64,
-                                                ncclUint64,
-                                                //ncclFloat16,
-                                                ncclFloat32,
-                                                ncclFloat64,
-                                                ncclBfloat16),
-                                // Number of elements
-                                testing::Values(1024, 1048576),
-                                // Number of devices
-                                testing::Range(2,(GTESTS_NUM_GPUS+1)),
-                                // In-place or not
-                                testing::Values(false),
-                                testing::Values("")),
-                            CorrectnessTest::PrintToStringParamName());
-} // namespace
@@ -1,25 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-#ifndef TEST_GATHER_HPP
-#define TEST_GATHER_HPP
-
-#include "CorrectnessTest.hpp"
-
-namespace CorrectnessTests
-{
-    class GatherCorrectnessTest : public CorrectnessTest
-    {
-    public:
-        static void ComputeExpectedResults(Dataset& dataset, int const root)
-        {
-            for (int i = 0; i < dataset.numDevices; i++)
-                HIP_CALL(hipMemcpy((int8_t *)dataset.expected[root]+dataset.NumBytes()*i, dataset.inputs[i],
-                                   dataset.NumBytes(), hipMemcpyDeviceToHost));
-        }
-    };
-}
-
-#endif
@@ -1,61 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "test_GatherMultiProcess.hpp"
-
-namespace CorrectnessTests
-{
-    TEST_P(GatherMultiProcessCorrectnessTest, Correctness)
-    {
-        dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollGather);
-        std::vector<int> pids(numDevices);
-
-        int gpu = -1;
-        for (int i = 0; i < numDevices; i++)
-        {
-            gpu++;
-            int pid = fork();
-            if (pid == 0)
-            {
-                bool pass;
-                TestGather(gpu, *dataset, pass);
-                TerminateChildProcess(pass);
-            }
-            else
-            {
-                pids[gpu] = pid;
-            }
-        }
-
-        ValidateProcesses(pids);
-        dataset->ReleaseRootProcess();
-    }
-
-    INSTANTIATE_TEST_SUITE_P(GatherMultiProcessCorrectnessSweep,
-                            GatherMultiProcessCorrectnessTest,
-                            testing::Combine(
-                                // Reduction operator is not used
-                                testing::Values(ncclSum),
-                                // Data types
-                                testing::Values(ncclInt8,
-                                                ncclUint8,
-                                                ncclInt32,
-                                                ncclUint32,
-                                                ncclInt64,
-                                                ncclUint64,
-                                                //ncclFloat16,
-                                                ncclFloat32,
-                                                ncclFloat64,
-                                                ncclBfloat16),
-                                // Number of elements
-                                testing::Values(1024, 1048576),
-                                // Number of devices
-                                testing::Values(2,3,4,8),
-                                // In-place or not
-                                testing::Values(false),
-                                testing::Values("")),
-                            CorrectnessTest::PrintToStringParamName());
-} // namespace
@@ -1,63 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-#ifndef TEST_GATHER_MULTI_PROCESS_HPP
-#define TEST_GATHER_MULTI_PROCESS_HPP
-
-#include "CorrectnessTest.hpp"
-
-namespace CorrectnessTests
-{
-    class GatherMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
-    {
-    public:
-        static void ComputeExpectedResults(Dataset& dataset, int const root, int const rank)
-        {
-            HIP_CALL(hipMemcpy((int8_t *)dataset.expected[root]+dataset.NumBytes()*rank, dataset.inputs[rank],
-                               dataset.NumBytes(), hipMemcpyDeviceToHost));
-        }
-
-        void TestGather(int rank, Dataset& dataset, bool& pass)
-        {
-            SetUpPerProcess(rank, ncclCollGather, comms[rank], streams[rank], dataset);
-
-            if (numDevices > numDevicesAvailable)
-            {
-                pass = true;
-                return;
-            }
-
-            Barrier barrier(rank, numDevices, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID"))));
-
-            // Test each possible root
-            for (int root = 0; root < numDevices; root++)
-            {
-                // Prepare input / output / expected results
-                FillDatasetWithPattern(dataset, rank);
-                ComputeExpectedResults(dataset, root, rank);
-
-                // Launch the reduction (1 process per GPU)
-                ncclGather(dataset.inputs[rank],
-                           dataset.outputs[rank],
-                           numElements, dataType,
-                           root, comms[rank], streams[rank]);
-
-                // Wait for reduction to complete
-                HIP_CALL(hipStreamSynchronize(streams[rank]));
-
-                // Check results
-                pass = ValidateResults(dataset, rank, root);
-
-                // Ensure all processes have finished current iteration before proceeding
-                barrier.Wait();
-            }
-
-            TearDownPerProcess(comms[rank], streams[rank]);
-            dataset.Release(rank);
-        }
-    };
-}
-
-#endif
@@ -1,130 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-#include "test_GroupCalls.hpp"
-
-#include "test_AllGather.hpp"
-#include "test_AllReduce.hpp"
-#include "test_Broadcast.hpp"
-#include "test_Reduce.hpp"
-#include "test_ReduceScatter.hpp"
-
-namespace CorrectnessTests
-{
-    TEST_P(GroupCallsCorrectnessTest, Correctness)
-    {
-        if (numDevices > numDevicesAvailable) return;
-
-        // Create multiple datasets for group operation
-        std::vector<Dataset> datasets(5);
-        std::vector<ncclFunc_t> ncclFuncs(5);
-        ncclFuncs.push_back(ncclCollAllGather);
-        ncclFuncs.push_back(ncclCollAllReduce);
-        ncclFuncs.push_back(ncclCollBroadcast);
-        ncclFuncs.push_back(ncclCollReduce);
-        ncclFuncs.push_back(ncclCollReduceScatter);
-
-        // Adjust numElements to be multiple of numDevices
-        numElements = (numElements/numDevices)*numDevices;
-        for (int i = 0; i < datasets.size(); i++)
-        {
-            datasets[i].Initialize(numDevices, numElements, dataType, inPlace, ncclFuncs[i]);
-            FillDatasetWithPattern(datasets[i]);
-        }
-
-        // Compute expected results for each dataset in group
-        int const root = 0;
-        AllGatherCorrectnessTest::ComputeExpectedResults(datasets[0]);
-        AllReduceCorrectnessTest::ComputeExpectedResults(datasets[1], op);
-        BroadcastCorrectnessTest::ComputeExpectedResults(datasets[2], root);
-        ReduceCorrectnessTest::ComputeExpectedResults(datasets[3], op, root);
-        ReduceScatterCorrectnessTest::ComputeExpectedResults(datasets[4], op);
-
-        // Start a group call
-        ncclGroupStart();
-
-        // AllGather
-        size_t const byteCount = datasets[0].NumBytes() / numDevices;
-        size_t const elemCount = numElements / numDevices;
-        for (int i = 0; i < numDevices; i++)
-        {
-            ncclAllGather((int8_t *)datasets[0].inputs[i] + (i * byteCount),
-                          datasets[0].outputs[i], elemCount,
-                          dataType, comms[i], streams[i]);
-        }
-
-        // AllReduce
-        for (int i = 0; i < numDevices; i++)
-        {
-            ncclAllReduce(datasets[1].inputs[i], datasets[1].outputs[i],
-                          numElements, dataType, op, comms[i], streams[i]);
-        }
-
-        // Broadcast
-        for (int i = 0; i < numDevices; i++)
-        {
-            ncclBroadcast(datasets[2].inputs[i],
-                          datasets[2].outputs[i],
-                          numElements, dataType,
-                          root, comms[i], streams[i]);
-        }
-
-        // Reduce
-        for (int i = 0; i < numDevices; i++)
-        {
-            ncclReduce(datasets[3].inputs[i],
-                       datasets[3].outputs[i],
-                       numElements, dataType, op,
-                       root, comms[i], streams[i]);
-        }
-
-        // ReduceScatter
-        for (int i = 0; i < numDevices; i++)
-        {
-            ncclReduceScatter(datasets[4].inputs[i],
-                              (int8_t *)datasets[4].outputs[i] + (i * byteCount),
-                              elemCount, dataType, op,
-                              comms[i], streams[i]);
-        }
-
-        // Signal end of group call
-        ncclGroupEnd();
-
-        // Wait for reduction to complete
-        Synchronize();
-
-        // Check results for each collective in the group
-        for (int i = 0; i < 5; i++)
-        {
-            ValidateResults(datasets[i]);
-            datasets[i].Release();
-        }
-    }
-
-    INSTANTIATE_TEST_SUITE_P(GroupCallsCorrectnessSweep,
-                             GroupCallsCorrectnessTest,
-                             testing::Combine(
-                                // Reduction operator (not used)
-                                testing::Values(ncclSum),
-                                // Data types
-                                testing::Values(ncclInt8,
-                                                ncclUint8,
-                                                ncclInt32,
-                                                ncclUint32,
-                                                ncclInt64,
-                                                ncclUint64,
-                                                //ncclFloat16,
-                                                ncclFloat32,
-                                                ncclFloat64,
-                                                ncclBfloat16),
-                                // Number of elements
-                                testing::Values(2520, 3026520),
-                                // Number of devices
-                                testing::Range(2,(GTESTS_NUM_GPUS+1)),
-                                // In-place or not
-                                testing::Values(false, true),
-                                testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
-                             CorrectnessTest::PrintToStringParamName());
-} // namespace
@@ -1,17 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef TEST_GROUPCALLS_HPP
-#define TEST_GROUPCALLS_HPP
-
-#include "CorrectnessTest.hpp"
-
-namespace CorrectnessTests
-{
-    class GroupCallsCorrectnessTest : public CorrectnessTest {};
-}
-
-#endif
@@ -1,92 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-#include "test_GroupCallsMultiProcess.hpp"
-
-namespace CorrectnessTests
-{
-    TEST_P(GroupCallsMultiProcessCorrectnessTest, Correctness)
-    {
-        // Important: Make sure the order of ncclFunc_t's here match the order of ncclFunc_ts
-        // as they appear in TestGroupCalls()
-        std::vector<ncclFunc_t> ncclFuncs;
-        ncclFuncs.push_back(ncclCollAllGather);
-        ncclFuncs.push_back(ncclCollAllReduce);
-        ncclFuncs.push_back(ncclCollBroadcast);
-        ncclFuncs.push_back(ncclCollReduce);
-        ncclFuncs.push_back(ncclCollReduceScatter);
-
-        // Create multiple datasets for combined operation
-        std::vector<Dataset*> datasets(ncclFuncs.size());
-        for (int i = 0; i < datasets.size(); i++)
-        {
-            datasets[i] = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
-            datasets[i]->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclFuncs[i]);
-        }
-
-        int const numGpusPerProcess = 2;
-        int const numProcesses = numDevices / numGpusPerProcess;
-        std::vector<int> pids(numProcesses);
-        int process = -1;
-
-        for (int i = 0; i < numDevices; i+= numGpusPerProcess)
-        {
-            process++;
-            int pid = fork();
-            if (pid == 0)
-            {
-                int gpuIdx = i;
-                int maxIdx = gpuIdx + (numGpusPerProcess - 1) >= numDevices ? numDevices : gpuIdx + numGpusPerProcess;
-
-                std::vector<int> ranks;
-                for (; gpuIdx < maxIdx; gpuIdx++)
-                {
-                    ranks.push_back(gpuIdx);
-                }
-
-                bool pass;
-                TestGroupCalls(process, ranks, datasets, ncclFuncs, pass);
-                TerminateChildProcess(pass);
-            }
-            else
-            {
-                pids[process] = pid;
-            }
-        }
-
-        ValidateProcesses(pids);
-
-        for (int i = 0; i < datasets.size(); i++)
-        {
-            datasets[i]->ReleaseRootProcess();
-            munmap(datasets[i], sizeof(Dataset));
-        }
-    }
-
-    INSTANTIATE_TEST_SUITE_P(GroupCallsMultiProcessCorrectnessSweep,
-                            GroupCallsMultiProcessCorrectnessTest,
-                            testing::Combine(
-                                // Reduction operator (not used)
-                                testing::Values(ncclSum),
-                                // Data types
-                                testing::Values(ncclInt8,
-                                                ncclUint8,
-                                                ncclInt32,
-                                                ncclUint32,
-                                                ncclInt64,
-                                                ncclUint64,
-                                                //ncclFloat16,
-                                                ncclFloat32,
-                                                ncclFloat64,
-                                                ncclBfloat16),
-                                // Number of elements
-                                testing::Values(3072, 3145728),
-                                // Number of devices
-                                testing::Values(4,8),
-                                // In-place or not
-                                testing::Values(false, true),
-                                testing::Values("")),
-                            CorrectnessTest::PrintToStringParamName());
-} // namespace
@@ -1,148 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef TEST_GROUPCALLS_MULTI_PROCESS_HPP
-#define TEST_GROUPCALLS_MULTI_PROCESS_HPP
-
-#include "CorrectnessTest.hpp"
-#include "test_AllGatherMultiProcess.hpp"
-#include "test_AllReduceMultiProcess.hpp"
-#include "test_BroadcastMultiProcess.hpp"
-#include "test_ReduceMultiProcess.hpp"
-#include "test_ReduceScatterMultiProcess.hpp"
-
-#include <string>
-
-namespace CorrectnessTests
-{
-    class GroupCallsMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
-    {
-    public:
-        void TestGroupCalls(int process, std::vector<int> const& ranks, std::vector<Dataset*>& datasets, std::vector<ncclFunc_t> const& funcs, bool& pass)
-        {
-            ncclGroupStart();
-            for (int i = 0; i < ranks.size(); i++)
-            {
-                SetUpPerProcess(ranks[i], funcs, comms[ranks[i]], streams[ranks[i]], datasets);
-                if (numDevices > numDevicesAvailable)
-                {
-                    break;
-                }
-            }
-            ncclGroupEnd();
-
-            if (numDevices > numDevicesAvailable)
-            {
-                pass = true;
-                return;
-            }
-
-            int numProcesses = numDevices / ranks.size();
-            Barrier barrier(process, numProcesses, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID"))));
-
-            for (int i = 0; i < ranks.size(); i++)
-            {
-                for (int j = 0; j < datasets.size(); j++)
-                {
-                    FillDatasetWithPattern(*datasets[j], ranks[i]);
-                }
-            }
-
-            int const root = 0;
-
-            AllGatherMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[0], barrier, numDevices, ranks);
-            AllReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[1], barrier, op, ranks);
-            BroadcastMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[2], root, ranks);
-            ReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[3], barrier, op, root, ranks);
-            ReduceScatterMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[4], barrier, op, ranks);
-            barrier.Wait();
-
-            size_t const byteCount = datasets[0]->NumBytes() / numDevices;
-            size_t const elemCount = numElements / numDevices;
-
-            ncclGroupStart();
-            // AllGather
-            for (int i = 0; i < ranks.size(); i++)
-            {
-                int rank = ranks[i];
-                ncclAllGather((int8_t *)datasets[0]->inputs[rank] + (rank * byteCount),
-                              datasets[0]->outputs[rank], elemCount,
-                              dataType, comms[rank], streams[rank]);
-            }
-
-            // AllReduce
-            for (int i = 0; i < ranks.size(); i++)
-            {
-                int rank = ranks[i];
-                ncclAllReduce(datasets[1]->inputs[rank], datasets[1]->outputs[rank],
-                              numElements, dataType, op, comms[rank], streams[rank]);
-            }
-
-            // Broadcast
-            for (int i = 0; i < ranks.size(); i++)
-            {
-                int rank = ranks[i];
-                ncclBroadcast(datasets[2]->inputs[rank],
-                              datasets[2]->outputs[rank],
-                              numElements, dataType,
-                              root, comms[rank], streams[rank]);
-            }
-
-            // Reduce
-            for (int i = 0; i < ranks.size(); i++)
-            {
-                int rank = ranks[i];
-                ncclReduce(datasets[3]->inputs[rank],
-                           datasets[3]->outputs[rank],
-                           numElements, dataType, op,
-                           root, comms[rank], streams[rank]);
-            }
-
-            // ReduceScatter
-            for (int i = 0; i < ranks.size(); i++)
-            {
-                int rank = ranks[i];
-                ncclReduceScatter(datasets[4]->inputs[rank],
-                                  (int8_t *)datasets[4]->outputs[rank] + (rank * byteCount),
-                                  elemCount, dataType, op,
-                                  comms[rank], streams[rank]);
-            }
-
-            // Signal end of group call
-            ncclGroupEnd();
-
-            for (int i = 0; i < ranks.size(); i++)
-            {
-                HIP_CALL(hipSetDevice(ranks[i]));
-                HIP_CALL(hipStreamSynchronize(streams[ranks[i]]));
-            }
-
-            for (int i = 0; i < funcs.size(); i++)
-            {
-                for (int j = 0; j < ranks.size(); j++)
-                {
-                    pass = ValidateResults(*datasets[i], ranks[j], root);
-                    if (!pass)
-                    {
-                        break;
-                    }
-                }
-                barrier.Wait();
-                for (int j = 0; j < ranks.size(); j++)
-                {
-                    datasets[i]->Release(ranks[j]);
-                }
-            }
-
-            for (int i = 0; i < ranks.size(); i++)
-            {
-                TearDownPerProcess(comms[ranks[i]], streams[ranks[i]]);
-            }
-        }
-    };
-}
-
-#endif
@@ -1,71 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "test_Reduce.hpp"
-
-namespace CorrectnessTests
-{
-    TEST_P(ReduceCorrectnessTest, Correctness)
-    {
-        if (numDevices > numDevicesAvailable) return;
-
-        // Allocate data
-        Dataset dataset;
-        dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollReduce);
-
-        // Test each possible root
-        for (int root = 0; root < numDevices; root++)
-        {
-            // Prepare input / output / expected results
-            FillDatasetWithPattern(dataset);
-            ComputeExpectedResults(dataset, op, root);
-
-            // Launch the reduction (1 thread per GPU)
-            ncclGroupStart();
-            for (int i = 0; i < numDevices; i++)
-            {
-                ncclReduce(dataset.inputs[i],
-                           dataset.outputs[i],
-                           numElements, dataType, op,
-                           root, comms[i], streams[i]);
-            }
-            ncclGroupEnd();
-
-            // Wait for reduction to complete
-            Synchronize();
-
-            // Check results
-            ValidateResults(dataset);
-        }
-
-        dataset.Release();
-    }
-
-    INSTANTIATE_TEST_SUITE_P(ReduceCorrectnessSweep,
-                             ReduceCorrectnessTest,
-                             testing::Combine(
-                                // Reduction operator
-                                testing::Values(ncclSum, ncclProd, ncclMax, ncclMin, ncclAvg),
-                                // Data types
-                                testing::Values(ncclInt8,
-                                                ncclUint8,
-                                                ncclInt32,
-                                                ncclUint32,
-                                                ncclInt64,
-                                                ncclUint64,
-                                                //ncclFloat16,
-                                                ncclFloat32,
-                                                ncclFloat64,
-                                                ncclBfloat16),
-                                // Number of elements
-                                testing::Values(1024, 1048576),
-                                // Number of devices
-                                testing::Range(2,(GTESTS_NUM_GPUS+1)),
-                                // In-place or not
-                                testing::Values(false, true),
-                                testing::Values("")),
-                             CorrectnessTest::PrintToStringParamName());
-} // namespace
@@ -1,87 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-#ifndef TEST_REDUCE_HPP
-#define TEST_REDUCE_HPP
-
-#include "CorrectnessTest.hpp"
-
-namespace CorrectnessTests
-{
-    class ReduceCorrectnessTest : public CorrectnessTest
-    {
-    public:
-        static void ComputeExpectedResults(Dataset& dataset, ncclRedOp_t const op, int const root)
-        {
-            // Copy all inputs to expected arrays temporarily to perform reduction on host
-            for (int i = 0; i < dataset.numDevices; i++)
-                HIP_CALL(hipMemcpy(dataset.expected[i], dataset.inputs[i],
-                                   dataset.NumBytes(), hipMemcpyDeviceToHost));
-
-            // Allocate temporary host array to accumulate results
-            int8_t*   resultI1 = (int8_t   *)malloc(dataset.NumBytes());
-            uint8_t*  resultU1 = (uint8_t  *)resultI1;
-            int32_t*  resultI4 = (int32_t  *)resultI1;
-            uint32_t* resultU4 = (uint32_t *)resultI1;
-            int64_t*  resultI8 = (int64_t  *)resultI1;
-            uint64_t* resultU8 = (uint64_t *)resultI1;
-            float*    resultF4 = (float    *)resultI1;
-            double*   resultF8 = (double   *)resultI1;
-            rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1;
-
-            // Initialize the result with the first device's array
-            memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
-            ncclRedOp_t red_op = ((op == ncclAvg) ? ncclSum : op);
-
-            // Perform reduction on the other device arrays
-            for (int i = 1; i < dataset.numDevices; i++)
-            {
-                int8_t*   arrayI1 = (int8_t   *)dataset.expected[i];
-                uint8_t*  arrayU1 = (uint8_t  *)arrayI1;
-                int32_t*  arrayI4 = (int32_t  *)arrayI1;
-                uint32_t* arrayU4 = (uint32_t *)arrayI1;
-                int64_t*  arrayI8 = (int64_t  *)arrayI1;
-                uint64_t* arrayU8 = (uint64_t *)arrayI1;
-                float*    arrayF4 = (float    *)arrayI1;
-                double*   arrayF8 = (double   *)arrayI1;
-                rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1;
-
-                for (int j = 0; j < dataset.numElements; j++)
-                {
-                    switch (dataset.dataType)
-                    {
-                    case ncclInt8:    resultI1[j] = ReduceOp(red_op, resultI1[j], arrayI1[j]); break;
-                    case ncclUint8:   resultU1[j] = ReduceOp(red_op, resultU1[j], arrayU1[j]); break;
-                    case ncclInt32:   resultI4[j] = ReduceOp(red_op, resultI4[j], arrayI4[j]); break;
-                    case ncclUint32:  resultU4[j] = ReduceOp(red_op, resultU4[j], arrayU4[j]); break;
-                    case ncclInt64:   resultI8[j] = ReduceOp(red_op, resultI8[j], arrayI8[j]); break;
-                    case ncclUint64:  resultU8[j] = ReduceOp(red_op, resultU8[j], arrayU8[j]); break;
-                    case ncclFloat32: resultF4[j] = ReduceOp(red_op, resultF4[j], arrayF4[j]); break;
-                    case ncclFloat64: resultF8[j] = ReduceOp(red_op, resultF8[j], arrayF8[j]); break;
-                    case ncclBfloat16: resultB2[j] = ReduceOp(red_op, resultB2[j], arrayB2[j]); break;
-                    default:
-                        fprintf(stderr, "[ERROR] Unsupported datatype\n");
-                        exit(0);
-                    }
-                }
-            }
-
-            if (op == ncclAvg)
-                Average(dataset, resultI1);
-
-            // Copy results into expected arrays
-            for (int i = 0; i < dataset.numDevices; i++)
-            {
-                if (i == root)
-                    memcpy(dataset.expected[root], resultI1, dataset.NumBytes());
-                else
-                    HIP_CALL(hipMemcpy(dataset.expected[i], dataset.outputs[i], dataset.NumBytes(), hipMemcpyDeviceToHost));
-            }
-            free(resultI1);
-        }
-    };
-}
-
-#endif
@@ -1,61 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "test_ReduceMultiProcess.hpp"
-
-namespace CorrectnessTests
-{
-    TEST_P(ReduceMultiProcessCorrectnessTest, Correctness)
-    {
-        dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollReduce);
-        std::vector<int> pids(numDevices);
-
-        int gpu = -1;
-        for (int i = 0; i < numDevices; i++)
-        {
-            gpu++;
-            int pid = fork();
-            if (pid == 0)
-            {
-                bool pass;
-                TestReduce(gpu, *dataset, pass);
-                TerminateChildProcess(pass);
-            }
-            else
-            {
-                pids[gpu] = pid;
-            }
-        }
-
-        ValidateProcesses(pids);
-        dataset->ReleaseRootProcess();
-    }
-
-    INSTANTIATE_TEST_SUITE_P(ReduceMultiProcessCorrectnessSweep,
-                            ReduceMultiProcessCorrectnessTest,
-                            testing::Combine(
-                                // Reduction operator
-                                testing::Values(ncclSum, ncclProd, ncclMax, ncclMin),
-                                // Data types
-                                testing::Values(ncclInt8,
-                                                ncclUint8,
-                                                ncclInt32,
-                                                ncclUint32,
-                                                ncclInt64,
-                                                ncclUint64,
-                                                //ncclFloat16,
-                                                ncclFloat32,
-                                                ncclFloat64,
-                                                ncclBfloat16),
-                                // Number of elements
-                                testing::Values(1024, 1048576),
-                                // Number of devices
-                                testing::Values(2,3,4,8),
-                                // In-place or not
-                                testing::Values(false, true),
-                                testing::Values("")),
-                            CorrectnessTest::PrintToStringParamName());
-} // namespace
@@ -1,131 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-#ifndef TEST_REDUCE_MULTI_PROCESS_HPP
-#define TEST_REDUCE_MULTI_PROCESS_HPP
-
-#include "CorrectnessTest.hpp"
-
-namespace CorrectnessTests
-{
-    class ReduceMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
-    {
-    public:
-        static void ComputeExpectedResults(Dataset& dataset, Barrier& barrier, ncclRedOp_t const op, int const root, std::vector<int> const& ranks)
-        {
-            // Copy all inputs to expected arrays temporarily to perform reduction on host
-            for (int i = 0; i < ranks.size(); i++)
-            {
-                int rank = ranks[i];
-                HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.inputs[rank],
-                                   dataset.NumBytes(), hipMemcpyDeviceToHost));
-            }
-            barrier.Wait();
-
-            for (int h = 0; h < ranks.size(); h++)
-            {
-                int rank = ranks[h];
-                if (rank == root)
-                {
-                    // Allocate temporary host array to accumulate results
-                    int8_t*   resultI1 = (int8_t   *)malloc(dataset.NumBytes());
-                    uint8_t*  resultU1 = (uint8_t  *)resultI1;
-                    int32_t*  resultI4 = (int32_t  *)resultI1;
-                    uint32_t* resultU4 = (uint32_t *)resultI1;
-                    int64_t*  resultI8 = (int64_t  *)resultI1;
-                    uint64_t* resultU8 = (uint64_t *)resultI1;
-                    float*    resultF4 = (float    *)resultI1;
-                    double*   resultF8 = (double   *)resultI1;
-                    rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1;
-
-                    // Initialize the result with the first device's array
-                    memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
-
-                    // Perform reduction on the other device arrays
-                    for (int i = 1; i < dataset.numDevices; i++)
-                    {
-                        int8_t*   arrayI1 = (int8_t   *)dataset.expected[i];
-                        uint8_t*  arrayU1 = (uint8_t  *)arrayI1;
-                        int32_t*  arrayI4 = (int32_t  *)arrayI1;
-                        uint32_t* arrayU4 = (uint32_t *)arrayI1;
-                        int64_t*  arrayI8 = (int64_t  *)arrayI1;
-                        uint64_t* arrayU8 = (uint64_t *)arrayI1;
-                        float*    arrayF4 = (float    *)arrayI1;
-                        double*   arrayF8 = (double   *)arrayI1;
-                        rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1;
-
-                        for (int j = 0; j < dataset.numElements; j++)
-                        {
-                            switch (dataset.dataType)
-                            {
-                            case ncclInt8:    resultI1[j] = ReduceOp(op, resultI1[j], arrayI1[j]); break;
-                            case ncclUint8:   resultU1[j] = ReduceOp(op, resultU1[j], arrayU1[j]); break;
-                            case ncclInt32:   resultI4[j] = ReduceOp(op, resultI4[j], arrayI4[j]); break;
-                            case ncclUint32:  resultU4[j] = ReduceOp(op, resultU4[j], arrayU4[j]); break;
-                            case ncclInt64:   resultI8[j] = ReduceOp(op, resultI8[j], arrayI8[j]); break;
-                            case ncclUint64:  resultU8[j] = ReduceOp(op, resultU8[j], arrayU8[j]); break;
-                            case ncclFloat32: resultF4[j] = ReduceOp(op, resultF4[j], arrayF4[j]); break;
-                            case ncclFloat64: resultF8[j] = ReduceOp(op, resultF8[j], arrayF8[j]); break;
-                            case ncclBfloat16: resultB2[j] = ReduceOp(op, resultB2[j], arrayB2[j]); break;
-                            default:
-                                fprintf(stderr, "[ERROR] Unsupported datatype\n");
-                                exit(0);
-                            }
-                        }
-                    }
-                    memcpy(dataset.expected[root], resultI1, dataset.NumBytes());
-                    free(resultI1);
-                }
-            }
-            barrier.Wait();
-
-            for (int i = 0; i < ranks.size(); i++)
-            {
-                int rank = ranks[i];
-                if (rank != root)
-                {
-                    HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.outputs[rank], dataset.NumBytes(), hipMemcpyDeviceToHost));
-                }
-            }
-        }
-
-        void TestReduce(int rank, Dataset& dataset, bool& pass)
-        {
-            SetUpPerProcess(rank, ncclCollReduce, comms[rank], streams[rank], dataset);
-
-            if (numDevices > numDevicesAvailable)
-            {
-                pass = true;
-                return;
-            }
-
-            Barrier barrier(rank, numDevices, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID"))));
-
-            // Test each possible root
-            for (int root = 0; root < numDevices; root++)
-            {
-                // Prepare input / output / expected results
-                FillDatasetWithPattern(dataset, rank);
-                ComputeExpectedResults(dataset, barrier, op, root, std::vector<int>(1, rank));
-                // Launch the reduction (1 process per GPU)
-                ncclResult_t res = ncclReduce(dataset.inputs[rank],
-                                              dataset.outputs[rank],
-                                              numElements, dataType, op,
-                                              root, comms[rank], streams[rank]);
-                // Wait for reduction to complete
-                HIP_CALL(hipStreamSynchronize(streams[rank]));
-                // Check results
-                pass = ValidateResults(dataset, rank);
-                // Ensure all processes have finished current iteration before proceeding
-                barrier.Wait();
-            }
-
-            TearDownPerProcess(comms[rank], streams[rank]);
-            dataset.Release(rank);
-        }
-    };
-}
-
-#endif
@@ -1,71 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "test_ReduceScatter.hpp"
-
-namespace CorrectnessTests
-{
-    TEST_P(ReduceScatterCorrectnessTest, Correctness)
-    {
-         // Adjust numElements to be multiple of numDevices
-        numElements = (numElements/numDevices)*numDevices;
-        if (numDevices > numDevicesAvailable) return;
-        if (numElements % numDevices != 0) return;
-
-        // Prepare input / output / expected results
-        Dataset dataset;
-        dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollReduceScatter);
-        FillDatasetWithPattern(dataset);
-        ComputeExpectedResults(dataset, op);
-
-        size_t const byteCount = dataset.NumBytes() / dataset.numDevices;
-        size_t const recvCount = dataset.numElements / dataset.numDevices;
-
-        // Launch the reduction (1 thread per GPU)
-        ncclGroupStart();
-        for (int i = 0; i < numDevices; i++)
-        {
-            ncclReduceScatter(dataset.inputs[i],
-                              (int8_t *)dataset.outputs[i] + (i * byteCount),
-                              recvCount, dataType, op,
-                              comms[i], streams[i]);
-        }
-        ncclGroupEnd();
-
-        // Wait for reduction to complete
-        Synchronize();
-
-        // Check results
-        ValidateResults(dataset);
-
-        dataset.Release();
-    }
-
-    INSTANTIATE_TEST_SUITE_P(ReduceScatterCorrectnessSweep,
-                             ReduceScatterCorrectnessTest,
-                             testing::Combine(
-                                // Reduction operator
-                                testing::Values(ncclSum, ncclProd, ncclMax, ncclMin, ncclAvg),
-                                // Data types
-                                testing::Values(ncclInt8,
-                                                ncclUint8,
-                                                ncclInt32,
-                                                ncclUint32,
-                                                ncclInt64,
-                                                ncclUint64,
-                                                //ncclFloat16,
-                                                ncclFloat32,
-                                                ncclFloat64,
-                                                ncclBfloat16),
-                                // Number of elements
-                                testing::Values(2520, 3026520),
-                                // Number of devices
-                                testing::Range(2,(GTESTS_NUM_GPUS+1)),
-                                // In-place or not
-                                testing::Values(false, true),
-                                testing::Values("")),
-                             CorrectnessTest::PrintToStringParamName());
-} // namespace
@@ -1,90 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-#ifndef TEST_REDUCE_SCATTER_HPP
-#define TEST_REDUCE_SCATTER_HPP
-
-#include "CorrectnessTest.hpp"
-
-namespace CorrectnessTests
-{
-    class ReduceScatterCorrectnessTest : public CorrectnessTest
-    {
-    public:
-        static void ComputeExpectedResults(Dataset& dataset, ncclRedOp_t const op)
-        {
-            // Copy all inputs to expected arrays temporarily to perform reduction on host
-            for (int i = 0; i < dataset.numDevices; i++)
-                HIP_CALL(hipMemcpy(dataset.expected[i], dataset.inputs[i],
-                                   dataset.NumBytes(), hipMemcpyDeviceToHost));
-
-            // Allocate temporary host array to accumulate results
-            int8_t*   resultI1 = (int8_t   *)malloc(dataset.NumBytes());
-            uint8_t*  resultU1 = (uint8_t  *)resultI1;
-            int32_t*  resultI4 = (int32_t  *)resultI1;
-            uint32_t* resultU4 = (uint32_t *)resultI1;
-            int64_t*  resultI8 = (int64_t  *)resultI1;
-            uint64_t* resultU8 = (uint64_t *)resultI1;
-            float*    resultF4 = (float    *)resultI1;
-            double*   resultF8 = (double   *)resultI1;
-            rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1;
-
-            // Initialize the result with the first device's array
-            memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
-            ncclRedOp_t red_op = ((op == ncclAvg) ? ncclSum : op);
-
-            // Perform reduction on the other device arrays
-            for (int i = 1; i < dataset.numDevices; i++)
-            {
-                int8_t*   arrayI1 = (int8_t   *)dataset.expected[i];
-                uint8_t*  arrayU1 = (uint8_t  *)arrayI1;
-                int32_t*  arrayI4 = (int32_t  *)arrayI1;
-                uint32_t* arrayU4 = (uint32_t *)arrayI1;
-                int64_t*  arrayI8 = (int64_t  *)arrayI1;
-                uint64_t* arrayU8 = (uint64_t *)arrayI1;
-                float*    arrayF4 = (float    *)arrayI1;
-                double*   arrayF8 = (double   *)arrayI1;
-                rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1;
-
-                for (int j = 0; j < dataset.numElements; j++)
-                {
-                    switch (dataset.dataType)
-                    {
-                    case ncclInt8:    resultI1[j] = ReduceOp(red_op, resultI1[j], arrayI1[j]); break;
-                    case ncclUint8:   resultU1[j] = ReduceOp(red_op, resultU1[j], arrayU1[j]); break;
-                    case ncclInt32:   resultI4[j] = ReduceOp(red_op, resultI4[j], arrayI4[j]); break;
-                    case ncclUint32:  resultU4[j] = ReduceOp(red_op, resultU4[j], arrayU4[j]); break;
-                    case ncclInt64:   resultI8[j] = ReduceOp(red_op, resultI8[j], arrayI8[j]); break;
-                    case ncclUint64:  resultU8[j] = ReduceOp(red_op, resultU8[j], arrayU8[j]); break;
-                    case ncclFloat32: resultF4[j] = ReduceOp(red_op, resultF4[j], arrayF4[j]); break;
-                    case ncclFloat64: resultF8[j] = ReduceOp(red_op, resultF8[j], arrayF8[j]); break;
-                    case ncclBfloat16: resultB2[j] = ReduceOp(red_op, resultB2[j], arrayB2[j]); break;
-                    default:
-                        fprintf(stderr, "[ERROR] Unsupported datatype\n");
-                        exit(0);
-                    }
-                }
-            }
-
-            if (op == ncclAvg)
-                Average(dataset, resultI1);
-
-            // Copy results into expected arrays
-            size_t const byteCount = dataset.NumBytes() / dataset.numDevices;
-
-            for (int i = 0; i < dataset.numDevices; i++)
-                HIP_CALL(hipMemcpy(dataset.expected[i], dataset.outputs[i],
-                                   dataset.NumBytes(), hipMemcpyDeviceToHost));
-
-            for (int i = 0; i < dataset.numDevices; i++)
-                memcpy((int8_t *)dataset.expected[i] + (i * byteCount),
-                       resultI1 + (i * byteCount), byteCount);
-
-            free(resultI1);
-        }
-    };
-}
-
-#endif
@@ -1,61 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "test_ReduceScatterMultiProcess.hpp"
-
-namespace CorrectnessTests
-{
-    TEST_P(ReduceScatterMultiProcessCorrectnessTest, Correctness)
-    {
-        dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollReduceScatter);
-        std::vector<int> pids(numDevices);
-
-        int gpu = -1;
-        for (int i = 0; i < numDevices; i++)
-        {
-            gpu++;
-            int pid = fork();
-            if (pid == 0)
-            {
-                bool pass;
-                TestReduceScatter(gpu, *dataset, pass);
-                TerminateChildProcess(pass);
-            }
-            else
-            {
-                pids[gpu] = pid;
-            }
-        }
-
-        ValidateProcesses(pids);
-        dataset->ReleaseRootProcess();
-    }
-
-    INSTANTIATE_TEST_SUITE_P(ReduceScatterMultiProcessCorrectnessSweep,
-                            ReduceScatterMultiProcessCorrectnessTest,
-                            testing::Combine(
-                                // Reduction operator
-                                testing::Values(ncclSum, ncclProd, ncclMax, ncclMin),
-                                // Data types
-                                testing::Values(ncclInt8,
-                                                ncclUint8,
-                                                ncclInt32,
-                                                ncclUint32,
-                                                ncclInt64,
-                                                ncclUint64,
-                                                //ncclFloat16,
-                                                ncclFloat32,
-                                                ncclFloat64,
-                                                ncclBfloat16),
-                                // Number of elements
-                                testing::Values(3072, 3145728),
-                                // Number of devices
-                                testing::Values(2,3,4,8),
-                                // In-place or not
-                                testing::Values(false, true),
-                                testing::Values("")),
-                            CorrectnessTest::PrintToStringParamName());
-} // namespace
@@ -1,146 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-#ifndef TEST_REDUCE_SCATTER_MULTI_PROCESS_HPP
-#define TEST_REDUCE_SCATTER_MULTI_PROCESS_HPP
-
-#include "CorrectnessTest.hpp"
-
-namespace CorrectnessTests
-{
-    class ReduceScatterMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
-    {
-    public:
-        static void ComputeExpectedResults(Dataset& dataset, Barrier& barrier, ncclRedOp_t const op, std::vector<int> const& ranks)
-        {
-            // Copy all inputs to expected arrays temporarily to perform reduction on host
-            for (int i = 0; i < ranks.size(); i++)
-            {
-                int rank = ranks[i];
-                HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.inputs[rank],
-                                   dataset.NumBytes(), hipMemcpyDeviceToHost));
-            }
-            barrier.Wait();
-
-            // Have rank 0 do the expected calculation, then send results to other processes
-            int8_t* resultI1;
-            for (int h = 0; h < ranks.size(); h++)
-            {
-                int rank = ranks[h];
-                if (rank == 0)
-                {
-                    // Allocate temporary host array to accumulate results
-                    resultI1           = (int8_t   *)malloc(dataset.NumBytes());
-                    uint8_t*  resultU1 = (uint8_t  *)resultI1;
-                    int32_t*  resultI4 = (int32_t  *)resultI1;
-                    uint32_t* resultU4 = (uint32_t *)resultI1;
-                    int64_t*  resultI8 = (int64_t  *)resultI1;
-                    uint64_t* resultU8 = (uint64_t *)resultI1;
-                    float*    resultF4 = (float    *)resultI1;
-                    double*   resultF8 = (double   *)resultI1;
-                    rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1;
-
-                    // Initialize the result with the first device's array
-                    memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
-
-                    // Perform reduction on the other device arrays
-                    for (int i = 1; i < dataset.numDevices; i++)
-                    {
-                        int8_t*   arrayI1 = (int8_t   *)dataset.expected[i];
-                        uint8_t*  arrayU1 = (uint8_t  *)arrayI1;
-                        int32_t*  arrayI4 = (int32_t  *)arrayI1;
-                        uint32_t* arrayU4 = (uint32_t *)arrayI1;
-                        int64_t*  arrayI8 = (int64_t  *)arrayI1;
-                        uint64_t* arrayU8 = (uint64_t *)arrayI1;
-                        float*    arrayF4 = (float    *)arrayI1;
-                        double*   arrayF8 = (double   *)arrayI1;
-                        rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1;
-
-                        for (int j = 0; j < dataset.numElements; j++)
-                        {
-                            switch (dataset.dataType)
-                            {
-                            case ncclInt8:    resultI1[j] = ReduceOp(op, resultI1[j], arrayI1[j]); break;
-                            case ncclUint8:   resultU1[j] = ReduceOp(op, resultU1[j], arrayU1[j]); break;
-                            case ncclInt32:   resultI4[j] = ReduceOp(op, resultI4[j], arrayI4[j]); break;
-                            case ncclUint32:  resultU4[j] = ReduceOp(op, resultU4[j], arrayU4[j]); break;
-                            case ncclInt64:   resultI8[j] = ReduceOp(op, resultI8[j], arrayI8[j]); break;
-                            case ncclUint64:  resultU8[j] = ReduceOp(op, resultU8[j], arrayU8[j]); break;
-                            case ncclFloat32: resultF4[j] = ReduceOp(op, resultF4[j], arrayF4[j]); break;
-                            case ncclFloat64: resultF8[j] = ReduceOp(op, resultF8[j], arrayF8[j]); break;
-                            case ncclBfloat16: resultB2[j] = ReduceOp(op, resultB2[j], arrayB2[j]); break;
-                            default:
-                                fprintf(stderr, "[ERROR] Unsupported datatype\n");
-                                exit(0);
-                            }
-                        }
-                    }
-                }
-            }
-            barrier.Wait();
-            // Copy results into expected arrays
-            size_t const byteCount = dataset.NumBytes() / dataset.numDevices;
-
-            for (int i = 0; i < ranks.size(); i++)
-            {
-                int rank = ranks[i];
-                HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.outputs[rank],
-                                   dataset.NumBytes(), hipMemcpyDeviceToHost));
-            }
-            barrier.Wait();
-
-            for (int h = 0; h < ranks.size(); h++)
-            {
-                int rank = ranks[h];
-                if (rank == 0)
-                {
-                    for (int i = 0; i < dataset.numDevices; i++)
-                        memcpy((int8_t *)dataset.expected[i] + (i * byteCount),
-                               resultI1 + (i * byteCount), byteCount);
-
-                    free(resultI1);
-                }
-            }
-        }
-
-        void TestReduceScatter(int rank, Dataset& dataset, bool& pass)
-        {
-            // Prepare input / output / expected results
-            SetUpPerProcess(rank, ncclCollAllGather, comms[rank], streams[rank], dataset);
-
-            if (numDevices > numDevicesAvailable || numElements % numDevices != 0)
-            {
-                pass = true;
-                return;
-            }
-
-            Barrier barrier(rank, numDevices, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID"))));
-
-            // Prepare input / output / expected results
-            FillDatasetWithPattern(dataset, rank);
-            ComputeExpectedResults(dataset, barrier, op, std::vector<int>(1, rank));
-
-            size_t const byteCount = dataset.NumBytes() / numDevices;
-            size_t const recvCount = dataset.numElements / numDevices;
-
-            // Launch the reduction (1 process per GPU)
-            ncclReduceScatter(dataset.inputs[rank],
-                              (int8_t *)dataset.outputs[rank] + (rank * byteCount),
-                              recvCount, dataType, op,
-                              comms[rank], streams[rank]);
-
-            // Wait for reduction to complete
-            HIP_CALL(hipStreamSynchronize(streams[rank]));
-
-            // Check results
-            pass = ValidateResults(dataset, rank);
-
-            TearDownPerProcess(comms[rank], streams[rank]);
-            dataset.Release(rank);
-        }
-    };
-}
-
-#endif
@@ -1,71 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "test_Scatter.hpp"
-
-namespace CorrectnessTests
-{
-    TEST_P(ScatterCorrectnessTest, Correctness)
-    {
-        if (numDevices > numDevicesAvailable) return;
-
-        // Allocate data
-        Dataset dataset;
-        dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollScatter);
-
-        // Test each possible root
-        for (int root = 0; root < numDevices; root++)
-        {
-            // Prepare input / output / expected results
-            FillDatasetWithPattern(dataset);
-            ComputeExpectedResults(dataset, root);
-
-            // Launch the reduction (1 thread per GPU)
-            ncclGroupStart();
-            for (int i = 0; i < numDevices; i++)
-            {
-                ncclScatter(dataset.inputs[i],
-                              dataset.outputs[i],
-                              numElements, dataType,
-                              root, comms[i], streams[i]);
-            }
-            ncclGroupEnd();
-
-            // Wait for reduction to complete
-            Synchronize();
-
-            // Check results
-            ValidateResults(dataset);
-        }
-
-        dataset.Release();
-    }
-
-    INSTANTIATE_TEST_SUITE_P(ScatterCorrectnessSweep,
-                            ScatterCorrectnessTest,
-                            testing::Combine(
-                                // Reduction operator is not used
-                                testing::Values(ncclSum),
-                                // Data types
-                                testing::Values(ncclInt8,
-                                                ncclUint8,
-                                                ncclInt32,
-                                                ncclUint32,
-                                                ncclInt64,
-                                                ncclUint64,
-                                                //ncclFloat16,
-                                                ncclFloat32,
-                                                ncclFloat64,
-                                                ncclBfloat16),
-                                // Number of elements
-                                testing::Values(1024, 1048576),
-                                // Number of devices
-                                testing::Range(2,(GTESTS_NUM_GPUS+1)),
-                                // In-place or not
-                                testing::Values(false),
-                                testing::Values("")),
-                            CorrectnessTest::PrintToStringParamName());
-} // namespace
@@ -1,25 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-#ifndef TEST_SCATTER_HPP
-#define TEST_SCATTER_HPP
-
-#include "CorrectnessTest.hpp"
-
-namespace CorrectnessTests
-{
-    class ScatterCorrectnessTest : public CorrectnessTest
-    {
-    public:
-        static void ComputeExpectedResults(Dataset& dataset, int const root)
-        {
-            for (int i = 0; i < dataset.numDevices; i++)
-                HIP_CALL(hipMemcpy(dataset.expected[i], (int8_t *)dataset.inputs[root]+dataset.NumBytes()*i,
-                                   dataset.NumBytes(), hipMemcpyDeviceToHost));
-        }
-    };
-}
-
-#endif
@@ -1,61 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "test_ScatterMultiProcess.hpp"
-
-namespace CorrectnessTests
-{
-    TEST_P(ScatterMultiProcessCorrectnessTest, Correctness)
-    {
-        dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollScatter);
-        std::vector<int> pids(numDevices);
-
-        int gpu = -1;
-        for (int i = 0; i < numDevices; i++)
-        {
-            gpu++;
-            int pid = fork();
-            if (pid == 0)
-            {
-                bool pass;
-                TestScatter(gpu, *dataset, pass);
-                TerminateChildProcess(pass);
-            }
-            else
-            {
-                pids[gpu] = pid;
-            }
-        }
-
-        ValidateProcesses(pids);
-        dataset->ReleaseRootProcess();
-    }
-
-    INSTANTIATE_TEST_SUITE_P(ScatterMultiProcessCorrectnessSweep,
-                            ScatterMultiProcessCorrectnessTest,
-                            testing::Combine(
-                                // Reduction operator is not used
-                                testing::Values(ncclSum),
-                                // Data types
-                                testing::Values(ncclInt8,
-                                                ncclUint8,
-                                                ncclInt32,
-                                                ncclUint32,
-                                                ncclInt64,
-                                                ncclUint64,
-                                                //ncclFloat16,
-                                                ncclFloat32,
-                                                ncclFloat64,
-                                                ncclBfloat16),
-                                // Number of elements
-                                testing::Values(1024, 1048576),
-                                // Number of devices
-                                testing::Values(2,3,4,8),
-                                // In-place or not
-                                testing::Values(false),
-                                testing::Values("")),
-                            CorrectnessTest::PrintToStringParamName());
-} // namespace
@@ -1,68 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-#ifndef TEST_SCATTER_MULTI_PROCESS_HPP
-#define TEST_SCATTER_MULTI_PROCESS_HPP
-
-#include "CorrectnessTest.hpp"
-
-namespace CorrectnessTests
-{
-    class ScatterMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
-    {
-    public:
-        static void ComputeExpectedResults(Dataset& dataset, int const root, int const rank)
-        {
-            if (rank == root)
-            {
-                for (int i = 0; i < dataset.numDevices; i++)
-                    HIP_CALL(hipMemcpy(dataset.expected[i], (int8_t *)dataset.inputs[root]+dataset.NumBytes()*i,
-                                       dataset.NumBytes(), hipMemcpyDeviceToHost));
-            }
-        }
-
-        void TestScatter(int rank, Dataset& dataset, bool& pass)
-        {
-            // Prepare input / output / expected results
-            SetUpPerProcess(rank, ncclCollScatter, comms[rank], streams[rank], dataset);
-
-            if (numDevices > numDevicesAvailable)
-            {
-                pass = true;
-                return;
-            }
-
-            Barrier barrier(rank, numDevices, StripPortNumberFromCommId(std::string(getenv("NCCL_COMM_ID"))));
-
-            // Test each possible root
-            for (int root = 0; root < numDevices; root++)
-            {
-                // Prepare input / output / expected results
-                FillDatasetWithPattern(dataset, rank);
-
-                ComputeExpectedResults(dataset, root, rank);
-
-                // Launch the reduction (1 process per GPU)
-                ncclScatter(dataset.inputs[rank],
-                            dataset.outputs[rank],
-                            numElements, dataType,
-                            root, comms[rank], streams[rank]);
-
-                // Wait for reduction to complete
-                HIP_CALL(hipStreamSynchronize(streams[rank]));
-
-                // Check results
-                pass = ValidateResults(dataset, rank);
-
-                barrier.Wait();
-            }
-
-            TearDownPerProcess(comms[rank], streams[rank]);
-            dataset.Release(rank);
-        }
-    };
-}
-
-#endif