Added alltoallv test and optional args variable on collective args (#514)

* Added alltoallv test and optional args variable on collective args [ROCm/rccl commit: 65ea3d80db]
2022-03-18 13:55:11 -04:00
commit 3493750b6b
@@ -48,7 +48,7 @@ option(BUILD_TESTS "Build test programs" OFF)
 option(INSTALL_DEPENDENCIES "Force install dependencies" OFF)
 option(BUILD_ADDRESS_SANITIZER "Build with address sanitizer enabled" OFF)
 option(BUILD_ALLREDUCE_ONLY "Build AllReduce + sum + float kernel only" OFF)
-#Set the header wrapper ON by default. 
+#Set the header wrapper ON by default.
 option(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY "Build with file/folder reorg with backward compatibility enabled" ON)

 # parse version from Makefile NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH must exist
@@ -292,7 +292,7 @@ if(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY)
            GUARDS SYMLINK WRAPPER
 	    WRAPPER_LOCATIONS rccl )
  #install the wrapper header file to package
-  rocm_install( FILES "${PROJECT_BINARY_DIR}/rccl/include/rccl.h" 
+  rocm_install( FILES "${PROJECT_BINARY_DIR}/rccl/include/rccl.h"
 	        DESTINATION "./rccl/include/" )
 endif()

@@ -16,11 +16,11 @@ namespace RcclUnitTesting
    std::vector<ncclDataType_t> const& dataTypes       = {ncclFloat};
    std::vector<ncclRedOp_t>    const& redOps          = {ncclSum};
    std::vector<int>            const  numElements     = {1048576, 53327, 1024};
-    int                         const  root            = 0;
    bool                        const  inPlace         = false;
    bool                        const  useManagedMem   = false;
    int                         const  numCollPerGroup = numElements.size();

+    OptionalColArgs options;
    // This tests runs 3 collectives in the same group call
    bool isCorrect = true;
    for (int totalRanks = testBed.ev.minGpus; totalRanks <= testBed.ev.maxGpus && isCorrect; ++totalRanks)
@@ -33,30 +33,32 @@ namespace RcclUnitTesting
      testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup);

      for (int redOpIdx = 0; redOpIdx < redOps.size() && isCorrect; ++redOpIdx)
-      for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
      {
-        if (testBed.ev.showNames)
-          INFO("%s %d-ranks AllReduce %d Grouped Calls (%s-%s)\n",
-               isMultiProcess ? "MP" : "SP",
-               totalRanks, numCollPerGroup,
-               ncclRedOpNames[redOps[redOpIdx]], ncclDataTypeNames[dataTypes[dataIdx]]);
-
-        // Run all element sizes in parallel as single group
-        for (int collIdx = 0; collIdx < numCollPerGroup; ++collIdx)
+        options.redOp = redOps[redOpIdx];
+        for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
        {
-          testBed.SetCollectiveArgs(funcType,
-                                    dataTypes[dataIdx],
-                                    redOps[redOpIdx],
-                                    root,
-                                    numElements[collIdx],
-                                    numElements[collIdx],
-                                    collIdx);
+          if (testBed.ev.showNames)
+            INFO("%s %d-ranks AllReduce %d Grouped Calls (%s-%s)\n",
+                 isMultiProcess ? "MP" : "SP",
+                 totalRanks, numCollPerGroup,
+                 ncclRedOpNames[redOps[redOpIdx]], ncclDataTypeNames[dataTypes[dataIdx]]);
+
+          // Run all element sizes in parallel as single group
+          for (int collIdx = 0; collIdx < numCollPerGroup; ++collIdx)
+          {
+            testBed.SetCollectiveArgs(funcType,
+                                      dataTypes[dataIdx],
+                                      numElements[collIdx],
+                                      numElements[collIdx],
+                                      options,
+                                      collIdx);
+          }
+          testBed.AllocateMem(inPlace, useManagedMem);
+          testBed.PrepareData();
+          testBed.ExecuteCollectives();
+          testBed.ValidateResults(isCorrect);
+          testBed.DeallocateMem();
        }
-        testBed.AllocateMem(inPlace, useManagedMem);
-        testBed.PrepareData();
-        testBed.ExecuteCollectives();
-        testBed.ValidateResults(isCorrect);
-        testBed.DeallocateMem();
      }
      testBed.DestroyComms();
    }
@@ -17,10 +17,10 @@ namespace RcclUnitTesting
    std::vector<ncclDataType_t> const& dataTypes     = {ncclInt32, ncclFloat32, ncclFloat64};
    ncclRedOp_t                 const  redOp         = ncclSum;
    std::vector<int>            const  numElements   = {1048576, 1024};
-    int                         const  root          = 0;
    bool                        const  inPlace       = false;
    bool                        const  useManagedMem = false;

+    OptionalColArgs options;
    // Terminate the test as soon as first failure occurs
    bool isCorrect = true;
    for (int totalRanks = testBed.ev.minGpus; totalRanks <= testBed.ev.maxGpus && isCorrect; ++totalRanks)
@@ -43,6 +43,8 @@ namespace RcclUnitTesting
          double F = i;
          scalarsPerRank.Set(dataType, i, i, F);
        }
+        int const numBytes = totalRanks * DataTypeToBytes(dataType);
+        memcpy(options.scalarTransport.ptr, scalarsPerRank.ptr, numBytes);

        // Test various scalar residence modes
        for (int scalarMode = 0; scalarMode <= 1 && isCorrect; ++scalarMode)
@@ -54,9 +56,11 @@ namespace RcclUnitTesting

          for (int i = 0; i < numElements.size() && isCorrect; ++i)
          {
-            testBed.SetCollectiveArgs(funcType, dataType, redOp, root,
+            options.scalarMode = scalarMode;
+            options.redOp = redOp;
+            testBed.SetCollectiveArgs(funcType, dataType,
                                      numElements[i], numElements[i],
-                                      -1, -1, scalarsPerRank, scalarMode);
+                                      options);
            // For performance, only allocate and prepare data on largest size
            if (i == 0)
            {
@@ -0,0 +1,96 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "TestBed.hpp"
+
+namespace RcclUnitTesting
+{
+
+  void sendRecvPrep(size_t numInputElementsArray[],
+                    size_t numOutputElementsArray[],
+                    OptionalColArgs &options,
+                    int totalRanks, int numElementsBase)
+  {
+    for (int sendRank = 0; sendRank < totalRanks; ++sendRank)
+    for (int recvRank  = 0; recvRank   < totalRanks; ++recvRank )
+    {
+      //create send counts, and build other arrays from that
+      options.sendcounts[sendRank*totalRanks+recvRank] = numElementsBase * (recvRank + 1);
+      options.recvcounts[recvRank*totalRanks+sendRank] = options.sendcounts[sendRank*totalRanks+recvRank ];
+    }
+
+    for (int sendRank = 0; sendRank < totalRanks; ++sendRank)
+    {
+      options.sdispls[sendRank*totalRanks] = 0;
+      options.rdispls[sendRank*totalRanks] = 0;
+      for (int recvRank = 1; recvRank < totalRanks; ++recvRank )
+      {
+        options.sdispls[sendRank*totalRanks+recvRank] =
+                        options.sdispls[sendRank*totalRanks+recvRank-1] + options.sendcounts[sendRank*totalRanks+recvRank-1];
+        options.rdispls[sendRank*totalRanks+recvRank] =
+                        options.rdispls[sendRank*totalRanks+recvRank-1] + options.recvcounts[sendRank*totalRanks+recvRank-1];
+      }
+      numInputElementsArray[sendRank] = options.sdispls[(sendRank+1)*totalRanks-1] + options.sendcounts[(sendRank+1)*totalRanks-1];
+      numOutputElementsArray[sendRank] = options.rdispls[(sendRank+1)*totalRanks-1] + options.recvcounts[(sendRank+1)*totalRanks-1];
+    }
+
+  }
+
+  TEST(AllToAllv, OutOfPlace)
+  {
+    TestBed testBed;
+    // Configuration
+    std::vector<ncclDataType_t> const& dataTypes       = {ncclInt32, ncclFloat64};
+    std::vector<int>            const  numElementsBase = {1048576, 53327, 1024};
+    bool                        const  inPlace         = false;
+    bool                        const  useManagedMem   = false;
+
+    OptionalColArgs options;
+    size_t numInputElementsArray[MAX_RANKS], numOutputElementsArray[MAX_RANKS];
+    bool isCorrect = true;
+    for (int totalRanks = testBed.ev.minGpus; totalRanks <= testBed.ev.maxGpus && isCorrect; ++totalRanks)
+    for (int isMultiProcess = 0; isMultiProcess <= 1 && isCorrect; ++isMultiProcess)
+    {
+      if (!(testBed.ev.processMask & (1 << isMultiProcess))) continue;
+
+      int const numProcesses = isMultiProcess ? totalRanks : 1;
+      testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks));
+
+      for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
+      for (int numIdx = 0; numIdx < numElementsBase.size() && isCorrect; ++numIdx)
+      {
+        if (testBed.ev.showNames)
+        {
+          std::string name = testBed.GetTestCaseName(totalRanks, isMultiProcess,
+                                                     ncclCollAllToAllv, dataTypes[dataIdx],
+                                                     ncclSum, -1,
+                                                     inPlace, useManagedMem);
+          INFO("%s\n", name.c_str());
+
+        }
+        sendRecvPrep(numInputElementsArray, numOutputElementsArray, options, totalRanks, numElementsBase[numIdx]);
+        for (int rank = 0; rank < totalRanks; ++rank)
+        {
+          testBed.SetCollectiveArgs(ncclCollAllToAllv,
+                                  dataTypes[dataIdx],
+                                  numInputElementsArray[rank],
+                                  numOutputElementsArray[rank],
+                                  options,
+                                  -1,
+                                  rank);
+
+        }
+        testBed.AllocateMem(inPlace, useManagedMem);
+        testBed.PrepareData();
+        testBed.ExecuteCollectives();
+        testBed.ValidateResults(isCorrect);
+        testBed.DeallocateMem();
+
+      }
+      testBed.DestroyComms();
+    }
+    testBed.Finalize();
+  }
+}
@@ -65,6 +65,8 @@ if(BUILD_TESTS)
      #AllToAll
      AllToAll_OutOfPlace.cpp
      AllToAll_ManagedMem.cpp
+      #AllToAllv
+      AllToAllv_OutOfPlace.cpp
      #Broadcast
      Broadcast_InPlace.cpp
      Broadcast_ManagedMem.cpp
@@ -17,6 +17,7 @@ namespace RcclUnitTesting
    bool                        const  inPlace         = false;
    bool                        const  useManagedMem   = false;

+    OptionalColArgs options;
    bool isCorrect = true;
    int totalRanks = testBed.ev.maxGpus;
    for (int isMultiProcess = 0; isMultiProcess <= 1 && isCorrect; ++isMultiProcess)
@@ -32,12 +33,12 @@ namespace RcclUnitTesting
      {
        for (int recvRank = 0; recvRank  < totalRanks; ++recvRank)
        {
+          options.root = recvRank;
          testBed.SetCollectiveArgs(ncclCollSend,
                                    dataTypes[dataIdx],
-                                    ncclSum,
-                                    recvRank,
                                    numElements[numIdx],
                                    numElements[numIdx],
+                                    options,
                                    0,
                                    sendRank);
          if (recvRank == 0)
@@ -55,18 +56,17 @@ namespace RcclUnitTesting
                  recvRank,
                  numElements[numIdx]);

-
+            options.root = sendRank;
            testBed.SetCollectiveArgs(ncclCollRecv,
                                      dataTypes[dataIdx],
-                                      ncclSum,
-                                      sendRank,
                                      numElements[numIdx],
                                      numElements[numIdx],
+                                      options,
                                      0,
                                      recvRank);
            testBed.AllocateMem(inPlace, useManagedMem, 0, recvRank);
            testBed.PrepareData(0, recvRank);
-            testBed.ExecuteCollectives({sendRank,recvRank });
+            testBed.ExecuteCollectives({sendRank, recvRank});
            testBed.ValidateResults(isCorrect, 0, recvRank);
            testBed.DeallocateMem(0, recvRank);
          }
@@ -14,20 +14,17 @@ namespace RcclUnitTesting
                                  int             const  deviceId,
                                  ncclFunc_t      const  funcType,
                                  ncclDataType_t  const  dataType,
-                                  ncclRedOp_t     const  redOp,
-                                  int             const  root,
                                  size_t          const  numInputElements,
                                  size_t          const  numOutputElements,
-                                  ScalarTransport const  scalarTransport,
-                                  int             const  scalarMode)
+                                  OptionalColArgs const  &optionalColArgs)
  {
    // Free scalar based on previous scalarMode
-    if (scalarMode != -1)
+    if (optionalColArgs.scalarMode != -1)
    {
      if (this->localScalar.ptr != nullptr)
      {
-        if (this->scalarMode == 0) this->localScalar.FreeGpuMem();
-        if (this->scalarMode == 1) hipHostFree(this->localScalar.ptr);
+        if (this->options.scalarMode == 0) this->localScalar.FreeGpuMem();
+        if (this->options.scalarMode == 1) hipHostFree(this->localScalar.ptr);
      }
    }

@@ -36,26 +33,23 @@ namespace RcclUnitTesting
    this->deviceId          = deviceId;
    this->funcType          = funcType;
    this->dataType          = dataType;
-    this->redOp             = redOp;
-    this->root              = root;
    this->numInputElements  = numInputElements;
    this->numOutputElements = numOutputElements;
-    this->scalarTransport   = scalarTransport;
-    this->scalarMode        = scalarMode;
+    this->options           = optionalColArgs;

-    if (scalarMode != -1)
+    if (this->options.scalarMode != -1)
    {
      size_t const numBytes = DataTypeToBytes(dataType);
-      if (scalarMode == ncclScalarDevice)
+      if (this->options.scalarMode == ncclScalarDevice)
      {
        CHECK_CALL(this->localScalar.AllocateGpuMem(numBytes));
-        CHECK_HIP(hipMemcpy(this->localScalar.ptr, scalarTransport.ptr + (globalRank * numBytes),
+        CHECK_HIP(hipMemcpy(this->localScalar.ptr, optionalColArgs.scalarTransport.ptr + (globalRank * numBytes),
                            numBytes, hipMemcpyHostToDevice));
      }
-      else if (scalarMode == ncclScalarHostImmediate)
+      else if (this->options.scalarMode == ncclScalarHostImmediate)
      {
        CHECK_HIP(hipHostMalloc(&this->localScalar.ptr, numBytes, 0));
-        memcpy(this->localScalar.ptr, scalarTransport.ptr + (globalRank * numBytes), numBytes);
+        memcpy(this->localScalar.ptr, optionalColArgs.scalarTransport.ptr + (globalRank * numBytes), numBytes);
      }
    }
    return TEST_SUCCESS;
@@ -116,8 +110,8 @@ namespace RcclUnitTesting
  ErrCode CollectiveArgs::ValidateResults()
  {
    // Ignore non-root outputs for collectives with a root
-    if (CollectiveArgs::UsesRoot(this->funcType) && this->root != this->globalRank) return TEST_SUCCESS;
-
+    if (CollectiveArgs::UsesRoot(this->funcType) && this->options.root != this->globalRank) return TEST_SUCCESS;
+    if (this->funcType == ncclCollSend) return TEST_SUCCESS; // on the send receive pair only recv needs to be checked
    size_t const numOutputBytes = (this->numOutputElements * DataTypeToBytes(this->dataType));

    CHECK_HIP(hipMemcpy(this->outputCpu.ptr, this->outputGpu.ptr, numOutputBytes, hipMemcpyDeviceToHost));
@@ -153,8 +147,9 @@ namespace RcclUnitTesting

    if (this->localScalar.ptr != nullptr)
    {
-      if (this->scalarMode == 0) this->localScalar.FreeGpuMem();
-      if (this->scalarMode == 1) CHECK_HIP(hipHostFree(this->localScalar.ptr));
+      if (this->options.scalarMode == 0) this->localScalar.FreeGpuMem();
+      if (this->options.scalarMode == 1) CHECK_HIP(hipHostFree(this->localScalar.ptr));
+      this->localScalar.Attach(nullptr);
    }
    return TEST_SUCCESS;
  }
@@ -174,6 +169,7 @@ namespace RcclUnitTesting
    case ncclCollGather:        ss << "ncclGather";        break;
    case ncclCollScatter:       ss << "ncclScatter";       break;
    case ncclCollAllToAll:      ss << "ncclAllToAll";      break;
+    case ncclCollAllToAllv:     ss << "ncclAllToAllv";     break;
    case ncclCollSend:          ss << "ncclSend";          break;
    case ncclCollRecv:          ss << "ncclRecv";          break;
    default:                    ss << "[Unknown]";         break;
@@ -184,9 +180,9 @@ namespace RcclUnitTesting
        this->funcType == ncclCollReduceScatter ||
        this->funcType == ncclCollAllReduce)
    {
-      if (this->redOp < ncclNumOps)
+      if (this->options.redOp < ncclNumOps)
      {
-        ss << ncclRedOpNames[this->redOp] << " ";
+        ss << ncclRedOpNames[this->options.redOp] << " ";
      }
      else
      {
@@ -215,13 +211,13 @@ namespace RcclUnitTesting
        this->funcType == ncclCollGather ||
        this->funcType == ncclCollScatter)
    {
-      ss << "Root " << this->root << " ";
+      ss << "Root " << this->options.root << " ";
    }

    if (this->funcType == ncclCollSend ||
        this->funcType == ncclCollRecv)
    {
-      ss << "Peer " << this->root << " ";
+      ss << "Peer " << this->options.root << " ";
    }

    ss << "#In: " << this->numInputElements;
@@ -277,7 +273,6 @@ namespace RcclUnitTesting
    return (funcType == ncclCollBroadcast ||
            funcType == ncclCollReduce    ||
            funcType == ncclCollGather    ||
-            funcType == ncclCollScatter   ||
-            funcType == ncclCollSend); // this is incorrect but it works because in Send root is not root it is the peer
+            funcType == ncclCollScatter);
  }
 }
@@ -22,6 +22,7 @@ namespace RcclUnitTesting
    ncclCollGather,
    ncclCollScatter,
    ncclCollAllToAll,
+    ncclCollAllToAllv,
    ncclCollSend,
    ncclCollRecv,
    ncclNumFuncs
@@ -37,6 +38,7 @@ namespace RcclUnitTesting
    "Gather",
    "Scatter",
    "AllToAll",
+    "AllToAllv",
    "Send",
    "Recv"
  };
@@ -72,6 +74,19 @@ namespace RcclUnitTesting
    char ptr[MAX_RANKS * sizeof(double)];
  };

+  struct OptionalColArgs
+  {
+    ncclRedOp_t     redOp = ncclSum;
+    int             root = 0; // Used as "peer" for Send/Recv
+    ScalarTransport scalarTransport;   // Used for custom reduction operators
+    int             scalarMode = -1; // -1 if scalar not used
+    // allToAllv args
+    size_t          sendcounts[MAX_RANKS*MAX_RANKS];
+    size_t          sdispls[MAX_RANKS*MAX_RANKS];
+    size_t          recvcounts[MAX_RANKS*MAX_RANKS];
+    size_t          rdispls[MAX_RANKS*MAX_RANKS];
+  };
+
  // Function pointer for functions that operate on CollectiveArgs
  // e.g. For filling input / computing expected results
  typedef ErrCode (*CollFuncPtr)(CollectiveArgs &);
@@ -85,13 +100,10 @@ namespace RcclUnitTesting
    int             deviceId;
    ncclFunc_t      funcType;
    ncclDataType_t  dataType;
-    ncclRedOp_t     redOp;
-    int             root;              // Used as "peer" for Send/Recv
    size_t          numInputElements;
    size_t          numOutputElements;
-    ScalarTransport scalarTransport;   // Used for custom reduction operators
    PtrUnion        localScalar;
-    int             scalarMode;        // -1 if scalar not used
+    OptionalColArgs options;

    // Data
    PtrUnion       inputGpu;
@@ -111,12 +123,9 @@ namespace RcclUnitTesting
                    int             const deviceId,
                    ncclFunc_t      const funcType,
                    ncclDataType_t  const dataType,
-                    ncclRedOp_t     const redOp,
-                    int             const root,
                    size_t          const numInputElements,
                    size_t          const numOutputElements,
-                    ScalarTransport const scalarsPerRank,
-                    int             const scalarMode = -1);
+                    OptionalColArgs const &optionalArgs = {});

    // Allocates GPU memory for input/output and CPU memory for expected
    // When inPlace is true, input and output share the same memory
@@ -23,6 +23,7 @@ namespace RcclUnitTesting
    case ncclCollGather:        return DefaultPrepData_Gather(collArgs, false);
    case ncclCollScatter:       return DefaultPrepData_Scatter(collArgs);
    case ncclCollAllToAll:      return DefaultPrepData_AllToAll(collArgs);
+    case ncclCollAllToAllv:     return DefaultPrepData_AllToAllv(collArgs);
    case ncclCollSend:          return DefaultPrepData_Send(collArgs);
    case ncclCollRecv:          return DefaultPrepData_Recv(collArgs);
    default:
@@ -64,15 +65,15 @@ namespace RcclUnitTesting
    CHECK_CALL(collArgs.outputGpu.ClearGpuMem(numBytes));

    // Only root needs input pattern
-    if (collArgs.globalRank == collArgs.root)
+    if (collArgs.globalRank == collArgs.options.root)
      CHECK_CALL(collArgs.inputGpu.FillPattern(collArgs.dataType,
                                               collArgs.numInputElements,
-                                               collArgs.root, true));
+                                               collArgs.options.root, true));

    // Otherwise all other ranks expected output is the same as input of root
    return collArgs.expected.FillPattern(collArgs.dataType,
                                         collArgs.numInputElements,
-                                         collArgs.root,
+                                         collArgs.options.root,
                                         false);
  }

@@ -96,11 +97,11 @@ namespace RcclUnitTesting
    CHECK_CALL(result.ClearCpuMem(numBytes));

    // If average or custom reduction operator is used, perform a summation instead
-    ncclRedOp_t const tempOp = (collArgs.redOp >= ncclAvg ? ncclSum : collArgs.redOp);
+    ncclRedOp_t const tempOp = (collArgs.options.redOp >= ncclAvg ? ncclSum : collArgs.options.redOp);

    // Loop over each rank and generate their input into a temp buffer, then reduce
    PtrUnion scalarsPerRank;
-    scalarsPerRank.Attach(collArgs.scalarTransport.ptr);
+    scalarsPerRank.Attach(collArgs.options.scalarTransport.ptr);

    PtrUnion tempInputCpu;
    CHECK_CALL(tempInputCpu.Attach(collArgs.outputCpu));
@@ -117,14 +118,14 @@ namespace RcclUnitTesting

      // Scale the temporary input by local scalar for this rank
      // (Used by custom reduction ops)
-      if (collArgs.scalarMode >= 0)
+      if (collArgs.options.scalarMode >= 0)
      {
        CHECK_CALL(tempInputCpu.Scale(collArgs.dataType, collArgs.numInputElements,
                                      scalarsPerRank, rank));
      }

      // Any rank that requires output reduces the scaled-inputs
-      if (isAllReduce || collArgs.root == collArgs.globalRank)
+      if (isAllReduce || collArgs.options.root == collArgs.globalRank)
      {
        if (rank == 0)
        {
@@ -139,7 +140,7 @@ namespace RcclUnitTesting
    }

    // Perform averaging if necessary
-    if (collArgs.redOp == ncclAvg && (isAllReduce || collArgs.root == collArgs.globalRank))
+    if (collArgs.options.redOp == ncclAvg && (isAllReduce || collArgs.options.root == collArgs.globalRank))
    {
      CHECK_CALL(result.DivideByInt(collArgs.dataType, collArgs.numInputElements, collArgs.totalRanks));
    }
@@ -176,7 +177,7 @@ namespace RcclUnitTesting
      {
        CHECK_HIP(hipMemcpy(collArgs.inputGpu.ptr, tempInputCpu.ptr, numInputBytes, hipMemcpyHostToDevice));
      }
-      if (isAllGather || collArgs.root == collArgs.globalRank)
+      if (isAllGather || collArgs.options.root == collArgs.globalRank)
      {
        memcpy(result.I1 + (rank * numInputBytes), tempInputCpu.ptr, numInputBytes);
      }
@@ -207,11 +208,11 @@ namespace RcclUnitTesting
    CHECK_CALL(tempResultCpu.ClearCpuMem(numInputBytes));

    // If average or custom reduction operator is used, perform a summation instead
-    ncclRedOp_t const tempOp = (collArgs.redOp >= ncclAvg ? ncclSum : collArgs.redOp);
+    ncclRedOp_t const tempOp = (collArgs.options.redOp >= ncclAvg ? ncclSum : collArgs.options.redOp);

    // Loop over each rank and generate the input / scale / reduce
    PtrUnion scalarsPerRank;
-    scalarsPerRank.Attach(collArgs.scalarTransport.ptr);
+    scalarsPerRank.Attach(collArgs.options.scalarTransport.ptr);
    for (int rank = 0; rank < collArgs.totalRanks; ++rank)
    {
      CHECK_CALL(tempInputCpu.FillPattern(collArgs.dataType, collArgs.numInputElements, rank, false));
@@ -229,7 +230,7 @@ namespace RcclUnitTesting

      // Scale the temporary input by local scalar for this rank
      // (Used by custom reduction ops)
-      if (collArgs.scalarMode >= 0)
+      if (collArgs.options.scalarMode >= 0)
      {
        CHECK_CALL(tempInputCpu.Scale(collArgs.dataType, collArgs.numInputElements,
                                      scalarsPerRank, rank));
@@ -247,7 +248,7 @@ namespace RcclUnitTesting
    }

    // Perform averaging if necessary
-    if (collArgs.redOp == ncclAvg)
+    if (collArgs.options.redOp == ncclAvg)
    {
      CHECK_CALL(tempResultCpu.DivideByInt(collArgs.dataType, collArgs.numInputElements, collArgs.totalRanks));
    }
@@ -279,10 +280,10 @@ namespace RcclUnitTesting
    // Generate input as if on root rank - each rank will receive a portion
    PtrUnion tempInput;
    tempInput.AllocateCpuMem(numInputBytes);
-    tempInput.FillPattern(collArgs.dataType, collArgs.numInputElements, collArgs.root, false);
+    tempInput.FillPattern(collArgs.dataType, collArgs.numInputElements, collArgs.options.root, false);

    // Copy input to root rank
-    if (collArgs.globalRank == collArgs.root)
+    if (collArgs.globalRank == collArgs.options.root)
    {
      if (hipMemcpy(collArgs.inputGpu.ptr, tempInput.ptr, numInputBytes, hipMemcpyHostToDevice) != hipSuccess)
      {
@@ -341,6 +342,45 @@ namespace RcclUnitTesting
    return TEST_SUCCESS;
  }

+  ErrCode DefaultPrepData_AllToAllv(CollectiveArgs &collArgs)
+  {
+
+    CHECK_CALL(CheckAllocation(collArgs));
+    size_t const numInputBytes = collArgs.numInputElements * DataTypeToBytes(collArgs.dataType);
+    size_t const numOutputBytes = collArgs.numOutputElements * DataTypeToBytes(collArgs.dataType);
+
+    // calculating maxNumElements  as the maximum number of input bytes out of all the ranks
+    size_t maxNumElements  = 0;
+    for (int sendRank = 0; sendRank < collArgs.totalRanks; ++sendRank)
+    for (int recvRank = 0; recvRank < collArgs.totalRanks; ++recvRank)
+    {
+      size_t rankSendCount = collArgs.options.sdispls[(sendRank)*collArgs.totalRanks+recvRank] + collArgs.options.sendcounts[(sendRank)*collArgs.totalRanks+recvRank];
+      maxNumElements = std::max(maxNumElements, rankSendCount);
+    }
+
+    // Clear outputs on all ranks (prior to input in case of in-place)
+    collArgs.outputGpu.ClearGpuMem(numOutputBytes);
+
+    // Generate input on root rank - each rank will receive a portion
+    PtrUnion tempInput;
+    tempInput.AllocateCpuMem(maxNumElements*DataTypeToBytes(collArgs.dataType));
+
+    for (int sendRank = 0; sendRank < collArgs.totalRanks; ++sendRank)
+    {
+      tempInput.FillPattern(collArgs.dataType, maxNumElements, sendRank, false);
+      size_t recvDspls = collArgs.options.rdispls[collArgs.globalRank*collArgs.totalRanks + sendRank] * DataTypeToBytes(collArgs.dataType);
+      size_t sendDspls = collArgs.options.sdispls[sendRank*collArgs.totalRanks + collArgs.globalRank] * DataTypeToBytes(collArgs.dataType);
+      size_t numBytes = collArgs.options.recvcounts[collArgs.globalRank*collArgs.totalRanks + sendRank] * DataTypeToBytes(collArgs.dataType);
+      memcpy(collArgs.expected.U1 + recvDspls, tempInput.U1 + sendDspls, numBytes);
+    }
+    tempInput.FillPattern(collArgs.dataType, collArgs.numInputElements, collArgs.globalRank, false);
+
+    CHECK_HIP(hipMemcpy(collArgs.inputGpu.ptr, tempInput.ptr, numInputBytes, hipMemcpyHostToDevice));
+
+    tempInput.FreeCpuMem();
+    return TEST_SUCCESS;
+  }
+
  ErrCode DefaultPrepData_Send(CollectiveArgs &collArgs)
  {
    CHECK_CALL(CheckAllocation(collArgs));
@@ -354,7 +394,7 @@ namespace RcclUnitTesting
    CHECK_CALL(CheckAllocation(collArgs));
    return collArgs.expected.FillPattern(collArgs.dataType,
                                         collArgs.numOutputElements,
-                                         collArgs.root,
+                                         collArgs.options.root,
                                         false);
  }
 }
@@ -22,6 +22,7 @@ namespace RcclUnitTesting
  ErrCode DefaultPrepData_ReduceScatter(CollectiveArgs &collArgs);
  ErrCode DefaultPrepData_Scatter(CollectiveArgs &collArgs);
  ErrCode DefaultPrepData_AllToAll(CollectiveArgs &collArgs);
+  ErrCode DefaultPrepData_AllToAllv(CollectiveArgs &collArgs);
  ErrCode DefaultPrepData_Send(CollectiveArgs &collArgs);
  ErrCode DefaultPrepData_Recv(CollectiveArgs &collArgs);
 }
@@ -155,32 +155,19 @@ namespace RcclUnitTesting
    InitComms(TestBed::GetDeviceIdsList(1, numGpus), numCollectivesInGroup);
  }

-  void TestBed::SetCollectiveArgs(ncclFunc_t     const funcType,
-                                  ncclDataType_t const dataType,
-                                  ncclRedOp_t    const redOp,
-                                  int            const root,
-                                  size_t         const numInputElements,
-                                  size_t         const numOutputElements,
-                                  int            const collId,
-                                  int            const rank,
-                                  PtrUnion       const scalarsPerRank,
-                                  int            const scalarMode)
+  void TestBed::SetCollectiveArgs(ncclFunc_t      const funcType,
+                                  ncclDataType_t  const dataType,
+                                  size_t          const numInputElements,
+                                  size_t          const numOutputElements,
+                                  OptionalColArgs const &optionalArgs,
+                                  int             const collId,
+                                  int             const rank)
  {
    // Build list of ranks this applies to (-1 for rank means to set for all)
    std::vector<int> rankList;
    for (int i = 0; i < this->numActiveRanks; ++i)
      if (rank == -1 || rank == i) rankList.push_back(i);

-    ScalarTransport scalarTransport;
-    if (scalarMode >= 0)
-    {
-      ASSERT_TRUE(scalarsPerRank.ptr != NULL);
-
-      // Capture scalars per rank in format to share with child processes
-      int const numBytes = this->numActiveRanks * DataTypeToBytes(dataType);
-      memcpy(scalarTransport.ptr, scalarsPerRank.ptr, numBytes);
-    }
-
    // Loop over all ranks and send CollectiveArgs to appropriate child process
    int const cmd = TestBedChild::CHILD_SET_COLL_ARGS;
    for (auto currRank : rankList)
@@ -191,12 +178,9 @@ namespace RcclUnitTesting
      PIPE_WRITE(childId, collId);
      PIPE_WRITE(childId, funcType);
      PIPE_WRITE(childId, dataType);
-      PIPE_WRITE(childId, redOp);
-      PIPE_WRITE(childId, root);
      PIPE_WRITE(childId, numInputElements);
      PIPE_WRITE(childId, numOutputElements);
-      PIPE_WRITE(childId, scalarMode);
-      PIPE_WRITE(childId, scalarTransport);
+      PIPE_WRITE(childId, optionalArgs);
      PIPE_CHECK(childId);
    }
  }
@@ -412,7 +396,7 @@ namespace RcclUnitTesting
    // Sort numElements in descending order to cut down on # of allocations
    std::vector<int> sortedN = numElements;
    std::sort(sortedN.rbegin(), sortedN.rend());
-
+    OptionalColArgs optionalArgs;
    // Filter out any unsupported datatypes, in case only subset has been compiled for
    std::vector<ncclDataType_t> const& supportedDataTypes = this->GetAllSupportedDataTypes();
    std::vector<ncclDataType_t> dataTypes;
@@ -479,13 +463,13 @@ namespace RcclUnitTesting
                                                    totalRanks,
                                                    &numInputElements,
                                                    &numOutputElements);
-
+          optionalArgs.redOp = redOps[rdIdx];
+          optionalArgs.root = roots[rtIdx];
          this->SetCollectiveArgs(funcTypes[ftIdx],
                                  dataTypes[dtIdx],
-                                  redOps[rdIdx],
-                                  roots[rtIdx],
                                  numInputElements,
-                                  numOutputElements);
+                                  numOutputElements,
+                                  optionalArgs);

          // Only allocate once for largest size
          if (neIdx == 0) this->AllocateMem(inPlaceList[ipIdx], managedMemList[mmIdx]);
@@ -41,16 +41,13 @@ namespace RcclUnitTesting
    // Using collId = -1 (default) applies settings to all collectives in group
    // Using rank = -1 (default) applies settings to all ranks

-    void SetCollectiveArgs(ncclFunc_t     const funcType,
-                           ncclDataType_t const dataType,
-                           ncclRedOp_t    const redOp,
-                           int            const root,
-                           size_t         const numInputElements,
-                           size_t         const numOutputElements,
-                           int            const collId = -1,
-                           int            const rank = -1,
-                           PtrUnion       const scalarsPerRank = {nullptr},
-                           int            const scalarMode = -1);
+    void SetCollectiveArgs(ncclFunc_t      const funcType,
+                           ncclDataType_t  const dataType,
+                           size_t          const numInputElements,
+                           size_t          const numOutputElements,
+                           OptionalColArgs const &optionalArgs = {},
+                           int             const collId = -1,
+                           int             const rank = -1);

    // Allocate memory for specified collective / rank
    // - Requires SetCollectiveArgs to have been called already
@@ -188,29 +188,17 @@ namespace RcclUnitTesting
    int             collId;
    ncclFunc_t      funcType;
    ncclDataType_t  dataType;
-    ncclRedOp_t     redOp;
-    int             root;
    size_t          numInputElements;
    size_t          numOutputElements;
-    ScalarTransport scalarTransport;
-    int             scalarMode;
+    OptionalColArgs options;

    PIPE_READ(globalRank);
    PIPE_READ(collId);
    PIPE_READ(funcType);
    PIPE_READ(dataType);
-    PIPE_READ(redOp);
-    PIPE_READ(root);
    PIPE_READ(numInputElements);
    PIPE_READ(numOutputElements);
-    PIPE_READ(scalarMode);
-    PIPE_READ(scalarTransport);
-
-    for (int i = 0; i < this->totalRanks; i++)
-    {
-      PtrUnion scalarsPerRank;
-      scalarsPerRank.Attach(scalarTransport.ptr);
-    }
+    PIPE_READ(options);

    if (globalRank < this->rankOffset || (this->rankOffset + comms.size() <= globalRank))
    {
@@ -227,24 +215,24 @@ namespace RcclUnitTesting
        CollectiveArgs& collArg = this->collArgs[localRank][collIdx];
        CHECK_CALL(collArg.SetArgs(globalRank, this->totalRanks,
                                   this->deviceIds[localRank],
-                                   funcType, dataType, redOp, root,
+                                   funcType, dataType,
                                   numInputElements, numOutputElements,
-                                   scalarTransport, scalarMode));
+                                   options));
        if (this->verbose) INFO("Rank %d on child %d sets collective %d [%s]\n",
                                globalRank, this->childId, collIdx,
                                collArg.GetDescription().c_str());

        // If pre-mult scalars are provided, then create a custom reduction operator
-        if (scalarMode >= 0)
+        if (options.scalarMode >= 0)
        {
-          CHILD_NCCL_CALL(ncclRedOpCreatePreMulSum(&collArg.redOp,
+          CHILD_NCCL_CALL(ncclRedOpCreatePreMulSum(&collArg.options.redOp,
                                                   collArg.localScalar.ptr,
                                                   dataType,
-                                                   (ncclScalarResidence_t)scalarMode,
+                                                   (ncclScalarResidence_t)options.scalarMode,
                                                   this->comms[localRank]),
                          "ncclRedOpCreatePreMulSum");
          if (verbose) INFO("Child %d created custom redop %d for collective %d\n",
-                            this->childId, collArg.redOp, collIdx);
+                            this->childId, collArg.options.redOp, collIdx);
        }
      }
    }
@@ -383,7 +371,7 @@ namespace RcclUnitTesting
                                        collArg.outputGpu.ptr,
                                        collArg.numInputElements,
                                        collArg.dataType,
-                                        collArg.root,
+                                        collArg.options.root,
                                        this->comms[localRank],
                                        this->streams[localRank]),
                          "ncclBroadcast");
@@ -393,8 +381,8 @@ namespace RcclUnitTesting
                                     collArg.outputGpu.ptr,
                                     collArg.numInputElements,
                                     collArg.dataType,
-                                     collArg.redOp,
-                                     collArg.root,
+                                     collArg.options.redOp,
+                                     collArg.options.root,
                                     this->comms[localRank],
                                     this->streams[localRank]),
                          "ncclReduce");
@@ -413,7 +401,7 @@ namespace RcclUnitTesting
                                            collArg.outputGpu.ptr,
                                            collArg.numOutputElements,
                                            collArg.dataType,
-                                            collArg.redOp,
+                                            collArg.options.redOp,
                                            this->comms[localRank],
                                            this->streams[localRank]),
                          "ncclReduceScatter");
@@ -423,7 +411,7 @@ namespace RcclUnitTesting
                                        collArg.outputGpu.ptr,
                                        collArg.numInputElements,
                                        collArg.dataType,
-                                        collArg.redOp,
+                                        collArg.options.redOp,
                                        this->comms[localRank],
                                        this->streams[localRank]),
                          "ncclAllReduce");
@@ -433,7 +421,7 @@ namespace RcclUnitTesting
                                     collArg.outputGpu.ptr,
                                     collArg.numInputElements,
                                     collArg.dataType,
-                                     collArg.root,
+                                     collArg.options.root,
                                     this->comms[localRank],
                                     this->streams[localRank]),
                          "ncclGather");
@@ -443,7 +431,7 @@ namespace RcclUnitTesting
                                      collArg.outputGpu.ptr,
                                      collArg.numOutputElements,
                                      collArg.dataType,
-                                      collArg.root,
+                                      collArg.options.root,
                                      this->comms[localRank],
                                      this->streams[localRank]),
                          "ncclScatter");
@@ -457,11 +445,23 @@ namespace RcclUnitTesting
                                       this->streams[localRank]),
                          "ncclAllToAll");
          break;
+        case ncclCollAllToAllv:
+          CHILD_NCCL_CALL(ncclAllToAllv(collArg.inputGpu.ptr,
+                                        collArg.options.sendcounts + (this->rankOffset + localRank)*this->totalRanks,
+                                        collArg.options.sdispls + (this->rankOffset + localRank)*this->totalRanks,
+                                        collArg.outputGpu.ptr,
+                                        collArg.options.recvcounts + (this->rankOffset + localRank)*this->totalRanks,
+                                        collArg.options.rdispls + (this->rankOffset + localRank)*this->totalRanks,
+                                        collArg.dataType,
+                                        this->comms[localRank],
+                                        this->streams[localRank]),
+                          "ncclAllToAllv");
+          break;
        case ncclCollSend:
          CHILD_NCCL_CALL(ncclSend(collArg.inputGpu.ptr,
                                   collArg.numInputElements,
                                   collArg.dataType,
-                                   collArg.root,
+                                   collArg.options.root,
                                   this->comms[localRank],
                                   this->streams[localRank]),
                          "ncclSend");
@@ -470,7 +470,7 @@ namespace RcclUnitTesting
          CHILD_NCCL_CALL(ncclRecv(collArg.outputGpu.ptr,
                                   collArg.numOutputElements,
                                   collArg.dataType,
-                                   collArg.root,
+                                   collArg.options.root,
                                   this->comms[localRank],
                                   this->streams[localRank]),
                          "ncclRecv");
@@ -579,12 +579,12 @@ namespace RcclUnitTesting

        CHECK_CALL(collArg.DeallocateMem());
      }
-      if (collArg.scalarMode != -1)
+      if (collArg.options.scalarMode != -1)
      {
-        CHILD_NCCL_CALL(ncclRedOpDestroy(collArg.redOp, this->comms[localRank]),
+        CHILD_NCCL_CALL(ncclRedOpDestroy(collArg.options.redOp, this->comms[localRank]),
                        "ncclRedOpDestroy");
        if (verbose) INFO("Child %d destroys custom redop %d for collective %d\n",
-                          this->childId, collArg.redOp, collIdx);
+                          this->childId, collArg.options.redOp, collIdx);
      }
    }
    if (this->verbose) INFO("Child %d finishes DeallocateMem\n", this->childId);