diff --git a/CMakeLists.txt b/CMakeLists.txt
index 842caef63c..c64c96c2d2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -48,7 +48,7 @@ option(BUILD_TESTS "Build test programs" OFF)
 option(INSTALL_DEPENDENCIES "Force install dependencies" OFF)
 option(BUILD_ADDRESS_SANITIZER "Build with address sanitizer enabled" OFF)
 option(BUILD_ALLREDUCE_ONLY "Build AllReduce + sum + float kernel only" OFF)
-#Set the header wrapper ON by default. 
+#Set the header wrapper ON by default.
 option(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY "Build with file/folder reorg with backward compatibility enabled" ON)
 
 # parse version from Makefile NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH must exist
@@ -292,7 +292,7 @@ if(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY)
             GUARDS SYMLINK WRAPPER
 	    WRAPPER_LOCATIONS rccl )
   #install the wrapper header file to package
-  rocm_install( FILES "${PROJECT_BINARY_DIR}/rccl/include/rccl.h" 
+  rocm_install( FILES "${PROJECT_BINARY_DIR}/rccl/include/rccl.h"
 	        DESTINATION "./rccl/include/" )
 endif()
 
diff --git a/test/AllReduce_GroupCall.cpp b/test/AllReduce_GroupCall.cpp
index 404935eb71..5d10a56c83 100644
--- a/test/AllReduce_GroupCall.cpp
+++ b/test/AllReduce_GroupCall.cpp
@@ -16,11 +16,11 @@ namespace RcclUnitTesting
     std::vector<ncclDataType_t> const& dataTypes       = {ncclFloat};
     std::vector<ncclRedOp_t>    const& redOps          = {ncclSum};
     std::vector<int>            const  numElements     = {1048576, 53327, 1024};
-    int                         const  root            = 0;
     bool                        const  inPlace         = false;
     bool                        const  useManagedMem   = false;
     int                         const  numCollPerGroup = numElements.size();
 
+    OptionalColArgs options;
     // This tests runs 3 collectives in the same group call
     bool isCorrect = true;
     for (int totalRanks = testBed.ev.minGpus; totalRanks <= testBed.ev.maxGpus && isCorrect; ++totalRanks)
@@ -33,30 +33,32 @@ namespace RcclUnitTesting
       testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup);
 
       for (int redOpIdx = 0; redOpIdx < redOps.size() && isCorrect; ++redOpIdx)
-      for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
       {
-        if (testBed.ev.showNames)
-          INFO("%s %d-ranks AllReduce %d Grouped Calls (%s-%s)\n",
-               isMultiProcess ? "MP" : "SP",
-               totalRanks, numCollPerGroup,
-               ncclRedOpNames[redOps[redOpIdx]], ncclDataTypeNames[dataTypes[dataIdx]]);
-
-        // Run all element sizes in parallel as single group
-        for (int collIdx = 0; collIdx < numCollPerGroup; ++collIdx)
+        options.redOp = redOps[redOpIdx];
+        for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
         {
-          testBed.SetCollectiveArgs(funcType,
-                                    dataTypes[dataIdx],
-                                    redOps[redOpIdx],
-                                    root,
-                                    numElements[collIdx],
-                                    numElements[collIdx],
-                                    collIdx);
+          if (testBed.ev.showNames)
+            INFO("%s %d-ranks AllReduce %d Grouped Calls (%s-%s)\n",
+                 isMultiProcess ? "MP" : "SP",
+                 totalRanks, numCollPerGroup,
+                 ncclRedOpNames[redOps[redOpIdx]], ncclDataTypeNames[dataTypes[dataIdx]]);
+
+          // Run all element sizes in parallel as single group
+          for (int collIdx = 0; collIdx < numCollPerGroup; ++collIdx)
+          {
+            testBed.SetCollectiveArgs(funcType,
+                                      dataTypes[dataIdx],
+                                      numElements[collIdx],
+                                      numElements[collIdx],
+                                      options,
+                                      collIdx);
+          }
+          testBed.AllocateMem(inPlace, useManagedMem);
+          testBed.PrepareData();
+          testBed.ExecuteCollectives();
+          testBed.ValidateResults(isCorrect);
+          testBed.DeallocateMem();
         }
-        testBed.AllocateMem(inPlace, useManagedMem);
-        testBed.PrepareData();
-        testBed.ExecuteCollectives();
-        testBed.ValidateResults(isCorrect);
-        testBed.DeallocateMem();
       }
       testBed.DestroyComms();
     }
diff --git a/test/AllReduce_PreMultScalar.cpp b/test/AllReduce_PreMultScalar.cpp
index 37d8a064b7..9be19000ad 100644
--- a/test/AllReduce_PreMultScalar.cpp
+++ b/test/AllReduce_PreMultScalar.cpp
@@ -17,10 +17,10 @@ namespace RcclUnitTesting
     std::vector<ncclDataType_t> const& dataTypes     = {ncclInt32, ncclFloat32, ncclFloat64};
     ncclRedOp_t                 const  redOp         = ncclSum;
     std::vector<int>            const  numElements   = {1048576, 1024};
-    int                         const  root          = 0;
     bool                        const  inPlace       = false;
     bool                        const  useManagedMem = false;
 
+    OptionalColArgs options;
     // Terminate the test as soon as first failure occurs
     bool isCorrect = true;
     for (int totalRanks = testBed.ev.minGpus; totalRanks <= testBed.ev.maxGpus && isCorrect; ++totalRanks)
@@ -43,6 +43,8 @@ namespace RcclUnitTesting
           double F = i;
           scalarsPerRank.Set(dataType, i, i, F);
         }
+        int const numBytes = totalRanks * DataTypeToBytes(dataType);
+        memcpy(options.scalarTransport.ptr, scalarsPerRank.ptr, numBytes);
 
         // Test various scalar residence modes
         for (int scalarMode = 0; scalarMode <= 1 && isCorrect; ++scalarMode)
@@ -54,9 +56,11 @@ namespace RcclUnitTesting
 
           for (int i = 0; i < numElements.size() && isCorrect; ++i)
           {
-            testBed.SetCollectiveArgs(funcType, dataType, redOp, root,
+            options.scalarMode = scalarMode;
+            options.redOp = redOp;
+            testBed.SetCollectiveArgs(funcType, dataType,
                                       numElements[i], numElements[i],
-                                      -1, -1, scalarsPerRank, scalarMode);
+                                      options);
             // For performance, only allocate and prepare data on largest size
             if (i == 0)
             {
diff --git a/test/AllToAllv_OutOfPlace.cpp b/test/AllToAllv_OutOfPlace.cpp
new file mode 100644
index 0000000000..083f3353cd
--- /dev/null
+++ b/test/AllToAllv_OutOfPlace.cpp
@@ -0,0 +1,96 @@
+/*************************************************************************
+ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#include "TestBed.hpp"
+
+namespace RcclUnitTesting
+{
+
+  void sendRecvPrep(size_t numInputElementsArray[],
+                    size_t numOutputElementsArray[],
+                    OptionalColArgs &options,
+                    int totalRanks, int numElementsBase)
+  {
+    for (int sendRank = 0; sendRank < totalRanks; ++sendRank)
+    for (int recvRank  = 0; recvRank   < totalRanks; ++recvRank )
+    {
+      //create send counts, and build other arrays from that
+      options.sendcounts[sendRank*totalRanks+recvRank] = numElementsBase * (recvRank + 1);
+      options.recvcounts[recvRank*totalRanks+sendRank] = options.sendcounts[sendRank*totalRanks+recvRank ];
+    }
+
+    for (int sendRank = 0; sendRank < totalRanks; ++sendRank)
+    {
+      options.sdispls[sendRank*totalRanks] = 0;
+      options.rdispls[sendRank*totalRanks] = 0;
+      for (int recvRank = 1; recvRank < totalRanks; ++recvRank )
+      {
+        options.sdispls[sendRank*totalRanks+recvRank] =
+                        options.sdispls[sendRank*totalRanks+recvRank-1] + options.sendcounts[sendRank*totalRanks+recvRank-1];
+        options.rdispls[sendRank*totalRanks+recvRank] =
+                        options.rdispls[sendRank*totalRanks+recvRank-1] + options.recvcounts[sendRank*totalRanks+recvRank-1];
+      }
+      numInputElementsArray[sendRank] = options.sdispls[(sendRank+1)*totalRanks-1] + options.sendcounts[(sendRank+1)*totalRanks-1];
+      numOutputElementsArray[sendRank] = options.rdispls[(sendRank+1)*totalRanks-1] + options.recvcounts[(sendRank+1)*totalRanks-1];
+    }
+
+  }
+
+  TEST(AllToAllv, OutOfPlace)
+  {
+    TestBed testBed;
+    // Configuration
+    std::vector<ncclDataType_t> const& dataTypes       = {ncclInt32, ncclFloat64};
+    std::vector<int>            const  numElementsBase = {1048576, 53327, 1024};
+    bool                        const  inPlace         = false;
+    bool                        const  useManagedMem   = false;
+
+    OptionalColArgs options;
+    size_t numInputElementsArray[MAX_RANKS], numOutputElementsArray[MAX_RANKS];
+    bool isCorrect = true;
+    for (int totalRanks = testBed.ev.minGpus; totalRanks <= testBed.ev.maxGpus && isCorrect; ++totalRanks)
+    for (int isMultiProcess = 0; isMultiProcess <= 1 && isCorrect; ++isMultiProcess)
+    {
+      if (!(testBed.ev.processMask & (1 << isMultiProcess))) continue;
+
+      int const numProcesses = isMultiProcess ? totalRanks : 1;
+      testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks));
+
+      for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
+      for (int numIdx = 0; numIdx < numElementsBase.size() && isCorrect; ++numIdx)
+      {
+        if (testBed.ev.showNames)
+        {
+          std::string name = testBed.GetTestCaseName(totalRanks, isMultiProcess,
+                                                     ncclCollAllToAllv, dataTypes[dataIdx],
+                                                     ncclSum, -1,
+                                                     inPlace, useManagedMem);
+          INFO("%s\n", name.c_str());
+
+        }
+        sendRecvPrep(numInputElementsArray, numOutputElementsArray, options, totalRanks, numElementsBase[numIdx]);
+        for (int rank = 0; rank < totalRanks; ++rank)
+        {
+          testBed.SetCollectiveArgs(ncclCollAllToAllv,
+                                  dataTypes[dataIdx],
+                                  numInputElementsArray[rank],
+                                  numOutputElementsArray[rank],
+                                  options,
+                                  -1,
+                                  rank);
+
+        }
+        testBed.AllocateMem(inPlace, useManagedMem);
+        testBed.PrepareData();
+        testBed.ExecuteCollectives();
+        testBed.ValidateResults(isCorrect);
+        testBed.DeallocateMem();
+
+      }
+      testBed.DestroyComms();
+    }
+    testBed.Finalize();
+  }
+}
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 75c892a854..a40b610c03 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -65,6 +65,8 @@ if(BUILD_TESTS)
       #AllToAll
       AllToAll_OutOfPlace.cpp
       AllToAll_ManagedMem.cpp
+      #AllToAllv
+      AllToAllv_OutOfPlace.cpp
       #Broadcast
       Broadcast_InPlace.cpp
       Broadcast_ManagedMem.cpp
diff --git a/test/SendRecv_SinglePairs.cpp b/test/SendRecv_SinglePairs.cpp
index e23a64f754..1911fcf6dd 100644
--- a/test/SendRecv_SinglePairs.cpp
+++ b/test/SendRecv_SinglePairs.cpp
@@ -17,6 +17,7 @@ namespace RcclUnitTesting
     bool                        const  inPlace         = false;
     bool                        const  useManagedMem   = false;
 
+    OptionalColArgs options;
     bool isCorrect = true;
     int totalRanks = testBed.ev.maxGpus;
     for (int isMultiProcess = 0; isMultiProcess <= 1 && isCorrect; ++isMultiProcess)
@@ -32,12 +33,12 @@ namespace RcclUnitTesting
       {
         for (int recvRank = 0; recvRank  < totalRanks; ++recvRank)
         {
+          options.root = recvRank;
           testBed.SetCollectiveArgs(ncclCollSend,
                                     dataTypes[dataIdx],
-                                    ncclSum,
-                                    recvRank,
                                     numElements[numIdx],
                                     numElements[numIdx],
+                                    options,
                                     0,
                                     sendRank);
           if (recvRank == 0)
@@ -55,18 +56,17 @@ namespace RcclUnitTesting
                   recvRank,
                   numElements[numIdx]);
 
-
+            options.root = sendRank;
             testBed.SetCollectiveArgs(ncclCollRecv,
                                       dataTypes[dataIdx],
-                                      ncclSum,
-                                      sendRank,
                                       numElements[numIdx],
                                       numElements[numIdx],
+                                      options,
                                       0,
                                       recvRank);
             testBed.AllocateMem(inPlace, useManagedMem, 0, recvRank);
             testBed.PrepareData(0, recvRank);
-            testBed.ExecuteCollectives({sendRank,recvRank });
+            testBed.ExecuteCollectives({sendRank, recvRank});
             testBed.ValidateResults(isCorrect, 0, recvRank);
             testBed.DeallocateMem(0, recvRank);
           }
diff --git a/test/common/CollectiveArgs.cpp b/test/common/CollectiveArgs.cpp
index 976ce1bca8..08beed737d 100644
--- a/test/common/CollectiveArgs.cpp
+++ b/test/common/CollectiveArgs.cpp
@@ -14,20 +14,17 @@ namespace RcclUnitTesting
                                   int             const  deviceId,
                                   ncclFunc_t      const  funcType,
                                   ncclDataType_t  const  dataType,
-                                  ncclRedOp_t     const  redOp,
-                                  int             const  root,
                                   size_t          const  numInputElements,
                                   size_t          const  numOutputElements,
-                                  ScalarTransport const  scalarTransport,
-                                  int             const  scalarMode)
+                                  OptionalColArgs const  &optionalColArgs)
   {
     // Free scalar based on previous scalarMode
-    if (scalarMode != -1)
+    if (optionalColArgs.scalarMode != -1)
     {
       if (this->localScalar.ptr != nullptr)
       {
-        if (this->scalarMode == 0) this->localScalar.FreeGpuMem();
-        if (this->scalarMode == 1) hipHostFree(this->localScalar.ptr);
+        if (this->options.scalarMode == 0) this->localScalar.FreeGpuMem();
+        if (this->options.scalarMode == 1) hipHostFree(this->localScalar.ptr);
       }
     }
 
@@ -36,26 +33,23 @@ namespace RcclUnitTesting
     this->deviceId          = deviceId;
     this->funcType          = funcType;
     this->dataType          = dataType;
-    this->redOp             = redOp;
-    this->root              = root;
     this->numInputElements  = numInputElements;
     this->numOutputElements = numOutputElements;
-    this->scalarTransport   = scalarTransport;
-    this->scalarMode        = scalarMode;
+    this->options           = optionalColArgs;
 
-    if (scalarMode != -1)
+    if (this->options.scalarMode != -1)
     {
       size_t const numBytes = DataTypeToBytes(dataType);
-      if (scalarMode == ncclScalarDevice)
+      if (this->options.scalarMode == ncclScalarDevice)
       {
         CHECK_CALL(this->localScalar.AllocateGpuMem(numBytes));
-        CHECK_HIP(hipMemcpy(this->localScalar.ptr, scalarTransport.ptr + (globalRank * numBytes),
+        CHECK_HIP(hipMemcpy(this->localScalar.ptr, optionalColArgs.scalarTransport.ptr + (globalRank * numBytes),
                             numBytes, hipMemcpyHostToDevice));
       }
-      else if (scalarMode == ncclScalarHostImmediate)
+      else if (this->options.scalarMode == ncclScalarHostImmediate)
       {
         CHECK_HIP(hipHostMalloc(&this->localScalar.ptr, numBytes, 0));
-        memcpy(this->localScalar.ptr, scalarTransport.ptr + (globalRank * numBytes), numBytes);
+        memcpy(this->localScalar.ptr, optionalColArgs.scalarTransport.ptr + (globalRank * numBytes), numBytes);
       }
     }
     return TEST_SUCCESS;
@@ -116,8 +110,8 @@ namespace RcclUnitTesting
   ErrCode CollectiveArgs::ValidateResults()
   {
     // Ignore non-root outputs for collectives with a root
-    if (CollectiveArgs::UsesRoot(this->funcType) && this->root != this->globalRank) return TEST_SUCCESS;
-
+    if (CollectiveArgs::UsesRoot(this->funcType) && this->options.root != this->globalRank) return TEST_SUCCESS;
+    if (this->funcType == ncclCollSend) return TEST_SUCCESS; // on the send receive pair only recv needs to be checked
     size_t const numOutputBytes = (this->numOutputElements * DataTypeToBytes(this->dataType));
 
     CHECK_HIP(hipMemcpy(this->outputCpu.ptr, this->outputGpu.ptr, numOutputBytes, hipMemcpyDeviceToHost));
@@ -153,8 +147,9 @@ namespace RcclUnitTesting
 
     if (this->localScalar.ptr != nullptr)
     {
-      if (this->scalarMode == 0) this->localScalar.FreeGpuMem();
-      if (this->scalarMode == 1) CHECK_HIP(hipHostFree(this->localScalar.ptr));
+      if (this->options.scalarMode == 0) this->localScalar.FreeGpuMem();
+      if (this->options.scalarMode == 1) CHECK_HIP(hipHostFree(this->localScalar.ptr));
+      this->localScalar.Attach(nullptr);
     }
     return TEST_SUCCESS;
   }
@@ -174,6 +169,7 @@ namespace RcclUnitTesting
     case ncclCollGather:        ss << "ncclGather";        break;
     case ncclCollScatter:       ss << "ncclScatter";       break;
     case ncclCollAllToAll:      ss << "ncclAllToAll";      break;
+    case ncclCollAllToAllv:     ss << "ncclAllToAllv";     break;
     case ncclCollSend:          ss << "ncclSend";          break;
     case ncclCollRecv:          ss << "ncclRecv";          break;
     default:                    ss << "[Unknown]";         break;
@@ -184,9 +180,9 @@ namespace RcclUnitTesting
         this->funcType == ncclCollReduceScatter ||
         this->funcType == ncclCollAllReduce)
     {
-      if (this->redOp < ncclNumOps)
+      if (this->options.redOp < ncclNumOps)
       {
-        ss << ncclRedOpNames[this->redOp] << " ";
+        ss << ncclRedOpNames[this->options.redOp] << " ";
       }
       else
       {
@@ -215,13 +211,13 @@ namespace RcclUnitTesting
         this->funcType == ncclCollGather ||
         this->funcType == ncclCollScatter)
     {
-      ss << "Root " << this->root << " ";
+      ss << "Root " << this->options.root << " ";
     }
 
     if (this->funcType == ncclCollSend ||
         this->funcType == ncclCollRecv)
     {
-      ss << "Peer " << this->root << " ";
+      ss << "Peer " << this->options.root << " ";
     }
 
     ss << "#In: " << this->numInputElements;
@@ -277,7 +273,6 @@ namespace RcclUnitTesting
     return (funcType == ncclCollBroadcast ||
             funcType == ncclCollReduce    ||
             funcType == ncclCollGather    ||
-            funcType == ncclCollScatter   ||
-            funcType == ncclCollSend); // this is incorrect but it works because in Send root is not root it is the peer
+            funcType == ncclCollScatter);
   }
 }
diff --git a/test/common/CollectiveArgs.hpp b/test/common/CollectiveArgs.hpp
index e7d294bc81..9caf2ab33f 100644
--- a/test/common/CollectiveArgs.hpp
+++ b/test/common/CollectiveArgs.hpp
@@ -22,6 +22,7 @@ namespace RcclUnitTesting
     ncclCollGather,
     ncclCollScatter,
     ncclCollAllToAll,
+    ncclCollAllToAllv,
     ncclCollSend,
     ncclCollRecv,
     ncclNumFuncs
@@ -37,6 +38,7 @@ namespace RcclUnitTesting
     "Gather",
     "Scatter",
     "AllToAll",
+    "AllToAllv",
     "Send",
     "Recv"
   };
@@ -72,6 +74,19 @@ namespace RcclUnitTesting
     char ptr[MAX_RANKS * sizeof(double)];
   };
 
+  struct OptionalColArgs
+  {
+    ncclRedOp_t     redOp = ncclSum;
+    int             root = 0; // Used as "peer" for Send/Recv
+    ScalarTransport scalarTransport;   // Used for custom reduction operators
+    int             scalarMode = -1; // -1 if scalar not used
+    // allToAllv args
+    size_t          sendcounts[MAX_RANKS*MAX_RANKS];
+    size_t          sdispls[MAX_RANKS*MAX_RANKS];
+    size_t          recvcounts[MAX_RANKS*MAX_RANKS];
+    size_t          rdispls[MAX_RANKS*MAX_RANKS];
+  };
+
   // Function pointer for functions that operate on CollectiveArgs
   // e.g. For filling input / computing expected results
   typedef ErrCode (*CollFuncPtr)(CollectiveArgs &);
@@ -85,13 +100,10 @@ namespace RcclUnitTesting
     int             deviceId;
     ncclFunc_t      funcType;
     ncclDataType_t  dataType;
-    ncclRedOp_t     redOp;
-    int             root;              // Used as "peer" for Send/Recv
     size_t          numInputElements;
     size_t          numOutputElements;
-    ScalarTransport scalarTransport;   // Used for custom reduction operators
     PtrUnion        localScalar;
-    int             scalarMode;        // -1 if scalar not used
+    OptionalColArgs options;
 
     // Data
     PtrUnion       inputGpu;
@@ -111,12 +123,9 @@ namespace RcclUnitTesting
                     int             const deviceId,
                     ncclFunc_t      const funcType,
                     ncclDataType_t  const dataType,
-                    ncclRedOp_t     const redOp,
-                    int             const root,
                     size_t          const numInputElements,
                     size_t          const numOutputElements,
-                    ScalarTransport const scalarsPerRank,
-                    int             const scalarMode = -1);
+                    OptionalColArgs const &optionalArgs = {});
 
     // Allocates GPU memory for input/output and CPU memory for expected
     // When inPlace is true, input and output share the same memory
diff --git a/test/common/PrepDataFuncs.cpp b/test/common/PrepDataFuncs.cpp
index 4f837d76c7..cde4d73c34 100644
--- a/test/common/PrepDataFuncs.cpp
+++ b/test/common/PrepDataFuncs.cpp
@@ -23,6 +23,7 @@ namespace RcclUnitTesting
     case ncclCollGather:        return DefaultPrepData_Gather(collArgs, false);
     case ncclCollScatter:       return DefaultPrepData_Scatter(collArgs);
     case ncclCollAllToAll:      return DefaultPrepData_AllToAll(collArgs);
+    case ncclCollAllToAllv:     return DefaultPrepData_AllToAllv(collArgs);
     case ncclCollSend:          return DefaultPrepData_Send(collArgs);
     case ncclCollRecv:          return DefaultPrepData_Recv(collArgs);
     default:
@@ -64,15 +65,15 @@ namespace RcclUnitTesting
     CHECK_CALL(collArgs.outputGpu.ClearGpuMem(numBytes));
 
     // Only root needs input pattern
-    if (collArgs.globalRank == collArgs.root)
+    if (collArgs.globalRank == collArgs.options.root)
       CHECK_CALL(collArgs.inputGpu.FillPattern(collArgs.dataType,
                                                collArgs.numInputElements,
-                                               collArgs.root, true));
+                                               collArgs.options.root, true));
 
     // Otherwise all other ranks expected output is the same as input of root
     return collArgs.expected.FillPattern(collArgs.dataType,
                                          collArgs.numInputElements,
-                                         collArgs.root,
+                                         collArgs.options.root,
                                          false);
   }
 
@@ -96,11 +97,11 @@ namespace RcclUnitTesting
     CHECK_CALL(result.ClearCpuMem(numBytes));
 
     // If average or custom reduction operator is used, perform a summation instead
-    ncclRedOp_t const tempOp = (collArgs.redOp >= ncclAvg ? ncclSum : collArgs.redOp);
+    ncclRedOp_t const tempOp = (collArgs.options.redOp >= ncclAvg ? ncclSum : collArgs.options.redOp);
 
     // Loop over each rank and generate their input into a temp buffer, then reduce
     PtrUnion scalarsPerRank;
-    scalarsPerRank.Attach(collArgs.scalarTransport.ptr);
+    scalarsPerRank.Attach(collArgs.options.scalarTransport.ptr);
 
     PtrUnion tempInputCpu;
     CHECK_CALL(tempInputCpu.Attach(collArgs.outputCpu));
@@ -117,14 +118,14 @@ namespace RcclUnitTesting
 
       // Scale the temporary input by local scalar for this rank
       // (Used by custom reduction ops)
-      if (collArgs.scalarMode >= 0)
+      if (collArgs.options.scalarMode >= 0)
       {
         CHECK_CALL(tempInputCpu.Scale(collArgs.dataType, collArgs.numInputElements,
                                       scalarsPerRank, rank));
       }
 
       // Any rank that requires output reduces the scaled-inputs
-      if (isAllReduce || collArgs.root == collArgs.globalRank)
+      if (isAllReduce || collArgs.options.root == collArgs.globalRank)
       {
         if (rank == 0)
         {
@@ -139,7 +140,7 @@ namespace RcclUnitTesting
     }
 
     // Perform averaging if necessary
-    if (collArgs.redOp == ncclAvg && (isAllReduce || collArgs.root == collArgs.globalRank))
+    if (collArgs.options.redOp == ncclAvg && (isAllReduce || collArgs.options.root == collArgs.globalRank))
     {
       CHECK_CALL(result.DivideByInt(collArgs.dataType, collArgs.numInputElements, collArgs.totalRanks));
     }
@@ -176,7 +177,7 @@ namespace RcclUnitTesting
       {
         CHECK_HIP(hipMemcpy(collArgs.inputGpu.ptr, tempInputCpu.ptr, numInputBytes, hipMemcpyHostToDevice));
       }
-      if (isAllGather || collArgs.root == collArgs.globalRank)
+      if (isAllGather || collArgs.options.root == collArgs.globalRank)
       {
         memcpy(result.I1 + (rank * numInputBytes), tempInputCpu.ptr, numInputBytes);
       }
@@ -207,11 +208,11 @@ namespace RcclUnitTesting
     CHECK_CALL(tempResultCpu.ClearCpuMem(numInputBytes));
 
     // If average or custom reduction operator is used, perform a summation instead
-    ncclRedOp_t const tempOp = (collArgs.redOp >= ncclAvg ? ncclSum : collArgs.redOp);
+    ncclRedOp_t const tempOp = (collArgs.options.redOp >= ncclAvg ? ncclSum : collArgs.options.redOp);
 
     // Loop over each rank and generate the input / scale / reduce
     PtrUnion scalarsPerRank;
-    scalarsPerRank.Attach(collArgs.scalarTransport.ptr);
+    scalarsPerRank.Attach(collArgs.options.scalarTransport.ptr);
     for (int rank = 0; rank < collArgs.totalRanks; ++rank)
     {
       CHECK_CALL(tempInputCpu.FillPattern(collArgs.dataType, collArgs.numInputElements, rank, false));
@@ -229,7 +230,7 @@ namespace RcclUnitTesting
 
       // Scale the temporary input by local scalar for this rank
       // (Used by custom reduction ops)
-      if (collArgs.scalarMode >= 0)
+      if (collArgs.options.scalarMode >= 0)
       {
         CHECK_CALL(tempInputCpu.Scale(collArgs.dataType, collArgs.numInputElements,
                                       scalarsPerRank, rank));
@@ -247,7 +248,7 @@ namespace RcclUnitTesting
     }
 
     // Perform averaging if necessary
-    if (collArgs.redOp == ncclAvg)
+    if (collArgs.options.redOp == ncclAvg)
     {
       CHECK_CALL(tempResultCpu.DivideByInt(collArgs.dataType, collArgs.numInputElements, collArgs.totalRanks));
     }
@@ -279,10 +280,10 @@ namespace RcclUnitTesting
     // Generate input as if on root rank - each rank will receive a portion
     PtrUnion tempInput;
     tempInput.AllocateCpuMem(numInputBytes);
-    tempInput.FillPattern(collArgs.dataType, collArgs.numInputElements, collArgs.root, false);
+    tempInput.FillPattern(collArgs.dataType, collArgs.numInputElements, collArgs.options.root, false);
 
     // Copy input to root rank
-    if (collArgs.globalRank == collArgs.root)
+    if (collArgs.globalRank == collArgs.options.root)
     {
       if (hipMemcpy(collArgs.inputGpu.ptr, tempInput.ptr, numInputBytes, hipMemcpyHostToDevice) != hipSuccess)
       {
@@ -341,6 +342,45 @@ namespace RcclUnitTesting
     return TEST_SUCCESS;
   }
 
+  ErrCode DefaultPrepData_AllToAllv(CollectiveArgs &collArgs)
+  {
+
+    CHECK_CALL(CheckAllocation(collArgs));
+    size_t const numInputBytes = collArgs.numInputElements * DataTypeToBytes(collArgs.dataType);
+    size_t const numOutputBytes = collArgs.numOutputElements * DataTypeToBytes(collArgs.dataType);
+
+    // calculating maxNumElements  as the maximum number of input bytes out of all the ranks
+    size_t maxNumElements  = 0;
+    for (int sendRank = 0; sendRank < collArgs.totalRanks; ++sendRank)
+    for (int recvRank = 0; recvRank < collArgs.totalRanks; ++recvRank)
+    {
+      size_t rankSendCount = collArgs.options.sdispls[(sendRank)*collArgs.totalRanks+recvRank] + collArgs.options.sendcounts[(sendRank)*collArgs.totalRanks+recvRank];
+      maxNumElements = std::max(maxNumElements, rankSendCount);
+    }
+
+    // Clear outputs on all ranks (prior to input in case of in-place)
+    collArgs.outputGpu.ClearGpuMem(numOutputBytes);
+
+    // Generate input on root rank - each rank will receive a portion
+    PtrUnion tempInput;
+    tempInput.AllocateCpuMem(maxNumElements*DataTypeToBytes(collArgs.dataType));
+
+    for (int sendRank = 0; sendRank < collArgs.totalRanks; ++sendRank)
+    {
+      tempInput.FillPattern(collArgs.dataType, maxNumElements, sendRank, false);
+      size_t recvDspls = collArgs.options.rdispls[collArgs.globalRank*collArgs.totalRanks + sendRank] * DataTypeToBytes(collArgs.dataType);
+      size_t sendDspls = collArgs.options.sdispls[sendRank*collArgs.totalRanks + collArgs.globalRank] * DataTypeToBytes(collArgs.dataType);
+      size_t numBytes = collArgs.options.recvcounts[collArgs.globalRank*collArgs.totalRanks + sendRank] * DataTypeToBytes(collArgs.dataType);
+      memcpy(collArgs.expected.U1 + recvDspls, tempInput.U1 + sendDspls, numBytes);
+    }
+    tempInput.FillPattern(collArgs.dataType, collArgs.numInputElements, collArgs.globalRank, false);
+
+    CHECK_HIP(hipMemcpy(collArgs.inputGpu.ptr, tempInput.ptr, numInputBytes, hipMemcpyHostToDevice));
+
+    tempInput.FreeCpuMem();
+    return TEST_SUCCESS;
+  }
+
   ErrCode DefaultPrepData_Send(CollectiveArgs &collArgs)
   {
     CHECK_CALL(CheckAllocation(collArgs));
@@ -354,7 +394,7 @@ namespace RcclUnitTesting
     CHECK_CALL(CheckAllocation(collArgs));
     return collArgs.expected.FillPattern(collArgs.dataType,
                                          collArgs.numOutputElements,
-                                         collArgs.root,
+                                         collArgs.options.root,
                                          false);
   }
 }
diff --git a/test/common/PrepDataFuncs.hpp b/test/common/PrepDataFuncs.hpp
index 1dc7f0dd3a..d610b917cd 100644
--- a/test/common/PrepDataFuncs.hpp
+++ b/test/common/PrepDataFuncs.hpp
@@ -22,6 +22,7 @@ namespace RcclUnitTesting
   ErrCode DefaultPrepData_ReduceScatter(CollectiveArgs &collArgs);
   ErrCode DefaultPrepData_Scatter(CollectiveArgs &collArgs);
   ErrCode DefaultPrepData_AllToAll(CollectiveArgs &collArgs);
+  ErrCode DefaultPrepData_AllToAllv(CollectiveArgs &collArgs);
   ErrCode DefaultPrepData_Send(CollectiveArgs &collArgs);
   ErrCode DefaultPrepData_Recv(CollectiveArgs &collArgs);
 }
diff --git a/test/common/TestBed.cpp b/test/common/TestBed.cpp
index edbab4faec..d3a962ac7f 100644
--- a/test/common/TestBed.cpp
+++ b/test/common/TestBed.cpp
@@ -155,32 +155,19 @@ namespace RcclUnitTesting
     InitComms(TestBed::GetDeviceIdsList(1, numGpus), numCollectivesInGroup);
   }
 
-  void TestBed::SetCollectiveArgs(ncclFunc_t     const funcType,
-                                  ncclDataType_t const dataType,
-                                  ncclRedOp_t    const redOp,
-                                  int            const root,
-                                  size_t         const numInputElements,
-                                  size_t         const numOutputElements,
-                                  int            const collId,
-                                  int            const rank,
-                                  PtrUnion       const scalarsPerRank,
-                                  int            const scalarMode)
+  void TestBed::SetCollectiveArgs(ncclFunc_t      const funcType,
+                                  ncclDataType_t  const dataType,
+                                  size_t          const numInputElements,
+                                  size_t          const numOutputElements,
+                                  OptionalColArgs const &optionalArgs,
+                                  int             const collId,
+                                  int             const rank)
   {
     // Build list of ranks this applies to (-1 for rank means to set for all)
     std::vector<int> rankList;
     for (int i = 0; i < this->numActiveRanks; ++i)
       if (rank == -1 || rank == i) rankList.push_back(i);
 
-    ScalarTransport scalarTransport;
-    if (scalarMode >= 0)
-    {
-      ASSERT_TRUE(scalarsPerRank.ptr != NULL);
-
-      // Capture scalars per rank in format to share with child processes
-      int const numBytes = this->numActiveRanks * DataTypeToBytes(dataType);
-      memcpy(scalarTransport.ptr, scalarsPerRank.ptr, numBytes);
-    }
-
     // Loop over all ranks and send CollectiveArgs to appropriate child process
     int const cmd = TestBedChild::CHILD_SET_COLL_ARGS;
     for (auto currRank : rankList)
@@ -191,12 +178,9 @@ namespace RcclUnitTesting
       PIPE_WRITE(childId, collId);
       PIPE_WRITE(childId, funcType);
       PIPE_WRITE(childId, dataType);
-      PIPE_WRITE(childId, redOp);
-      PIPE_WRITE(childId, root);
       PIPE_WRITE(childId, numInputElements);
       PIPE_WRITE(childId, numOutputElements);
-      PIPE_WRITE(childId, scalarMode);
-      PIPE_WRITE(childId, scalarTransport);
+      PIPE_WRITE(childId, optionalArgs);
       PIPE_CHECK(childId);
     }
   }
@@ -412,7 +396,7 @@ namespace RcclUnitTesting
     // Sort numElements in descending order to cut down on # of allocations
     std::vector<int> sortedN = numElements;
     std::sort(sortedN.rbegin(), sortedN.rend());
-
+    OptionalColArgs optionalArgs;
     // Filter out any unsupported datatypes, in case only subset has been compiled for
     std::vector<ncclDataType_t> const& supportedDataTypes = this->GetAllSupportedDataTypes();
     std::vector<ncclDataType_t> dataTypes;
@@ -479,13 +463,13 @@ namespace RcclUnitTesting
                                                     totalRanks,
                                                     &numInputElements,
                                                     &numOutputElements);
-
+          optionalArgs.redOp = redOps[rdIdx];
+          optionalArgs.root = roots[rtIdx];
           this->SetCollectiveArgs(funcTypes[ftIdx],
                                   dataTypes[dtIdx],
-                                  redOps[rdIdx],
-                                  roots[rtIdx],
                                   numInputElements,
-                                  numOutputElements);
+                                  numOutputElements,
+                                  optionalArgs);
 
           // Only allocate once for largest size
           if (neIdx == 0) this->AllocateMem(inPlaceList[ipIdx], managedMemList[mmIdx]);
diff --git a/test/common/TestBed.hpp b/test/common/TestBed.hpp
index f06bb031ee..73e4070fdd 100644
--- a/test/common/TestBed.hpp
+++ b/test/common/TestBed.hpp
@@ -41,16 +41,13 @@ namespace RcclUnitTesting
     // Using collId = -1 (default) applies settings to all collectives in group
     // Using rank = -1 (default) applies settings to all ranks
 
-    void SetCollectiveArgs(ncclFunc_t     const funcType,
-                           ncclDataType_t const dataType,
-                           ncclRedOp_t    const redOp,
-                           int            const root,
-                           size_t         const numInputElements,
-                           size_t         const numOutputElements,
-                           int            const collId = -1,
-                           int            const rank = -1,
-                           PtrUnion       const scalarsPerRank = {nullptr},
-                           int            const scalarMode = -1);
+    void SetCollectiveArgs(ncclFunc_t      const funcType,
+                           ncclDataType_t  const dataType,
+                           size_t          const numInputElements,
+                           size_t          const numOutputElements,
+                           OptionalColArgs const &optionalArgs = {},
+                           int             const collId = -1,
+                           int             const rank = -1);
 
     // Allocate memory for specified collective / rank
     // - Requires SetCollectiveArgs to have been called already
diff --git a/test/common/TestBedChild.cpp b/test/common/TestBedChild.cpp
index a291f741c6..ceaa9522c2 100644
--- a/test/common/TestBedChild.cpp
+++ b/test/common/TestBedChild.cpp
@@ -188,29 +188,17 @@ namespace RcclUnitTesting
     int             collId;
     ncclFunc_t      funcType;
     ncclDataType_t  dataType;
-    ncclRedOp_t     redOp;
-    int             root;
     size_t          numInputElements;
     size_t          numOutputElements;
-    ScalarTransport scalarTransport;
-    int             scalarMode;
+    OptionalColArgs options;
 
     PIPE_READ(globalRank);
     PIPE_READ(collId);
     PIPE_READ(funcType);
     PIPE_READ(dataType);
-    PIPE_READ(redOp);
-    PIPE_READ(root);
     PIPE_READ(numInputElements);
     PIPE_READ(numOutputElements);
-    PIPE_READ(scalarMode);
-    PIPE_READ(scalarTransport);
-
-    for (int i = 0; i < this->totalRanks; i++)
-    {
-      PtrUnion scalarsPerRank;
-      scalarsPerRank.Attach(scalarTransport.ptr);
-    }
+    PIPE_READ(options);
 
     if (globalRank < this->rankOffset || (this->rankOffset + comms.size() <= globalRank))
     {
@@ -227,24 +215,24 @@ namespace RcclUnitTesting
         CollectiveArgs& collArg = this->collArgs[localRank][collIdx];
         CHECK_CALL(collArg.SetArgs(globalRank, this->totalRanks,
                                    this->deviceIds[localRank],
-                                   funcType, dataType, redOp, root,
+                                   funcType, dataType,
                                    numInputElements, numOutputElements,
-                                   scalarTransport, scalarMode));
+                                   options));
         if (this->verbose) INFO("Rank %d on child %d sets collective %d [%s]\n",
                                 globalRank, this->childId, collIdx,
                                 collArg.GetDescription().c_str());
 
         // If pre-mult scalars are provided, then create a custom reduction operator
-        if (scalarMode >= 0)
+        if (options.scalarMode >= 0)
         {
-          CHILD_NCCL_CALL(ncclRedOpCreatePreMulSum(&collArg.redOp,
+          CHILD_NCCL_CALL(ncclRedOpCreatePreMulSum(&collArg.options.redOp,
                                                    collArg.localScalar.ptr,
                                                    dataType,
-                                                   (ncclScalarResidence_t)scalarMode,
+                                                   (ncclScalarResidence_t)options.scalarMode,
                                                    this->comms[localRank]),
                           "ncclRedOpCreatePreMulSum");
           if (verbose) INFO("Child %d created custom redop %d for collective %d\n",
-                            this->childId, collArg.redOp, collIdx);
+                            this->childId, collArg.options.redOp, collIdx);
         }
       }
     }
@@ -383,7 +371,7 @@ namespace RcclUnitTesting
                                         collArg.outputGpu.ptr,
                                         collArg.numInputElements,
                                         collArg.dataType,
-                                        collArg.root,
+                                        collArg.options.root,
                                         this->comms[localRank],
                                         this->streams[localRank]),
                           "ncclBroadcast");
@@ -393,8 +381,8 @@ namespace RcclUnitTesting
                                      collArg.outputGpu.ptr,
                                      collArg.numInputElements,
                                      collArg.dataType,
-                                     collArg.redOp,
-                                     collArg.root,
+                                     collArg.options.redOp,
+                                     collArg.options.root,
                                      this->comms[localRank],
                                      this->streams[localRank]),
                           "ncclReduce");
@@ -413,7 +401,7 @@ namespace RcclUnitTesting
                                             collArg.outputGpu.ptr,
                                             collArg.numOutputElements,
                                             collArg.dataType,
-                                            collArg.redOp,
+                                            collArg.options.redOp,
                                             this->comms[localRank],
                                             this->streams[localRank]),
                           "ncclReduceScatter");
@@ -423,7 +411,7 @@ namespace RcclUnitTesting
                                         collArg.outputGpu.ptr,
                                         collArg.numInputElements,
                                         collArg.dataType,
-                                        collArg.redOp,
+                                        collArg.options.redOp,
                                         this->comms[localRank],
                                         this->streams[localRank]),
                           "ncclAllReduce");
@@ -433,7 +421,7 @@ namespace RcclUnitTesting
                                      collArg.outputGpu.ptr,
                                      collArg.numInputElements,
                                      collArg.dataType,
-                                     collArg.root,
+                                     collArg.options.root,
                                      this->comms[localRank],
                                      this->streams[localRank]),
                           "ncclGather");
@@ -443,7 +431,7 @@ namespace RcclUnitTesting
                                       collArg.outputGpu.ptr,
                                       collArg.numOutputElements,
                                       collArg.dataType,
-                                      collArg.root,
+                                      collArg.options.root,
                                       this->comms[localRank],
                                       this->streams[localRank]),
                           "ncclScatter");
@@ -457,11 +445,23 @@ namespace RcclUnitTesting
                                        this->streams[localRank]),
                           "ncclAllToAll");
           break;
+        case ncclCollAllToAllv:
+          CHILD_NCCL_CALL(ncclAllToAllv(collArg.inputGpu.ptr,
+                                        collArg.options.sendcounts + (this->rankOffset + localRank)*this->totalRanks,
+                                        collArg.options.sdispls + (this->rankOffset + localRank)*this->totalRanks,
+                                        collArg.outputGpu.ptr,
+                                        collArg.options.recvcounts + (this->rankOffset + localRank)*this->totalRanks,
+                                        collArg.options.rdispls + (this->rankOffset + localRank)*this->totalRanks,
+                                        collArg.dataType,
+                                        this->comms[localRank],
+                                        this->streams[localRank]),
+                          "ncclAllToAllv");
+          break;
         case ncclCollSend:
           CHILD_NCCL_CALL(ncclSend(collArg.inputGpu.ptr,
                                    collArg.numInputElements,
                                    collArg.dataType,
-                                   collArg.root,
+                                   collArg.options.root,
                                    this->comms[localRank],
                                    this->streams[localRank]),
                           "ncclSend");
@@ -470,7 +470,7 @@ namespace RcclUnitTesting
           CHILD_NCCL_CALL(ncclRecv(collArg.outputGpu.ptr,
                                    collArg.numOutputElements,
                                    collArg.dataType,
-                                   collArg.root,
+                                   collArg.options.root,
                                    this->comms[localRank],
                                    this->streams[localRank]),
                           "ncclRecv");
@@ -579,12 +579,12 @@ namespace RcclUnitTesting
 
         CHECK_CALL(collArg.DeallocateMem());
       }
-      if (collArg.scalarMode != -1)
+      if (collArg.options.scalarMode != -1)
       {
-        CHILD_NCCL_CALL(ncclRedOpDestroy(collArg.redOp, this->comms[localRank]),
+        CHILD_NCCL_CALL(ncclRedOpDestroy(collArg.options.redOp, this->comms[localRank]),
                         "ncclRedOpDestroy");
         if (verbose) INFO("Child %d destroys custom redop %d for collective %d\n",
-                          this->childId, collArg.redOp, collIdx);
+                          this->childId, collArg.options.redOp, collIdx);
       }
     }
     if (this->verbose) INFO("Child %d finishes DeallocateMem\n", this->childId);