From 4751992231f9b4d46f486bfaf4762a1570d91381 Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Tue, 1 Sep 2020 21:05:37 +0000
Subject: [PATCH 1/2] Make data alignment requirements matching ISA manual

From https://developer.amd.com/wp-content/resources/Vega_Shader_ISA.pdf

8.1.7. Alignment
For Dword or larger reads or writes, the two LSBs of the byte-address
are ignored, thus forcing Dword alignment.
---
 src/collectives/device/common_kernel.h | 10 ++++++++++
 1 file changed, 10 insertions(+)
diff --git a/src/collectives/device/common_kernel.h b/src/collectives/device/common_kernel.h
index 28e86c3ca9..c5092cf52a 100644
--- a/src/collectives/device/common_kernel.h
+++ b/src/collectives/device/common_kernel.h
@@ -343,8 +343,12 @@ __device__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
   }
 }
 
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 template <typename T>
+__device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(int32_t); }
+#else
 __device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(Pack128); }
+#endif
 
 // Try to limit consecutive load/stores to 8.
 // Use UNROLL 8 when we have a single source and a single destination, 4 otherwise
@@ -366,9 +370,15 @@ __device__ void ReduceOrCopyMulti(const int tid, const int nthreads,
   for (int i=0; i<MINDSTS; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
   for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
 
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+  int Npreamble = alignDiff ? Nrem :
+    N < alignof(int32_t) ? N :
+    (alignof(int32_t) - align) % alignof(int32_t);
+#else
   int Npreamble = alignDiff ? Nrem :
     N < alignof(Pack128) ? N :
     (alignof(Pack128) - align) % alignof(Pack128);
+#endif
 
   // stage 1: preamble: handle any elements up to the point of everything coming
   // into alignment

From b163a8898f9bdc86a6ec0624ecec023d8c66b587 Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Tue, 1 Sep 2020 22:32:14 +0000
Subject: [PATCH 2/2] gtest: add alltoallv test

---
 test/CMakeLists.txt     |  3 +-
 test/test_AllToAllv.cpp | 94 +++++++++++++++++++++++++++++++++++++++++
 test/test_AllToAllv.hpp | 44 +++++++++++++++++++
 3 files changed, 140 insertions(+), 1 deletion(-)
 create mode 100644 test/test_AllToAllv.cpp
 create mode 100644 test/test_AllToAllv.hpp

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 456b13f25b..4a01e3746e 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -20,7 +20,8 @@ if(BUILD_TESTS)
     test_BroadcastAbort.cpp
     test_Scatter.cpp
     test_Gather.cpp
-    test_AllToAll.cpp    
+    test_AllToAll.cpp
+    test_AllToAllv.cpp
   )
 
   add_executable(UnitTests ${TEST_SOURCES})
diff --git a/test/test_AllToAllv.cpp b/test/test_AllToAllv.cpp
new file mode 100644
index 0000000000..8223e4e9c8
--- /dev/null
+++ b/test/test_AllToAllv.cpp
@@ -0,0 +1,94 @@
+/*************************************************************************
+ * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "test_AllToAllv.hpp"
+
+namespace CorrectnessTests
+{
+    TEST_P(AllToAllvCorrectnessTest, Correctness)
+    {
+        if (numDevices > numDevicesAvailable) return;
+
+        // Allocate data
+        Dataset dataset;
+        dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllToAll);
+
+        // Prepare input / output / expected results
+        FillDatasetWithPattern(dataset);
+        ComputeExpectedResults(dataset);
+
+        size_t chunksize = numElements*2/numDevices;
+        #define MAX_ALLTOALLV_RANKS 16
+        static size_t sendcounts[MAX_ALLTOALLV_RANKS], recvcounts[MAX_ALLTOALLV_RANKS], sdispls[MAX_ALLTOALLV_RANKS], rdispls[MAX_ALLTOALLV_RANKS];
+        // Launch the reduction (1 thread per GPU)
+        ncclGroupStart();
+        for (int r = 0; r < numDevices; r++) {
+            size_t disp = 0;
+            for (int i = 0; i < numDevices; i++) {
+                size_t scount = ((i+r)%numDevices)*chunksize;
+                if (i+r == numDevices-1)
+                  scount += (numElements*numDevices-chunksize*(numDevices-1)*numDevices/2);
+                sendcounts[i] = recvcounts[i] = scount;
+                sdispls[i] = rdispls[i] = disp;
+                disp += scount;
+            }
+            for (int i = 0; i < numDevices; i++) {
+                if (sendcounts[i] != 0) {
+                  ncclSend(
+                      ((char*)dataset.inputs[r]) + sdispls[i] * DataTypeToBytes(dataType),
+                      sendcounts[i],
+                      dataType,
+                      i,
+                      comms[r],
+                      streams[r]);
+                }
+                if (recvcounts[i] != 0) {
+                  ncclRecv(
+                      ((char*)dataset.outputs[r]) + rdispls[i] * DataTypeToBytes(dataType),
+                      recvcounts[i],
+                      dataType,
+                      i,
+                      comms[r],
+                      streams[r]);
+                }
+            }
+        }
+        ncclGroupEnd();
+
+        // Wait for reduction to complete
+        Synchronize();
+
+        // Check results
+        ValidateResults(dataset);
+
+        dataset.Release();
+    }
+
+    INSTANTIATE_TEST_SUITE_P(AllToAllvCorrectnessSweep,
+                            AllToAllvCorrectnessTest,
+                            testing::Combine(
+                                // Reduction operator is not used
+                                testing::Values(ncclSum),
+                                // Data types
+                                testing::Values(ncclInt8,
+                                                ncclUint8,
+                                                ncclInt32,
+                                                ncclUint32,
+                                                ncclInt64,
+                                                ncclUint64,
+                                                //ncclFloat16,
+                                                ncclFloat32,
+                                                ncclFloat64,
+                                                ncclBfloat16),
+                                // Number of elements
+                                testing::Values(2520, 3026520),
+                                // Number of devices
+                                testing::Values(2,3,4,5,6,7,8),
+                                // In-place or not
+                                testing::Values(false),
+                                testing::Values("")),
+                            CorrectnessTest::PrintToStringParamName());
+} // namespace
diff --git a/test/test_AllToAllv.hpp b/test/test_AllToAllv.hpp
new file mode 100644
index 0000000000..e5ebfb1955
--- /dev/null
+++ b/test/test_AllToAllv.hpp
@@ -0,0 +1,44 @@
+/*************************************************************************
+ * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#ifndef TEST_ALLTOALLV_HPP
+#define TEST_ALLTOALLV_HPP
+
+#include "CorrectnessTest.hpp"
+
+namespace CorrectnessTests
+{
+    class AllToAllvCorrectnessTest : public CorrectnessTest
+    {
+    public:
+        static void ComputeExpectedResults(Dataset& dataset)
+        {
+            for (int i = 0; i < dataset.numDevices; i++) {
+                size_t rdisp = 0;
+                size_t chunksize = dataset.numElements*2/dataset.numDevices;
+                for (int j = 0; j < dataset.numDevices; j++) {
+                    size_t scount = 0, rcount = ((j+i)%dataset.numDevices)*chunksize;
+                    if (j+i == dataset.numDevices-1)
+                        rcount += (dataset.numElements*dataset.numDevices-chunksize*(dataset.numDevices-1)*dataset.numDevices/2);
+                    size_t sdisp = 0;
+                    for (int k=0; k<dataset.numDevices; k++) {
+                        scount = ((k+j)%dataset.numDevices)*chunksize;
+                        if (k+j == dataset.numDevices-1)
+                          scount += (dataset.numElements*dataset.numDevices-chunksize*(dataset.numDevices-1)*dataset.numDevices/2);
+                        if (k == i)
+                          break;
+                        sdisp += scount;
+                    }
+                    HIP_CALL(hipMemcpy((int8_t *)dataset.expected[i]+rdisp*DataTypeToBytes(dataset.dataType),
+                        (int8_t *)dataset.inputs[j]+sdisp*DataTypeToBytes(dataset.dataType),
+                        rcount*DataTypeToBytes(dataset.dataType), hipMemcpyDeviceToHost));
+                    rdisp += rcount;
+                }
+            }
+        }
+    };
+}
+
+#endif