diff --git a/src/collectives/device/common_kernel.h b/src/collectives/device/common_kernel.h index 28e86c3ca9..c5092cf52a 100644 --- a/src/collectives/device/common_kernel.h +++ b/src/collectives/device/common_kernel.h @@ -343,8 +343,12 @@ __device__ void ReduceCopy128bMulti( const int w, const int nw, const int t, } } +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) template +__device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(int32_t); } +#else __device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(Pack128); } +#endif // Try to limit consecutive load/stores to 8. // Use UNROLL 8 when we have a single source and a single destination, 4 otherwise @@ -366,9 +370,15 @@ __device__ void ReduceOrCopyMulti(const int tid, const int nthreads, for (int i=0; i numDevicesAvailable) return; + + // Allocate data + Dataset dataset; + dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllToAll); + + // Prepare input / output / expected results + FillDatasetWithPattern(dataset); + ComputeExpectedResults(dataset); + + size_t chunksize = numElements*2/numDevices; + #define MAX_ALLTOALLV_RANKS 16 + static size_t sendcounts[MAX_ALLTOALLV_RANKS], recvcounts[MAX_ALLTOALLV_RANKS], sdispls[MAX_ALLTOALLV_RANKS], rdispls[MAX_ALLTOALLV_RANKS]; + // Launch the reduction (1 thread per GPU) + ncclGroupStart(); + for (int r = 0; r < numDevices; r++) { + size_t disp = 0; + for (int i = 0; i < numDevices; i++) { + size_t scount = ((i+r)%numDevices)*chunksize; + if (i+r == numDevices-1) + scount += (numElements*numDevices-chunksize*(numDevices-1)*numDevices/2); + sendcounts[i] = recvcounts[i] = scount; + sdispls[i] = rdispls[i] = disp; + disp += scount; + } + for (int i = 0; i < numDevices; i++) { + if (sendcounts[i] != 0) { + ncclSend( + ((char*)dataset.inputs[r]) + sdispls[i] * DataTypeToBytes(dataType), + sendcounts[i], + dataType, + i, + comms[r], + streams[r]); + } + if (recvcounts[i] != 0) { + ncclRecv( + ((char*)dataset.outputs[r]) + rdispls[i] * DataTypeToBytes(dataType), + recvcounts[i], + dataType, + i, + comms[r], + streams[r]); + } + } + } + ncclGroupEnd(); + + // Wait for reduction to complete + Synchronize(); + + // Check results + ValidateResults(dataset); + + dataset.Release(); + } + + INSTANTIATE_TEST_SUITE_P(AllToAllvCorrectnessSweep, + AllToAllvCorrectnessTest, + testing::Combine( + // Reduction operator is not used + testing::Values(ncclSum), + // Data types + testing::Values(ncclInt8, + ncclUint8, + ncclInt32, + ncclUint32, + ncclInt64, + ncclUint64, + //ncclFloat16, + ncclFloat32, + ncclFloat64, + ncclBfloat16), + // Number of elements + testing::Values(2520, 3026520), + // Number of devices + testing::Values(2,3,4,5,6,7,8), + // In-place or not + testing::Values(false), + testing::Values("")), + CorrectnessTest::PrintToStringParamName()); +} // namespace diff --git a/test/test_AllToAllv.hpp b/test/test_AllToAllv.hpp new file mode 100644 index 0000000000..e5ebfb1955 --- /dev/null +++ b/test/test_AllToAllv.hpp @@ -0,0 +1,44 @@ +/************************************************************************* + * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ +#ifndef TEST_ALLTOALLV_HPP +#define TEST_ALLTOALLV_HPP + +#include "CorrectnessTest.hpp" + +namespace CorrectnessTests +{ + class AllToAllvCorrectnessTest : public CorrectnessTest + { + public: + static void ComputeExpectedResults(Dataset& dataset) + { + for (int i = 0; i < dataset.numDevices; i++) { + size_t rdisp = 0; + size_t chunksize = dataset.numElements*2/dataset.numDevices; + for (int j = 0; j < dataset.numDevices; j++) { + size_t scount = 0, rcount = ((j+i)%dataset.numDevices)*chunksize; + if (j+i == dataset.numDevices-1) + rcount += (dataset.numElements*dataset.numDevices-chunksize*(dataset.numDevices-1)*dataset.numDevices/2); + size_t sdisp = 0; + for (int k=0; k