From 4751992231f9b4d46f486bfaf4762a1570d91381 Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Tue, 1 Sep 2020 21:05:37 +0000 Subject: [PATCH 1/2] Make data alignment requirements matching ISA manual From https://developer.amd.com/wp-content/resources/Vega_Shader_ISA.pdf 8.1.7. Alignment For Dword or larger reads or writes, the two LSBs of the byte-address are ignored, thus forcing Dword alignment. --- src/collectives/device/common_kernel.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/collectives/device/common_kernel.h b/src/collectives/device/common_kernel.h index 28e86c3ca9..c5092cf52a 100644 --- a/src/collectives/device/common_kernel.h +++ b/src/collectives/device/common_kernel.h @@ -343,8 +343,12 @@ __device__ void ReduceCopy128bMulti( const int w, const int nw, const int t, } } +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) template +__device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(int32_t); } +#else __device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(Pack128); } +#endif // Try to limit consecutive load/stores to 8. // Use UNROLL 8 when we have a single source and a single destination, 4 otherwise @@ -366,9 +370,15 @@ __device__ void ReduceOrCopyMulti(const int tid, const int nthreads, for (int i=0; i Date: Tue, 1 Sep 2020 22:32:14 +0000 Subject: [PATCH 2/2] gtest: add alltoallv test --- test/CMakeLists.txt | 3 +- test/test_AllToAllv.cpp | 94 +++++++++++++++++++++++++++++++++++++++++ test/test_AllToAllv.hpp | 44 +++++++++++++++++++ 3 files changed, 140 insertions(+), 1 deletion(-) create mode 100644 test/test_AllToAllv.cpp create mode 100644 test/test_AllToAllv.hpp diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 456b13f25b..4a01e3746e 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -20,7 +20,8 @@ if(BUILD_TESTS) test_BroadcastAbort.cpp test_Scatter.cpp test_Gather.cpp - test_AllToAll.cpp + test_AllToAll.cpp + test_AllToAllv.cpp ) add_executable(UnitTests ${TEST_SOURCES}) diff --git a/test/test_AllToAllv.cpp b/test/test_AllToAllv.cpp new file mode 100644 index 0000000000..8223e4e9c8 --- /dev/null +++ b/test/test_AllToAllv.cpp @@ -0,0 +1,94 @@ +/************************************************************************* + * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "test_AllToAllv.hpp" + +namespace CorrectnessTests +{ + TEST_P(AllToAllvCorrectnessTest, Correctness) + { + if (numDevices > numDevicesAvailable) return; + + // Allocate data + Dataset dataset; + dataset.Initialize(numDevices, numElements, dataType, inPlace, ncclCollAllToAll); + + // Prepare input / output / expected results + FillDatasetWithPattern(dataset); + ComputeExpectedResults(dataset); + + size_t chunksize = numElements*2/numDevices; + #define MAX_ALLTOALLV_RANKS 16 + static size_t sendcounts[MAX_ALLTOALLV_RANKS], recvcounts[MAX_ALLTOALLV_RANKS], sdispls[MAX_ALLTOALLV_RANKS], rdispls[MAX_ALLTOALLV_RANKS]; + // Launch the reduction (1 thread per GPU) + ncclGroupStart(); + for (int r = 0; r < numDevices; r++) { + size_t disp = 0; + for (int i = 0; i < numDevices; i++) { + size_t scount = ((i+r)%numDevices)*chunksize; + if (i+r == numDevices-1) + scount += (numElements*numDevices-chunksize*(numDevices-1)*numDevices/2); + sendcounts[i] = recvcounts[i] = scount; + sdispls[i] = rdispls[i] = disp; + disp += scount; + } + for (int i = 0; i < numDevices; i++) { + if (sendcounts[i] != 0) { + ncclSend( + ((char*)dataset.inputs[r]) + sdispls[i] * DataTypeToBytes(dataType), + sendcounts[i], + dataType, + i, + comms[r], + streams[r]); + } + if (recvcounts[i] != 0) { + ncclRecv( + ((char*)dataset.outputs[r]) + rdispls[i] * DataTypeToBytes(dataType), + recvcounts[i], + dataType, + i, + comms[r], + streams[r]); + } + } + } + ncclGroupEnd(); + + // Wait for reduction to complete + Synchronize(); + + // Check results + ValidateResults(dataset); + + dataset.Release(); + } + + INSTANTIATE_TEST_SUITE_P(AllToAllvCorrectnessSweep, + AllToAllvCorrectnessTest, + testing::Combine( + // Reduction operator is not used + testing::Values(ncclSum), + // Data types + testing::Values(ncclInt8, + ncclUint8, + ncclInt32, + ncclUint32, + ncclInt64, + ncclUint64, + //ncclFloat16, + ncclFloat32, + ncclFloat64, + ncclBfloat16), + // Number of elements + testing::Values(2520, 3026520), + // Number of devices + testing::Values(2,3,4,5,6,7,8), + // In-place or not + testing::Values(false), + testing::Values("")), + CorrectnessTest::PrintToStringParamName()); +} // namespace diff --git a/test/test_AllToAllv.hpp b/test/test_AllToAllv.hpp new file mode 100644 index 0000000000..e5ebfb1955 --- /dev/null +++ b/test/test_AllToAllv.hpp @@ -0,0 +1,44 @@ +/************************************************************************* + * Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ +#ifndef TEST_ALLTOALLV_HPP +#define TEST_ALLTOALLV_HPP + +#include "CorrectnessTest.hpp" + +namespace CorrectnessTests +{ + class AllToAllvCorrectnessTest : public CorrectnessTest + { + public: + static void ComputeExpectedResults(Dataset& dataset) + { + for (int i = 0; i < dataset.numDevices; i++) { + size_t rdisp = 0; + size_t chunksize = dataset.numElements*2/dataset.numDevices; + for (int j = 0; j < dataset.numDevices; j++) { + size_t scount = 0, rcount = ((j+i)%dataset.numDevices)*chunksize; + if (j+i == dataset.numDevices-1) + rcount += (dataset.numElements*dataset.numDevices-chunksize*(dataset.numDevices-1)*dataset.numDevices/2); + size_t sdisp = 0; + for (int k=0; k