From ec1f43cd0530ec56fd7213e44cb4f48d3ddf268e Mon Sep 17 00:00:00 2001 From: Sameer Sahasrabuddhe Date: Thu, 21 Mar 2024 16:28:33 +0530 Subject: [PATCH] SWDEV-453422: introduce warp sync tests The following builtins are tested: - __all_sync, __any_sync, __ballot_sync and __activemask - __match_any_sync and __match_all_sync - __shfl_sync, __shfl_up_sync, __shfl_down_sync, and __shfl_xor_sync The tests for shfl (all variants) were manually validated on a CUDA setup. NOTE: - Unit_hipVoteSync_All temporarily disabled on Windows (SWDEV-452308). - All new tests temporarily disabled on CUDA (SWDEV-453145). Change-Id: I84b205a88aa24219d199c760793e2f19f0cf8f13 [ROCm/hip-tests commit: 53f0a9bd014cb3e76e39d020688987ed1c5f20b6] --- .../hipTestMain/config/config_amd_windows | 2 + .../catch/include/hip_test_common.hh | 15 + .../catch/unit/kernel/CMakeLists.txt | 2 - .../hip-tests/catch/unit/rtc/CMakeLists.txt | 1 + projects/hip-tests/catch/unit/rtc/shfl.cc | 42 +- projects/hip-tests/catch/unit/rtc/shfl.hh | 64 ++ .../hip-tests/catch/unit/rtc/shfl_sync.cc | 161 +++++ .../hip-tests/catch/unit/warp/CMakeLists.txt | 12 + .../catch/unit/warp/hipMatchSyncAllTests.cc | 281 +++++++++ .../catch/unit/warp/hipMatchSyncAnyTests.cc | 196 ++++++ .../catch/unit/warp/hipShflSyncDownTests.cc | 251 ++++++++ .../catch/unit/warp/hipShflSyncTests.cc | 176 ++++++ .../catch/unit/warp/hipShflSyncUpTests.cc | 243 +++++++ .../catch/unit/warp/hipShflSyncXorTests.cc | 234 +++++++ .../unit/{kernel => warp}/hipShflTests.cc | 0 .../{kernel => warp}/hipShflUpDownTest.cc | 0 .../catch/unit/warp/hipVoteSyncTests.cc | 595 ++++++++++++++++++ .../hip-tests/catch/unit/warp/warp_common.hh | 90 ++- 18 files changed, 2318 insertions(+), 47 deletions(-) create mode 100644 projects/hip-tests/catch/unit/rtc/shfl.hh create mode 100644 projects/hip-tests/catch/unit/rtc/shfl_sync.cc create mode 100644 projects/hip-tests/catch/unit/warp/hipMatchSyncAllTests.cc create mode 100644 projects/hip-tests/catch/unit/warp/hipMatchSyncAnyTests.cc create mode 100644 projects/hip-tests/catch/unit/warp/hipShflSyncDownTests.cc create mode 100644 projects/hip-tests/catch/unit/warp/hipShflSyncTests.cc create mode 100644 projects/hip-tests/catch/unit/warp/hipShflSyncUpTests.cc create mode 100644 projects/hip-tests/catch/unit/warp/hipShflSyncXorTests.cc rename projects/hip-tests/catch/unit/{kernel => warp}/hipShflTests.cc (100%) rename projects/hip-tests/catch/unit/{kernel => warp}/hipShflUpDownTest.cc (100%) create mode 100644 projects/hip-tests/catch/unit/warp/hipVoteSyncTests.cc diff --git a/projects/hip-tests/catch/hipTestMain/config/config_amd_windows b/projects/hip-tests/catch/hipTestMain/config/config_amd_windows index a10edd2be8..c9090bac58 100644 --- a/projects/hip-tests/catch/hipTestMain/config/config_amd_windows +++ b/projects/hip-tests/catch/hipTestMain/config/config_amd_windows @@ -1435,6 +1435,8 @@ "=== SWDEV-453453 : Below tests failed in stress test on 22/03/24 ===", "Unit_hipDeviceGetGraphMemAttribute_Functional", "Unit_hipDeviceGetGraphMemAttribute_Functional_Multi_Device", + "=== SWDEV-452308 : Windows failure, should be fixed for ROCm 6.3. ===", + "Unit_hipVoteSync_All", #endif "End of json" ] diff --git a/projects/hip-tests/catch/include/hip_test_common.hh b/projects/hip-tests/catch/include/hip_test_common.hh index 21707f7615..06be166269 100644 --- a/projects/hip-tests/catch/include/hip_test_common.hh +++ b/projects/hip-tests/catch/include/hip_test_common.hh @@ -170,6 +170,21 @@ static void initHipCtx(hipCtx_t* pcontext) { #define HIP_ARRAY hipArray_t #endif +static inline int getWarpSize() { +#if HT_NVIDIA + return 32; +#elif HT_AMD + int device = -1; + int warpSize = -1; + HIP_CHECK(hipGetDevice(&device)); + HIP_CHECK(hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, device)); + return warpSize; +#else + std::cout<<"Have to be either Nvidia or AMD platform, asserting"< #include #include - -static constexpr int n = 32; +#include "shfl.hh" static constexpr auto shfl { R"( @@ -63,45 +62,6 @@ __global__ void shflXorSum(T* a, int size) { } )"}; -void getFactor(int& fact) { fact = 101; } -void getFactor(__half& fact) { fact = 2.5; } - -template T sum(T* a) { - T cpuSum = 0; - T factor; - getFactor(factor); - for (int i = 0; i < n; i++) { - a[i] = i + factor; - cpuSum += a[i]; - } - return cpuSum; -} - -template bool compare(T gpuSum, T cpuSum) { - if (gpuSum != cpuSum) { - return true; - } - return false; -} - -template <> __half sum(__half* a) { - __half cpuSum = 0; - __half factor; - getFactor(factor); - for (int i = 0; i < n; i++) { - a[i] = i + __half2float(factor); - cpuSum = __half2float(cpuSum) + __half2float(a[i]); - } - return cpuSum; -} - -template <> bool compare(__half gpuSum, __half cpuSum) { - if (__half2float(gpuSum) != __half2float(cpuSum)) { - return true; - } - return false; -} - template void runTestShfl(int option) { using namespace std; diff --git a/projects/hip-tests/catch/unit/rtc/shfl.hh b/projects/hip-tests/catch/unit/rtc/shfl.hh new file mode 100644 index 0000000000..b21674f6d4 --- /dev/null +++ b/projects/hip-tests/catch/unit/rtc/shfl.hh @@ -0,0 +1,64 @@ +/* +Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +static constexpr int n = 32; + +inline void getFactor(int& fact) { fact = 101; } +inline void getFactor(__half& fact) { fact = 2.5; } + +template inline T sum(T* a) { + T cpuSum = 0; + T factor; + getFactor(factor); + for (int i = 0; i < n; i++) { + a[i] = i + factor; + cpuSum += a[i]; + } + return cpuSum; +} + +template inline bool compare(T gpuSum, T cpuSum) { + if (gpuSum != cpuSum) { + return true; + } + return false; +} + +template <> inline __half sum(__half* a) { + __half cpuSum = 0; + __half factor; + getFactor(factor); + for (int i = 0; i < n; i++) { + a[i] = i + __half2float(factor); + cpuSum = __half2float(cpuSum) + __half2float(a[i]); + } + return cpuSum; +} + +template <> inline bool compare(__half gpuSum, __half cpuSum) { + if (__half2float(gpuSum) != __half2float(cpuSum)) { + return true; + } + return false; +} diff --git a/projects/hip-tests/catch/unit/rtc/shfl_sync.cc b/projects/hip-tests/catch/unit/rtc/shfl_sync.cc new file mode 100644 index 0000000000..f7ded0d606 --- /dev/null +++ b/projects/hip-tests/catch/unit/rtc/shfl_sync.cc @@ -0,0 +1,161 @@ +/* +Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "shfl.hh" + +static constexpr auto shfl { + R"( +template +__global__ void shflUpSum(T* a, int size) { + T val = a[threadIdx.x]; + auto all_threads = __activemask(); + for (unsigned int i = size / 2; i > 0; i /= 2) { + val += __shfl_up_sync(all_threads, val, i, size); + } + a[threadIdx.x] = val; +} + +template +__global__ void shflDownSum(T* a, int size) { + T val = a[threadIdx.x]; + auto all_threads = __activemask(); + for (int i = size / 2; i > 0; i /= 2) { + val += __shfl_down_sync(all_threads, val, i, size); + } + a[threadIdx.x] = val; +} + +template +__global__ void shflXorSum(T* a, int size) { + T val = a[threadIdx.x]; + auto all_threads = __activemask(); + for (int i = size/2; i > 0; i /= 2) { + val += __shfl_xor_sync(all_threads, val, i, size); + } + a[threadIdx.x] = val; +} +)"}; + +template +void runTestShflSync(int option) { + using namespace std; + hiprtcProgram prog; + hiprtcCreateProgram(&prog, // prog + shfl, // buffer + "shfl_sync.cu", // name + 0, nullptr, nullptr); + + string str; + switch(option) { + case 1: + str = "shflUpSum<__half>"; break; + case 2: + str = "shflDownSum<__half>"; break; + case 3: + str = "shflXorSum<__half>"; break; + default: + INFO("Options 1,2,3 are supported, but the passed option is: " << option); + REQUIRE(false); + } + + hiprtcAddNameExpression(prog, str.c_str()); + const char* options[] = { "-DHIP_ENABLE_WARP_SYNC_BUILTINS" }; + hiprtcResult compileResult{hiprtcCompileProgram(prog, 1, options)}; + size_t logSize; + HIPRTC_CHECK(hiprtcGetProgramLogSize(prog, &logSize)); + if (logSize) { + string log(logSize, '\0'); + HIPRTC_CHECK(hiprtcGetProgramLog(prog, &log[0])); + std::cout << log << '\n'; + } + REQUIRE(compileResult == HIPRTC_SUCCESS); + size_t codeSize; + HIPRTC_CHECK(hiprtcGetCodeSize(prog, &codeSize)); + + vector code(codeSize); + HIPRTC_CHECK(hiprtcGetCode(prog, code.data())); + + // Do hip malloc first so that we donot need to do a cuInit manually before calling hipModule APIs + size_t bufferSize = n * sizeof(T); + + T a[n]; + T cpuSum = sum(a); + T* d_a; + HIP_CHECK(hipMalloc(&d_a, bufferSize)); + + hipModule_t module; + hipFunction_t kernel; + HIP_CHECK(hipModuleLoadData(&module, code.data())); + const char* name; + hiprtcGetLoweredName(prog, str.c_str(), &name); + HIP_CHECK(hipModuleGetFunction(&kernel, module, name)); + + HIP_CHECK(hipMemcpy(d_a, &a, bufferSize, hipMemcpyDefault)); + + struct { + T* a_; + int b_; + } args{d_a, n}; + + auto size = sizeof(args); + void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args, HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, + HIP_LAUNCH_PARAM_END}; + + HIP_CHECK(hipModuleLaunchKernel(kernel, 1, 1, 1, n, 1, 1, 0, nullptr, nullptr, config)); + + HIP_CHECK(hipMemcpy(&a, d_a, bufferSize, hipMemcpyDefault)); + bool result; + switch (option) { + case 1: //shflUpSum + result = compare(a[n - 1], cpuSum); break; + case 2: //shflDownSum + case 3: //shflXorSum + result = compare(a[0], cpuSum); break; + } + + if (result) { + HIP_CHECK(hipFree(d_a)); + REQUIRE(false); + } + + HIP_CHECK(hipFree(d_a)); + HIP_CHECK(hipModuleUnload(module)); + HIPRTC_CHECK(hiprtcDestroyProgram(&prog)); + +} + +TEST_CASE("Unit_hiprtc_half_shuffle_sync") { + runTestShflSync<__half>(1); + runTestShflSync<__half>(2); + runTestShflSync<__half>(3); +} diff --git a/projects/hip-tests/catch/unit/warp/CMakeLists.txt b/projects/hip-tests/catch/unit/warp/CMakeLists.txt index 5fded69b55..1cab1f2080 100644 --- a/projects/hip-tests/catch/unit/warp/CMakeLists.txt +++ b/projects/hip-tests/catch/unit/warp/CMakeLists.txt @@ -11,6 +11,18 @@ if(HIP_PLATFORM MATCHES "amd") warp_shfl_xor.cc warp_shfl_up.cc warp_shfl_down.cc + hipShflUpDownTest.cc + hipShflTests.cc + # FIXME: The tests for sync intrinsics are temporarily disabled on CUDA + # because they depend on __match_any_sync, which is not available on older + # NVIDIA devices. (SWDEV-453145) + hipMatchSyncAllTests.cc + hipMatchSyncAnyTests.cc + hipShflSyncDownTests.cc + hipShflSyncUpTests.cc + hipShflSyncXorTests.cc + hipShflSyncTests.cc + hipVoteSyncTests.cc ) endif() diff --git a/projects/hip-tests/catch/unit/warp/hipMatchSyncAllTests.cc b/projects/hip-tests/catch/unit/warp/hipMatchSyncAllTests.cc new file mode 100644 index 0000000000..dddf7959d4 --- /dev/null +++ b/projects/hip-tests/catch/unit/warp/hipMatchSyncAllTests.cc @@ -0,0 +1,281 @@ +/* +Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "warp_common.hh" +#include + +template +__global__ void matchAll_1(T *Input, unsigned long long *Output, int *Predicate) { + auto tid = threadIdx.x; + + Output[tid] = __match_all_sync(AllThreads, Input[tid], &Predicate[tid]); +} + +template +static void runTestMatchAll_1() { + const int size = 64; + T Input[size] = {(T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, + (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, + (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, + (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, + (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, + (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, + (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, + (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,}; + unsigned long long Output[size]; + unsigned long long Expected[size]; + std::fill_n(Expected, size, -1); + + int Predicate[size]; + int ExpPredicate[size]; + std::fill_n(ExpPredicate, size, true); + + expandPrecision(Input, size); + + int warpSize = getWarpSize(); + + T* d_Input; + unsigned long long* d_Output; + int *d_Predicate; + HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size)); + HIP_CHECK(hipMalloc(&d_Output, 8 * size)); + HIP_CHECK(hipMalloc(&d_Predicate, 4 * size)); + + HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault)); + hipLaunchKernelGGL(matchAll_1, 1, warpSize, 0, 0, d_Input, d_Output, d_Predicate); + + HIP_CHECK(hipMemcpy(&Output, d_Output, 8 * size, hipMemcpyDefault)); + for (int i = 0; i != warpSize; ++i) { + REQUIRE(compareMaskEqual(Output, Expected, i, warpSize)); + } + + HIP_CHECK(hipMemcpy(&Predicate, d_Predicate, 4 * size, hipMemcpyDefault)); + for (int i = 0; i != warpSize; ++i) { + REQUIRE(Predicate[i] == ExpPredicate[i]); + } +} + +template +__global__ void matchAll_2(T *Input, unsigned long long *Output, int *Predicate) { + auto tid = threadIdx.x; + + Output[tid] = __match_all_sync(AllThreads, Input[tid], &Predicate[tid]); +} + +template +static void runTestMatchAll_2() { + const int size = 64; + T Input[size] = {(T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, + (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, + (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, + (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, + (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, + (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, + (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, + (T)-5, (T)-5, (T)-5, (T)-500, (T)-5, (T)-5, (T)-5, (T)-5,}; + unsigned long long Output[size]; + unsigned long long Expected[size]; + + int warpSize = getWarpSize(); + + if (warpSize == 32) + std::fill_n(Expected, size, -1); + else + std::fill_n(Expected, size, 0); + + int Predicate[size]; + int ExpPredicate[size]; + if (warpSize == 32) + std::fill_n(ExpPredicate, size, true); + else + std::fill_n(ExpPredicate, size, false); + + expandPrecision(Input, size); + + T* d_Input; + unsigned long long* d_Output; + int *d_Predicate; + HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size)); + HIP_CHECK(hipMalloc(&d_Output, 8 * size)); + HIP_CHECK(hipMalloc(&d_Predicate, 4 * size)); + + HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault)); + hipLaunchKernelGGL(matchAll_2, 1, warpSize, 0, 0, d_Input, d_Output, d_Predicate); + + HIP_CHECK(hipMemcpy(&Output, d_Output, 8 * size, hipMemcpyDefault)); + for (int i = 0; i != warpSize; ++i) { + REQUIRE(compareMaskEqual(Output, Expected, i, warpSize)); + } + + HIP_CHECK(hipMemcpy(&Predicate, d_Predicate, 4 * size, hipMemcpyDefault)); + for (int i = 0; i != warpSize; ++i) { + REQUIRE(Predicate[i] == ExpPredicate[i]); + } +} + +template +__global__ void matchAll_3(T *Input, unsigned long long *Output, int *Predicate) { + auto tid = threadIdx.x; + // It's okay to use the non-sync__ match, because the purpose of the test is + // to exercise the mask argument on the sync version. + auto mask = __match_any_sync(AllThreads, tid / 12); + + Output[tid] = __match_all_sync(mask, Input[tid], &Predicate[tid]); +} + +template +static void runTestMatchAll_3() { + const int size = 64; + T Input[size] = {(T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, + (T)-5, (T)-5, (T)-5, (T)-5, (T)-500, (T)-5, (T)-5, (T)-5, + (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, + (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, + (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-500, (T)-5, (T)-5, + (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, + (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, + (T)-5, (T)-5, (T)-5, (T)-500, (T)-5, (T)-5, (T)-5, (T)-5,}; + unsigned long long Output[size]; + unsigned long long Expected[size] = { + 0xfff, 0xfff, 0xfff, 0xfff, + 0xfff, 0xfff, 0xfff, 0xfff, + 0xfff, 0xfff, 0xfff, 0xfff, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0xfff000000, 0xfff000000, 0xfff000000, 0xfff000000, + 0xfff000000, 0xfff000000, 0xfff000000, 0xfff000000, + 0xfff000000, 0xfff000000, 0xfff000000, 0xfff000000, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0xf000000000000000, 0xf000000000000000, 0xf000000000000000, 0xf000000000000000 + }; + + int Predicate[size]; + int ExpPredicate[size] { + true, true, true, true, + true, true, true, true, + true, true, true, true, + false, false, false, false, + false, false, false, false, + false, false, false, false, + true, true, true, true, + true, true, true, true, + true, true, true, true, + false, false, false, false, + false, false, false, false, + false, false, false, false, + false, false, false, false, + false, false, false, false, + false, false, false, false, + true, true, true, true + }; + + expandPrecision(Input, size); + + int warpSize = getWarpSize(); + + T* d_Input; + unsigned long long* d_Output; + int *d_Predicate; + HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size)); + HIP_CHECK(hipMalloc(&d_Output, 8 * size)); + HIP_CHECK(hipMalloc(&d_Predicate, 4 * size)); + + HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault)); + hipLaunchKernelGGL(matchAll_3, 1, warpSize, 0, 0, d_Input, d_Output, d_Predicate); + + HIP_CHECK(hipMemcpy(&Output, d_Output, 8 * size, hipMemcpyDefault)); + for (int i = 0; i != warpSize; ++i) { + REQUIRE(compareMaskEqual(Output, Expected, i, warpSize)); + } + + HIP_CHECK(hipMemcpy(&Predicate, d_Predicate, 4 * size, hipMemcpyDefault)); + for (int i = 0; i != warpSize; ++i) { + REQUIRE(Predicate[i] == ExpPredicate[i]); + } +} + +/** + * @addtogroup __match_sync + * @{ + * @ingroup MatchSyncTest + * `unsigned long long __match_all_sync(unsigned long long mask, T value, int *pred)` - + * Contains warp __match sync functions. + * @} + */ + +/** + * Test Description + * ------------------------ + * - Test case to verify __match_sync warp functions for different datatypes. + + * Test source + * ------------------------ + * - catch/unit/kernel/hipMatchSyncAllTests.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.6 + */ + +TEST_CASE("Unit_hipMatchSync_All") { + SECTION("run test for int") { + runTestMatchAll_1(); + runTestMatchAll_2(); + runTestMatchAll_3(); + } + SECTION("run test for unsigned int") { + runTestMatchAll_1(); + runTestMatchAll_2(); + runTestMatchAll_3(); + } + SECTION("run test for long") { + runTestMatchAll_1(); + runTestMatchAll_2(); + runTestMatchAll_3(); + } + SECTION("run test for unsigned long") { + runTestMatchAll_1(); + runTestMatchAll_2(); + runTestMatchAll_3(); + } + SECTION("run test for long long") { + runTestMatchAll_1(); + runTestMatchAll_2(); + runTestMatchAll_3(); + } + SECTION("run test for unsigned long long") { + runTestMatchAll_1(); + runTestMatchAll_2(); + runTestMatchAll_3(); + } + SECTION("run test for float") { + runTestMatchAll_1(); + runTestMatchAll_2(); + runTestMatchAll_3(); + } + SECTION("run test for double") { + runTestMatchAll_1(); + runTestMatchAll_2(); + runTestMatchAll_3(); + } +} diff --git a/projects/hip-tests/catch/unit/warp/hipMatchSyncAnyTests.cc b/projects/hip-tests/catch/unit/warp/hipMatchSyncAnyTests.cc new file mode 100644 index 0000000000..341d55619e --- /dev/null +++ b/projects/hip-tests/catch/unit/warp/hipMatchSyncAnyTests.cc @@ -0,0 +1,196 @@ +/* +Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "warp_common.hh" +#include + +template +__global__ void matchAny_1(T *Input, unsigned long long *Output) { + auto tid = threadIdx.x; + Output[tid] = __match_any_sync(AllThreads, Input[tid]); +} + +template +static void runTestMatchAny_1() { + const int size = 64; + T Input[size] = {0, 1, (T)-2, (T)-3, 4, 5, 6, (T)-7, + (T)-3, 4, 5, 6, (T)-7, 0, 1, (T)-2, + 4, 5, 6, (T)-7, 0, 1, (T)-2, (T)-3, + 6, (T)-7, 0, 1, (T)-2, (T)-3, 4, 5, + 0, 1, (T)-2, (T)-3, 4, 5, 6, (T)-7, + (T)-3, 4, 5, 6, (T)-7, 0, 1, (T)-2, + 4, 5, 6, (T)-7, 0, 1, (T)-2, (T)-3, + 6, (T)-7, 0, 1, (T)-2, (T)-3, 4, 5}; + unsigned long long Output[size]; + unsigned long long Expected[size] = { + 0x0410200104102001, 0x0820400208204002, 0x1040800410408004, 0x2080010820800108, + 0x4001021040010210, 0x8002042080020420, 0x0104084001040840, 0x0208108002081080, + 0x2080010820800108, 0x4001021040010210, 0x8002042080020420, 0x0104084001040840, + 0x0208108002081080, 0x0410200104102001, 0x0820400208204002, 0x1040800410408004, + 0x4001021040010210, 0x8002042080020420, 0x0104084001040840, 0x0208108002081080, + 0x0410200104102001, 0x0820400208204002, 0x1040800410408004, 0x2080010820800108, + 0x0104084001040840, 0x0208108002081080, 0x0410200104102001, 0x0820400208204002, + 0x1040800410408004, 0x2080010820800108, 0x4001021040010210, 0x8002042080020420, + 0x0410200104102001, 0x0820400208204002, 0x1040800410408004, 0x2080010820800108, + 0x4001021040010210, 0x8002042080020420, 0x0104084001040840, 0x0208108002081080, + 0x2080010820800108, 0x4001021040010210, 0x8002042080020420, 0x0104084001040840, + 0x0208108002081080, 0x0410200104102001, 0x0820400208204002, 0x1040800410408004, + 0x4001021040010210, 0x8002042080020420, 0x0104084001040840, 0x0208108002081080, + 0x0410200104102001, 0x0820400208204002, 0x1040800410408004, 0x2080010820800108, + 0x0104084001040840, 0x0208108002081080, 0x0410200104102001, 0x0820400208204002, + 0x1040800410408004, 0x2080010820800108, 0x4001021040010210, 0x8002042080020420 + }; + + expandPrecision(Input, size); + + int warpSize = getWarpSize(); + + T* d_Input; + unsigned long long* d_Output; + HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size)); + HIP_CHECK(hipMalloc(&d_Output, 8 * size)); + + HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault)); + hipLaunchKernelGGL(matchAny_1, 1, warpSize, 0, 0, d_Input, d_Output); + + HIP_CHECK(hipMemcpy(&Output, d_Output, 8 * size, hipMemcpyDefault)); + for (int i = 0; i != warpSize; ++i) { + REQUIRE(compareMaskEqual(Output, Expected, i, warpSize)); + } +} + +template +__global__ void matchAny_2(T *Input, unsigned long long *Output) { + auto tid = threadIdx.x; + // It's okay to use the non-sync__ match, because the purpose of the test is + // to exercise the mask argument on the sync version. + auto mask = __match_any_sync(AllThreads, tid / 12); + Output[tid] = __match_any_sync(mask, Input[tid]); +} + +template +static void runTestMatchAny_2() { + const int size = 64; + T Input[size] = {0, 1, (T)-2, (T)-3, 4, 5, 6, (T)-7, + (T)-3, 4, 5, 6, (T)-7, 0, 1, (T)-2, + 4, 5, 6, (T)-7, 0, 1, (T)-2, (T)-3, + 6, (T)-7, 0, 1, (T)-2, (T)-3, 4, 5, + 0, 1, (T)-2, (T)-3, 4, 5, 6, (T)-7, + (T)-3, 4, 5, 6, (T)-7, 0, 1, (T)-2, + 4, 5, 6, (T)-7, 0, 1, (T)-2, (T)-3, + 6, (T)-7, 0, 1, (T)-2, (T)-3, 4, 5}; + unsigned long long Output[size]; + unsigned long long Expected[size] = { + 0x0000000000000001, 0x0000000000000002, 0x0000000000000004, 0x0000000000000108, + 0x0000000000000210, 0x0000000000000420, 0x0000000000000840, 0x0000000000000080, + 0x0000000000000108, 0x0000000000000210, 0x0000000000000420, 0x0000000000000840, + + 0x0000000000081000, 0x0000000000102000, 0x0000000000204000, 0x0000000000408000, + 0x0000000000010000, 0x0000000000020000, 0x0000000000040000, 0x0000000000081000, + 0x0000000000102000, 0x0000000000204000, 0x0000000000408000, 0x0000000000800000, + + 0x0000000001000000, 0x0000000002000000, 0x0000000104000000, 0x0000000208000000, + 0x0000000410000000, 0x0000000820000000, 0x0000000040000000, 0x0000000080000000, + 0x0000000104000000, 0x0000000208000000, 0x0000000410000000, 0x0000000820000000, + + 0x0000021000000000, 0x0000042000000000, 0x0000084000000000, 0x0000108000000000, + 0x0000010000000000, 0x0000021000000000, 0x0000042000000000, 0x0000084000000000, + 0x0000108000000000, 0x0000200000000000, 0x0000400000000000, 0x0000800000000000, + + 0x0001000000000000, 0x0002000000000000, 0x0104000000000000, 0x0208000000000000, + 0x0410000000000000, 0x0820000000000000, 0x0040000000000000, 0x0080000000000000, + 0x0104000000000000, 0x0208000000000000, 0x0410000000000000, 0x0820000000000000, + + 0x1000000000000000, 0x2000000000000000, 0x4000000000000000, 0x8000000000000000 + }; + + expandPrecision(Input, size); + + int warpSize = getWarpSize(); + + T* d_Input; + unsigned long long* d_Output; + HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size)); + HIP_CHECK(hipMalloc(&d_Output, 8 * size)); + + HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault)); + hipLaunchKernelGGL(matchAny_2, 1, warpSize, 0, 0, d_Input, d_Output); + + HIP_CHECK(hipMemcpy(&Output, d_Output, 8 * size, hipMemcpyDefault)); + for (int i = 0; i != warpSize; ++i) { + REQUIRE(compareMaskEqual(Output, Expected, i, warpSize)); + } +} + +/** + * @addtogroup __match_sync + * @{ + * @ingroup MatchSyncTest + * `unsigned long long __match_any_sync(unsigned long long mask, T value, int *pred)` - + * Contains warp __match sync functions. + * @} + */ + +/** + * Test Description + * ------------------------ + * - Test case to verify __match_sync warp functions for different datatypes. + + * Test source + * ------------------------ + * - catch/unit/kernel/hipMatchSyncAnyTests.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.6 + */ + +TEST_CASE("Unit_hipMatchSync_Any") { + SECTION("run test for int") { + runTestMatchAny_1(); + runTestMatchAny_2(); + } + SECTION("run test for unsigned int") { + runTestMatchAny_1(); + runTestMatchAny_2(); + } + SECTION("run test for long") { + runTestMatchAny_1(); + runTestMatchAny_2(); + } + SECTION("run test for unsigned long") { + runTestMatchAny_1(); + runTestMatchAny_2(); + } + SECTION("run test for long long") { + runTestMatchAny_1(); + runTestMatchAny_2(); + } + SECTION("run test for unsigned long long") { + runTestMatchAny_1(); + runTestMatchAny_2(); + } + SECTION("run test for float") { + runTestMatchAny_1(); + runTestMatchAny_2(); + } + SECTION("run test for double") { + runTestMatchAny_1(); + runTestMatchAny_2(); + } +} diff --git a/projects/hip-tests/catch/unit/warp/hipShflSyncDownTests.cc b/projects/hip-tests/catch/unit/warp/hipShflSyncDownTests.cc new file mode 100644 index 0000000000..75b8acddac --- /dev/null +++ b/projects/hip-tests/catch/unit/warp/hipShflSyncDownTests.cc @@ -0,0 +1,251 @@ +/* +Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "warp_common.hh" +#include + +// For all threads in the warp, shfl the value "down" by three threads. To +// account for the end of the warp, we set the delta to zero near the warp-32 +// boundary. This also works for warp-64 since it is a multiple. +template +__global__ void shflDown_1(T* Input, T *Output) { + auto tid = threadIdx.x; + int srcLane = (tid % 32 + 3 < 32) ? 3 : 0; + Output[tid] = __shfl_down_sync(AllThreads, Input[tid], srcLane); +} + +template +static void runTestShflDown_1() { + const int size = 64; + T Input[size]; + T Output[size]; + int Values[size] = {3, 4, 5, -6, 7, 8, -9, 10, + 11, 12, 13, -14, 15, 16, 17, -18, + 19, 20, -21, 22, 23, 24, 25, 26, + -27, 28, 29, 30, 31, 29, 30, 31, + 35, -36, 37, 38, -39, 40, 41, 42, + 43, -44, -45, 46, 47, 48, 49, 50, + -51, 52, 53, -54, 55, 56, 57, -58, + 59, 60, 61, 62, -63, 61, 62, -63}; + T Expected[size]; + + initializeInput(Input, size); + initializeExpected(Expected, Values, size); + + int warpSize = getWarpSize(); + + T* d_Input; + T* d_Output; + HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size)); + HIP_CHECK(hipMalloc(&d_Output, sizeof(T) * size)); + + HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault)); + hipLaunchKernelGGL(shflDown_1, 1, warpSize, 0, 0, d_Input, d_Output); + + HIP_CHECK(hipMemcpy(&Output, d_Output, sizeof(T) * size, hipMemcpyDefault)); + for (int i = 0; i != warpSize; ++i) { + REQUIRE(compareEqual(Output[i], Expected[i])); + } +} + +// Use the mask argument to divide the warp into groups of 12 threads, and then +// shfl "down" by three threads. Account for the boundary within a group as well +// as withing a warp-32. + +template +__global__ void shflDown_2(T* Input, T *Output) { + auto tid = threadIdx.x; + auto mask = __match_any_sync(AllThreads, tid / 12); + int srcLane = ((tid % 32 + 3 >= 32) || (tid % 12 + 3 >= 12)) ? 0 : 3; + Output[tid] = __shfl_down_sync(mask, Input[tid], srcLane); +} + +template +static void runTestShflDown_2() { + const int size = 64; + T Input[size]; + T Output[size]; + int Values[size] = {3, 4, 5, -6, 7, 8, -9, 10, + 11, -9, 10, 11, 15, 16, 17, -18, + 19, 20, -21, 22, 23, -21, 22, 23, + -27, 28, 29, 30, 31, 29, 30, 31, + 35, 33, 34, 35, -39, 40, 41, 42, + 43, -44, -45, 46, 47, -45, 46, 47, + -51, 52, 53, -54, 55, 56, 57, -58, + 59, 57, -58, 59, -63, 61, 62, -63}; + T Expected[size]; + + initializeInput(Input, size); + initializeExpected(Expected, Values, size); + + int warpSize = getWarpSize(); + + T* d_Input; + T* d_Output; + HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size)); + HIP_CHECK(hipMalloc(&d_Output, sizeof(T) * size)); + + HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault)); + hipLaunchKernelGGL(shflDown_2, 1, warpSize, 0, 0, d_Input, d_Output); + + HIP_CHECK(hipMemcpy(&Output, d_Output, sizeof(T) * size, hipMemcpyDefault)); + for (int i = 0; i != warpSize; ++i) { + REQUIRE(compareEqual(Output[i], Expected[i])); + } +} + +template +__global__ void shflDown_3(T* Input, T *Output) { + auto tid = threadIdx.x; + auto mask = __match_any_sync(AllThreads, tid / 12); + int srcLane = ((tid % 12 + 3 >= 12) || (tid % 8 + 3 >= 8)) ? 0 : 3; + Output[tid] = __shfl_down_sync(mask, Input[tid], srcLane, 8); +} + +template +static void runTestShflDown_3() { + const int size = 64; + T Input[size]; + T Output[size]; + int Values[size] = {3, 4, 5, -6, 7, + 5, -6, 7, // cannot cross 8 + 11, -9, 10, 11, // cannot cross 12 + 15, + 13, -14, 15, // cannot cross 8 + 19, 20, -21, 22, + 23, -21, 22, 23, // canot cross 12 + // pattern repeats + -27, 28, 29, 30, 31, + 29, 30, 31, + 35, 33, 34, 35, + -39, + 37, 38, -39, + 43, -44, -45, 46, + 47, -45, 46, 47, + // pattern repeats + -51, 52, 53, -54, 55, + 53, -54, 55, + 59, 57, -58, 59, + -63, + 61, 62, -63}; + T Expected[size]; + + initializeInput(Input, size); + initializeExpected(Expected, Values, size); + + int warpSize = getWarpSize(); + + T* d_Input; + T* d_Output; + HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size)); + HIP_CHECK(hipMalloc(&d_Output, sizeof(T) * size)); + + HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault)); + hipLaunchKernelGGL(shflDown_3, 1, warpSize, 0, 0, d_Input, d_Output); + + HIP_CHECK(hipMemcpy(&Output, d_Output, sizeof(T) * size, hipMemcpyDefault)); + for (int i = 0; i != warpSize; ++i) { + REQUIRE(compareEqual(Output[i], Expected[i])); + } +} + +/** + * @addtogroup __shfl_sync + * @{ + * @ingroup ShflSyncTest + * `T __shfl_down_sync(unsigned long long mask, T var, int delta, int width=warpSize)` - + * Contains warp __shfl sync functions. + * @} + */ + +/** + * Test Description + * ------------------------ + * - Test case to verify __shfl_down_sync warp functions for different datatypes. + + * Test source + * ------------------------ + * - catch/unit/kernel/hipShflSyncDownTests.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.6 + */ + +TEST_CASE("Unit_hipShflSync_Down") { + SECTION("run test for short") { + runTestShflDown_1(); + runTestShflDown_2(); + runTestShflDown_3(); + } + SECTION("run test for unsigned short") { + runTestShflDown_1(); + runTestShflDown_2(); + runTestShflDown_3(); + } + SECTION("run test for int") { + runTestShflDown_1(); + runTestShflDown_2(); + runTestShflDown_3(); + } + SECTION("run test for unsigned int") { + runTestShflDown_1(); + runTestShflDown_2(); + runTestShflDown_3(); + } + SECTION("run test for long") { + runTestShflDown_1(); + runTestShflDown_2(); + runTestShflDown_3(); + } + SECTION("run test for unsigned long") { + runTestShflDown_1(); + runTestShflDown_2(); + runTestShflDown_3(); + } + SECTION("run test for long long") { + runTestShflDown_1(); + runTestShflDown_2(); + runTestShflDown_3(); + } + SECTION("run test for unsigned long long") { + runTestShflDown_1(); + runTestShflDown_2(); + runTestShflDown_3(); + } + SECTION("run test for float") { + runTestShflDown_1(); + runTestShflDown_2(); + runTestShflDown_3(); + } + SECTION("run test for double") { + runTestShflDown_1(); + runTestShflDown_2(); + runTestShflDown_3(); + } + SECTION("run test for __half") { + runTestShflDown_1<__half>(); + runTestShflDown_2<__half>(); + runTestShflDown_3<__half>(); + } + SECTION("run test for __half2") { + runTestShflDown_1<__half2>(); + runTestShflDown_2<__half2>(); + runTestShflDown_3<__half2>(); + } +} diff --git a/projects/hip-tests/catch/unit/warp/hipShflSyncTests.cc b/projects/hip-tests/catch/unit/warp/hipShflSyncTests.cc new file mode 100644 index 0000000000..1fb240edbc --- /dev/null +++ b/projects/hip-tests/catch/unit/warp/hipShflSyncTests.cc @@ -0,0 +1,176 @@ +/* +Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "warp_common.hh" +#include + +template +__global__ void shfl_1(T *Input, T *Output) { + int tid = threadIdx.x; + // Creates groups consisting of every fourth thread. + auto mask = __match_any_sync(AllThreads, tid % 4); + int srcLane = tid % 4; + + // Each group reads from the first active thread within that group. + Output[tid] = __shfl_sync(mask, Input[tid], srcLane); +} + +template +static void runTestShfl_1() { + const int size = 64; + T Input[size]; + T Output[size]; + T Expected[size]; + int Values[size] = {0, -1, 2, 3, 0, -1, 2, 3, + 0, -1, 2, 3, 0, -1, 2, 3, + 0, -1, 2, 3, 0, -1, 2, 3, + 0, -1, 2, 3, 0, -1, 2, 3, + 0, -1, 2, 3, 0, -1, 2, 3, + 0, -1, 2, 3, 0, -1, 2, 3, + 0, -1, 2, 3, 0, -1, 2, 3, + 0, -1, 2, 3, 0, -1, 2, 3}; + + initializeInput(Input, size); + initializeExpected(Expected, Values, size); + + int warpSize = getWarpSize(); + + T* d_Input; + T* d_Output; + HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size)); + HIP_CHECK(hipMalloc(&d_Output, sizeof(T) * size)); + + HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault)); + hipLaunchKernelGGL(shfl_1, 1, warpSize, 0, 0, d_Input, d_Output); + + HIP_CHECK(hipMemcpy(&Output, d_Output, sizeof(T) * size, hipMemcpyDefault)); + for (int i = 0; i != warpSize; ++i) { + REQUIRE(compareEqual(Output[i], Expected[i])); + } +} + +template +__global__ void shfl_2(T *Input, T *Output) { + int tid = threadIdx.x; + auto mask = __match_any_sync(AllThreads, tid % 4); + int srcLane = tid % 4; + + // Each subgroup of eight reads from the first active thread within that + // subgroup. + Output[tid] = __shfl_sync(mask, Input[tid], srcLane, 8); +} + +template +static void runTestShfl_2() { + const int size = 64; + T Input[size]; + T Output[size]; + T Expected[size]; + int Values[size] = {0, -1, 2, 3, 0, -1, 2, 3, + 8, -9, 10, 11, 8, -9, 10, 11, + 16, 17, -18, 19, 16, 17, -18, 19, + 24, 25, 26, -27, 24, 25, 26, -27, + -32, 33, 34, 35, -32, 33, 34, 35, + 40, 41, 42, 43, 40, 41, 42, 43, + 48, 49, 50, -51, 48, 49, 50, -51, + 56, 57, -58, 59, 56, 57, -58, 59}; + + initializeInput(Input, size); + initializeExpected(Expected, Values, size); + + int warpSize = getWarpSize(); + + T* d_Input; + T* d_Output; + HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size)); + HIP_CHECK(hipMalloc(&d_Output, sizeof(T) * size)); + + HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault)); + hipLaunchKernelGGL(shfl_2, 1, warpSize, 0, 0, d_Input, d_Output); + + HIP_CHECK(hipMemcpy(&Output, d_Output, sizeof(T) * size, hipMemcpyDefault)); + for (int i = 0; i != warpSize; ++i) { + REQUIRE(compareEqual(Output[i], Expected[i])); + } +} + +/** + * @addtogroup __shfl_sync + * @{ + * @ingroup ShflSyncTest + * `T __shfl_sync(unsigned long long mask, T var, int srcLane, int width=warpSize)` - + * Contains warp __shfl sync functions. + * @} + */ + +/** + * Test Description + * ------------------------ + * - Test case to verify __shfl_sync warp functions for different datatypes. + + * Test source + * ------------------------ + * - catch/unit/kernel/hipShflSyncTests.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.6 + */ + +TEST_CASE("Unit_hipShflSync") { + SECTION("run test for short") { + runTestShfl_1(); + runTestShfl_2(); + } + SECTION("run test for unsigned short") { + runTestShfl_1(); + runTestShfl_2(); + } + SECTION("run test for int") { + runTestShfl_1(); + runTestShfl_2(); + } + SECTION("run test for unsigned int") { + runTestShfl_1(); + runTestShfl_2(); + } + SECTION("run test for long") { + runTestShfl_1(); + runTestShfl_2(); + } + SECTION("run test for unsigned long") { + runTestShfl_1(); + runTestShfl_2(); + } + SECTION("run test for long long") { + runTestShfl_1(); + runTestShfl_2(); + } + SECTION("run test for unsigned long long") { + runTestShfl_1(); + runTestShfl_2(); + } + SECTION("run test for float") { + runTestShfl_1(); + runTestShfl_2(); + } + SECTION("run test for double") { + runTestShfl_1(); + runTestShfl_2(); + } +} diff --git a/projects/hip-tests/catch/unit/warp/hipShflSyncUpTests.cc b/projects/hip-tests/catch/unit/warp/hipShflSyncUpTests.cc new file mode 100644 index 0000000000..586a44ab76 --- /dev/null +++ b/projects/hip-tests/catch/unit/warp/hipShflSyncUpTests.cc @@ -0,0 +1,243 @@ +/* +Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "warp_common.hh" +#include + +template +__global__ void shflUp_1(T* Input, T *Output) { + auto tid = threadIdx.x; + int srcLane = (tid > 3) ? 3 : 0; + Output[tid] = __shfl_up_sync(AllThreads, Input[tid], srcLane); +} + +template +static void runTestShflUp_1() { + const int size = 64; + T Input[size]; + T Output[size]; + T Expected[size]; + int Values[] = {0, -1, 2, 3, -1, 2, 3, 4, 5, -6, 7, 8, + -9, 10, 11, 12, 13, -14, 15, 16, 17, -18, 19, 20, + -21, 22, 23, 24, 25, 26, -27, 28, 29, 30, 31, -32, + 33, 34, 35, -36, 37, 38, -39, 40, 41, 42, 43, -44, + -45, 46, 47, 48, 49, 50, -51, 52, 53, -54, 55, 56, + 57, -58, 59, 60}; + + initializeInput(Input, size); + initializeExpected(Expected, Values, size); + + int warpSize = getWarpSize(); + + T* d_Input; + T* d_Output; + HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size)); + HIP_CHECK(hipMalloc(&d_Output, sizeof(T) * size)); + + HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault)); + hipLaunchKernelGGL(shflUp_1, 1, warpSize, 0, 0, d_Input, d_Output); + + HIP_CHECK(hipMemcpy(&Output, d_Output, sizeof(T) * size, hipMemcpyDefault)); + for (int i = 0; i != warpSize; ++i) { + REQUIRE(compareEqual(Output[i], Expected[i])); + } +} + +template +__global__ void shflUp_2(T* Input, T *Output) { + auto tid = threadIdx.x; + auto mask = __match_any_sync(AllThreads, tid / 12); + int srcLane = (tid % 12) < 3 ? 0 : 3; + Output[tid] = __shfl_up_sync(mask, Input[tid], srcLane); +} + +template +static void runTestShflUp_2() { + const int size = 64; + T Input[size]; + T Output[size]; + T Expected[size]; + int Values[size] = {0, -1, 2, 0, -1, 2, 3, 4, 5, -6, 7, 8, + 12, 13, -14, 12, 13, -14, 15, 16, 17, -18, 19, 20, + 24, 25, 26, 24, 25, 26, -27, 28, 29, 30, 31, -32, + -36, 37, 38, -36, 37, 38, -39, 40, 41, 42, 43, -44, + 48, 49, 50, 48, 49, 50, -51, 52, 53, -54, 55, 56, + 60, 61, 62, 60}; + + initializeInput(Input, size); + initializeExpected(Expected, Values, size); + + int warpSize = getWarpSize(); + + T* d_Input; + T* d_Output; + HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size)); + HIP_CHECK(hipMalloc(&d_Output, sizeof(T) * size)); + + HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault)); + hipLaunchKernelGGL(shflUp_2, 1, warpSize, 0, 0, d_Input, d_Output); + + HIP_CHECK(hipMemcpy(&Output, d_Output, sizeof(T) * size, hipMemcpyDefault)); + for (int i = 0; i != warpSize; ++i) { + REQUIRE(compareEqual(Output[i], Expected[i])); + } +} + +template +__global__ void shflUp_3(T* Input, T *Output) { + auto tid = threadIdx.x; + auto mask = __match_any_sync(AllThreads, tid / 12); + int srcLane = (tid % 12) < 3 ? 0 : 3; + Output[tid] = __shfl_up_sync(mask, Input[tid], srcLane, 8); +} + +template +static void runTestShflUp_3() { + const int size = 64; + T Input[size]; + T Output[size]; + T Expected[size]; + int Values[size] = {0, -1, 2, // cannot cross mod-12 + 0, -1, 2, 3, 4, + 8, -9, 10, // cannot cross mod-8 + 8, + 12, 13, -14, // cannot cross mod-12 + 12, + 16, 17, -18, // cannot cross mod-8 + 16, 17, -18, 19, 20, + // pattern repeats + 24, 25, 26, + 24, 25, 26, -27, 28, + -32, 33, 34, + -32, + -36, 37, 38, + -36, + 40, 41, 42, + 40, 41, 42, 43, -44, + // pattern repeats + 48, 49, 50, + 48, 49, 50, -51, 52, + 56, 57, -58, + 56, + 60, 61, 62, + 60}; + + initializeInput(Input, size); + initializeExpected(Expected, Values, size); + + int warpSize = getWarpSize(); + + T* d_Input; + T* d_Output; + HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size)); + HIP_CHECK(hipMalloc(&d_Output, sizeof(T) * size)); + + HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault)); + hipLaunchKernelGGL(shflUp_3, 1, warpSize, 0, 0, d_Input, d_Output); + + HIP_CHECK(hipMemcpy(&Output, d_Output, sizeof(T) * size, hipMemcpyDefault)); + for (int i = 0; i != warpSize; ++i) { + REQUIRE(compareEqual(Output[i], Expected[i])); + } +} + +/** + * @addtogroup __shfl_sync + * @{ + * @ingroup ShflSyncTest + * `T __shfl_up_sync(unsigned long long mask, T var, int delta, int width=warpSize)` - + * Contains warp __shfl sync functions. + * @} + */ + +/** + * Test Description + * ------------------------ + * - Test case to verify __shfl_up_sync warp functions for different datatypes. + + * Test source + * ------------------------ + * - catch/unit/kernel/hipShflSyncUpTests.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.6 + */ + +TEST_CASE("Unit_hipShflSync_Up") { + SECTION("run test for short") { + runTestShflUp_1(); + runTestShflUp_2(); + runTestShflUp_3(); + } + SECTION("run test for unsigned short") { + runTestShflUp_1(); + runTestShflUp_2(); + runTestShflUp_3(); + } + SECTION("run test for int") { + runTestShflUp_1(); + runTestShflUp_2(); + runTestShflUp_3(); + } + SECTION("run test for unsigned int") { + runTestShflUp_1(); + runTestShflUp_2(); + runTestShflUp_3(); + } + SECTION("run test for long") { + runTestShflUp_1(); + runTestShflUp_2(); + runTestShflUp_3(); + } + SECTION("run test for unsigned long") { + runTestShflUp_1(); + runTestShflUp_2(); + runTestShflUp_3(); + } + SECTION("run test for long long") { + runTestShflUp_1(); + runTestShflUp_2(); + runTestShflUp_3(); + } + SECTION("run test for unsigned long long") { + runTestShflUp_1(); + runTestShflUp_2(); + runTestShflUp_3(); + } + SECTION("run test for float") { + runTestShflUp_1(); + runTestShflUp_2(); + runTestShflUp_3(); + } + SECTION("run test for double") { + runTestShflUp_1(); + runTestShflUp_2(); + runTestShflUp_3(); + } + SECTION("run test for __half") { + runTestShflUp_1<__half>(); + runTestShflUp_2<__half>(); + runTestShflUp_3<__half>(); + } + SECTION("run test for __half2") { + runTestShflUp_1<__half2>(); + runTestShflUp_2<__half2>(); + runTestShflUp_3<__half2>(); + } +} diff --git a/projects/hip-tests/catch/unit/warp/hipShflSyncXorTests.cc b/projects/hip-tests/catch/unit/warp/hipShflSyncXorTests.cc new file mode 100644 index 0000000000..9f07824168 --- /dev/null +++ b/projects/hip-tests/catch/unit/warp/hipShflSyncXorTests.cc @@ -0,0 +1,234 @@ +/* +Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "warp_common.hh" +#include + +template +__global__ void shflXor_1(T* Input, T *Output) { + auto tid = threadIdx.x; + Output[tid] = __shfl_xor_sync(AllThreads, Input[tid], 16); +} + +template +static void runTestShflXor_1() { + const int size = 64; + T Input[size]; + T Output[size]; + int Values[size] = {16, 17, -18, 19, 20, -21, 22, 23, + 24, 25, 26, -27, 28, 29, 30, 31, + 0, -1, 2, 3, 4, 5, -6, 7, + 8, -9, 10, 11, 12, 13, -14, 15, + 48, 49, 50, -51, 52, 53, -54, 55, + 56, 57, -58, 59, 60, 61, 62, -63, + -32, 33, 34, 35, -36, 37, 38, -39, + 40, 41, 42, 43, -44, -45, 46, 47}; + T Expected[size]; + + initializeInput(Input, size); + initializeExpected(Expected, Values, size); + + int warpSize = getWarpSize(); + + T* d_Input; + T* d_Output; + HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size)); + HIP_CHECK(hipMalloc(&d_Output, sizeof(T) * size)); + + HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault)); + hipLaunchKernelGGL(shflXor_1, 1, warpSize, 0, 0, d_Input, d_Output); + + HIP_CHECK(hipMemcpy(&Output, d_Output, sizeof(T) * size, hipMemcpyDefault)); + for (int i = 0; i != warpSize; ++i) { + REQUIRE(compareEqual(Output[i], Expected[i])); + } +} + +template +__global__ void shflXor_2(T* Input, T *Output) { + unsigned tid = threadIdx.x; + auto mask = __match_any_sync(AllThreads, tid / 12); + int laneMask = 4; + int section = tid % 24; + if (section > 7 && section < 16) + laneMask = 0; + Output[tid] = __shfl_xor_sync(mask, Input[tid], laneMask); +} + +template +static void runTestShflXor_2() { + const int size = 64; + T Input[size]; + T Output[size]; + int Values[size] = {4, 5, -6, 7, 0, -1, 2, 3, + 8, -9, 10, 11, 12, 13, -14, 15, // disabled around mid mod-24 + 20, -21, 22, 23, 16, 17, -18, 19, + 28, 29, 30, 31, 24, 25, 26, -27, + -32, 33, 34, 35, -36, 37, 38, -39, // disabled around mid mod-24 + -44, -45, 46, 47, 40, 41, 42, 43, + 52, 53, -54, 55, 48, 49, 50, -51, + 56, 57, -58, 59, 60, 61, 62, -63}; // disabled around mid mod-24 + T Expected[size]; + + initializeInput(Input, size); + initializeExpected(Expected, Values, size); + + + int warpSize = getWarpSize(); + + T* d_Input; + T* d_Output; + HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size)); + HIP_CHECK(hipMalloc(&d_Output, sizeof(T) * size)); + + HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault)); + hipLaunchKernelGGL(shflXor_2, 1, warpSize, 0, 0, d_Input, d_Output); + + HIP_CHECK(hipMemcpy(&Output, d_Output, sizeof(T) * size, hipMemcpyDefault)); + for (int i = 0; i != warpSize; ++i) { + REQUIRE(compareEqual(Output[i], Expected[i])); + } +} + +template +__global__ void shflXor_3(T* Input, T *Output) { + auto tid = threadIdx.x; + auto mask = __match_any_sync(AllThreads, tid / 16); + Output[tid] = __shfl_xor_sync(mask, Input[tid], 4, 8); +} + +template +static void runTestShflXor_3() { + const int size = 64; + T Input[size]; + T Output[size]; + int Values[size] = {4, 5, -6, 7, 0, -1, 2, 3, + 12, 13, -14, 15, 8, -9, 10, 11, + 20, -21, 22, 23, 16, 17, -18, 19, + 28, 29, 30, 31, 24, 25, 26, -27, + -36, 37, 38, -39, -32, 33, 34, 35, + -44, -45, 46, 47, 40, 41, 42, 43, + 52, 53, -54, 55, 48, 49, 50, -51, + 60, 61, 62, -63, 56, 57, -58, 59}; + T Expected[size]; + + initializeInput(Input, size); + initializeExpected(Expected, Values, size); + + + int warpSize = getWarpSize(); + + T* d_Input; + T* d_Output; + HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size)); + HIP_CHECK(hipMalloc(&d_Output, sizeof(T) * size)); + + HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault)); + hipLaunchKernelGGL(shflXor_3, 1, warpSize, 0, 0, d_Input, d_Output); + + HIP_CHECK(hipMemcpy(&Output, d_Output, sizeof(T) * size, hipMemcpyDefault)); + for (int i = 0; i != warpSize; ++i) { + REQUIRE(compareEqual(Output[i], Expected[i])); + } +} + +/** + * @addtogroup __shfl_sync + * @{ + * @ingroup ShflSyncTest + * `T __shfl_xor_sync(unsigned long long mask, T var, unsigned laneMask, int width=warpSize)` - + * Contains warp __shfl sync functions. + * @} + */ + +/** + * Test Description + * ------------------------ + * - Test case to verify __shfl_xor_sync warp functions for different datatypes. + + * Test source + * ------------------------ + * - catch/unit/kernel/hipShflSyncXorTests.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.6 + */ + +TEST_CASE("Unit_hipShflSync_Xor") { + SECTION("run test for short") { + runTestShflXor_1(); + runTestShflXor_2(); + runTestShflXor_3(); + } + SECTION("run test for unsigned short") { + runTestShflXor_1(); + runTestShflXor_2(); + runTestShflXor_3(); + } + SECTION("run test for int") { + runTestShflXor_1(); + runTestShflXor_2(); + runTestShflXor_3(); + } + SECTION("run test for unsigned int") { + runTestShflXor_1(); + runTestShflXor_2(); + runTestShflXor_3(); + } + SECTION("run test for long") { + runTestShflXor_1(); + runTestShflXor_2(); + runTestShflXor_3(); + } + SECTION("run test for unsigned long") { + runTestShflXor_1(); + runTestShflXor_2(); + runTestShflXor_3(); + } + SECTION("run test for long long") { + runTestShflXor_1(); + runTestShflXor_2(); + runTestShflXor_3(); + } + SECTION("run test for unsigned long long") { + runTestShflXor_1(); + runTestShflXor_2(); + runTestShflXor_3(); + } + SECTION("run test for float") { + runTestShflXor_1(); + runTestShflXor_2(); + runTestShflXor_3(); + } + SECTION("run test for double") { + runTestShflXor_1(); + runTestShflXor_2(); + runTestShflXor_3(); + } + SECTION("run test for __half") { + runTestShflXor_1<__half>(); + runTestShflXor_2<__half>(); + runTestShflXor_3<__half>(); + } + SECTION("run test for __half2") { + runTestShflXor_1<__half2>(); + runTestShflXor_2<__half2>(); + runTestShflXor_3<__half2>(); + } +} diff --git a/projects/hip-tests/catch/unit/kernel/hipShflTests.cc b/projects/hip-tests/catch/unit/warp/hipShflTests.cc similarity index 100% rename from projects/hip-tests/catch/unit/kernel/hipShflTests.cc rename to projects/hip-tests/catch/unit/warp/hipShflTests.cc diff --git a/projects/hip-tests/catch/unit/kernel/hipShflUpDownTest.cc b/projects/hip-tests/catch/unit/warp/hipShflUpDownTest.cc similarity index 100% rename from projects/hip-tests/catch/unit/kernel/hipShflUpDownTest.cc rename to projects/hip-tests/catch/unit/warp/hipShflUpDownTest.cc diff --git a/projects/hip-tests/catch/unit/warp/hipVoteSyncTests.cc b/projects/hip-tests/catch/unit/warp/hipVoteSyncTests.cc new file mode 100644 index 0000000000..53cf56b594 --- /dev/null +++ b/projects/hip-tests/catch/unit/warp/hipVoteSyncTests.cc @@ -0,0 +1,595 @@ +/* +Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "warp_common.hh" +#include + +__global__ void any_1(int *Input, int *Output) { + auto tid = threadIdx.x; + Output[tid] = __any_sync(AllThreads, Input[tid]); +} + +static void runTestAny_1() { + const int size = 64; + int Input[size] = {0, }; + int Output[size]; + int Expected[size] = {0, }; + + int* d_Input; + int* d_Output; + HIP_CHECK(hipMalloc(&d_Input, 4 * size)); + HIP_CHECK(hipMalloc(&d_Output, 4 * size)); + + int warpSize = getWarpSize(); + + HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault)); + hipLaunchKernelGGL(any_1, 1, warpSize, 0, 0, d_Input, d_Output); + + HIP_CHECK(hipMemcpy(&Output, d_Output, 4 * size, hipMemcpyDefault)); + for (int i = 0; i != warpSize; ++i) { + REQUIRE(Output[i] == Expected[i]); + } +} + +__global__ void any_2(int *Input, int *Output) { + auto tid = threadIdx.x; + Output[tid] = __any_sync(AllThreads, Input[tid]); +} + +static void runTestAny_2_w64() { + const int size = 64; + int Input[size] = {0, }; + int Output[size]; + int Expected[size] = {0, }; + + Input[60] = 1; + + int warpSize = getWarpSize(); + if (warpSize == 64) + std::fill_n(Expected, size, 1); + + int* d_Input; + int* d_Output; + HIP_CHECK(hipMalloc(&d_Input, 4 * size)); + HIP_CHECK(hipMalloc(&d_Output, 4 * size)); + + HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault)); + hipLaunchKernelGGL(any_2, 1, warpSize, 0, 0, d_Input, d_Output); + + HIP_CHECK(hipMemcpy(&Output, d_Output, 4 * size, hipMemcpyDefault)); + for (int i = 0; i != warpSize; ++i) { + REQUIRE(Output[i] == Expected[i]); + } +} + +static void runTestAny_2_w32() { + const int size = 64; + int Input[size] = {0, }; + int Output[size]; + int Expected[size] = {0, }; + + Input[30] = 1; + std::fill_n(Expected, size, 1); + + int* d_Input; + int* d_Output; + HIP_CHECK(hipMalloc(&d_Input, 4 * size)); + HIP_CHECK(hipMalloc(&d_Output, 4 * size)); + + int warpSize = getWarpSize(); + + HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault)); + hipLaunchKernelGGL(any_2, 1, warpSize, 0, 0, d_Input, d_Output); + + HIP_CHECK(hipMemcpy(&Output, d_Output, 4 * size, hipMemcpyDefault)); + for (int i = 0; i != warpSize; ++i) { + REQUIRE(Output[i] == Expected[i]); + } +} + +__global__ void any_3(int *Input, int *Output) { + auto tid = threadIdx.x; + auto mask = __match_any_sync(AllThreads, tid/12); + Output[tid] = __any_sync(mask, Input[tid]); +} + +static void runTestAny_3() { + const int size = 64; + int Input[size] = {0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 1, 0, 0}; + + int Output[size]; + int Expected[size] = {0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, + 1, 1, 1, 1}; + int* d_Input; + int* d_Output; + HIP_CHECK(hipMalloc(&d_Input, 4 * size)); + HIP_CHECK(hipMalloc(&d_Output, 4 * size)); + + int warpSize = getWarpSize(); + + HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault)); + hipLaunchKernelGGL(any_3, 1, warpSize, 0, 0, d_Input, d_Output); + + HIP_CHECK(hipMemcpy(&Output, d_Output, 4 * size, hipMemcpyDefault)); + for (int i = 0; i != warpSize; ++i) { + REQUIRE(Output[i] == Expected[i]); + } +} + +__global__ void any_4(int *Input, int *Output) { + auto tid = threadIdx.x; + unsigned long long masks[2] = { Every5thBut9th, Every9thBit }; + + Output[tid] = -1; + if (tid % 5 == 0 || tid % 9 == 0) + Output[tid] = __any_sync(masks[tid % 9 == 0], Input[tid]); +} + +static void runTestAny_4() { + const int size = 64; + int Input[size] = {0, }; + Input[5] = 1; + + int Output[size]; + int Expected[size]; + + for (int i = 0; i != size; ++i) { + if (i % 9 == 0) { + Expected[i] = 0; + continue; + } + + if (i % 5 == 0) { + Expected[i] = 1; + continue; + } + + Expected[i] = -1; + } + + int* d_Input; + int* d_Output; + HIP_CHECK(hipMalloc(&d_Input, 4 * size)); + HIP_CHECK(hipMalloc(&d_Output, 4 * size)); + + int warpSize = getWarpSize(); + + HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault)); + hipLaunchKernelGGL(any_4, 1, warpSize, 0, 0, d_Input, d_Output); + + HIP_CHECK(hipMemcpy(&Output, d_Output, 4 * size, hipMemcpyDefault)); + for (int i = 0; i != warpSize; ++i) { + REQUIRE(Output[i] == Expected[i]); + } +} + +__global__ void all_1(int *Input, int *Output) { + auto tid = threadIdx.x; + Output[tid] = __all_sync(AllThreads, Input[tid]); +} + +static void runTestAll_1_w64() { + const int size = 64; + int Input[size] = {1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 0, 1, 1}; + + int Output[size]; + int Expected[size] = {0, }; + + int warpSize = getWarpSize(); + if (warpSize == 32) + std::fill_n(Expected, size, 1); + + int* d_Input; + int* d_Output; + HIP_CHECK(hipMalloc(&d_Input, 4 * size)); + HIP_CHECK(hipMalloc(&d_Output, 4 * size)); + + HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault)); + hipLaunchKernelGGL(all_1, 1, warpSize, 0, 0, d_Input, d_Output); + + HIP_CHECK(hipMemcpy(&Output, d_Output, 4 * size, hipMemcpyDefault)); + for (int i = 0; i != warpSize; ++i) { + REQUIRE(Output[i] == Expected[i]); + } +} + +static void runTestAll_1_w32() { + const int size = 64; + int Input[size] = {1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 0, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1}; + + int Output[size]; + int Expected[size] = {0, }; + + int warpSize = getWarpSize(); + + int* d_Input; + int* d_Output; + HIP_CHECK(hipMalloc(&d_Input, 4 * size)); + HIP_CHECK(hipMalloc(&d_Output, 4 * size)); + + HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault)); + hipLaunchKernelGGL(all_1, 1, warpSize, 0, 0, d_Input, d_Output); + + HIP_CHECK(hipMemcpy(&Output, d_Output, 4 * size, hipMemcpyDefault)); + for (int i = 0; i != warpSize; ++i) { + REQUIRE(Output[i] == Expected[i]); + } +} + +__global__ void all_2(int *Input, int *Output) { + auto tid = threadIdx.x; + Output[tid] = __all_sync(AllThreads, Input[tid]); +} + +static void runTestAll_2() { + const int size = 64; + int Input[size] = {1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1}; + + int Output[size]; + int Expected[size] = {1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1}; + + int* d_Input; + int* d_Output; + HIP_CHECK(hipMalloc(&d_Input, 4 * size)); + HIP_CHECK(hipMalloc(&d_Output, 4 * size)); + + int warpSize = getWarpSize(); + + HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault)); + hipLaunchKernelGGL(all_2, 1, warpSize, 0, 0, d_Input, d_Output); + + HIP_CHECK(hipMemcpy(&Output, d_Output, 4 * size, hipMemcpyDefault)); + for (int i = 0; i != warpSize; ++i) { + REQUIRE(Output[i] == Expected[i]); + } +} + +__global__ void all_3(int *Input, int *Output) { + auto tid = threadIdx.x; + auto mask = __match_any_sync(AllThreads, tid/12); + Output[tid] = __all_sync(mask, Input[tid]); +} + +static void runTestAll_3() { + const int size = 64; + int Input[size] = {1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 0, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 0, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 0, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 0, 1, 1}; + + int Output[size]; + int Expected[size] = {1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0}; + int* d_Input; + int* d_Output; + HIP_CHECK(hipMalloc(&d_Input, 4 * size)); + HIP_CHECK(hipMalloc(&d_Output, 4 * size)); + + int warpSize = getWarpSize(); + + HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault)); + hipLaunchKernelGGL(all_3, 1, warpSize, 0, 0, d_Input, d_Output); + + HIP_CHECK(hipMemcpy(&Output, d_Output, 4 * size, hipMemcpyDefault)); + for (int i = 0; i != warpSize; ++i) { + REQUIRE(Output[i] == Expected[i]); + } +} + +__global__ void all_4(int *Input, int *Output) { + auto tid = threadIdx.x; + unsigned long long masks[2] = { Every5thBut9th, Every9thBit }; + + Output[tid] = -1; + if (tid % 5 == 0 || tid % 9 == 0) + Output[tid] = __all_sync(masks[tid % 9 == 0], Input[tid]); +} + +static void runTestAll_4() { + const int size = 64; + int Input[size]; + std::fill_n(Input, size, 1); + Input[5] = 0; + + int Output[size]; + int Expected[size]; + + for (int i = 0; i != size; ++i) { + if (i % 9 == 0) { + Expected[i] = 1; + continue; + } + + if (i % 5 == 0) { + Expected[i] = 0; + continue; + } + + Expected[i] = -1; + } + + int* d_Input; + int* d_Output; + HIP_CHECK(hipMalloc(&d_Input, 4 * size)); + HIP_CHECK(hipMalloc(&d_Output, 4 * size)); + + int warpSize = getWarpSize(); + + HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault)); + hipLaunchKernelGGL(all_4, 1, warpSize, 0, 0, d_Input, d_Output); + + HIP_CHECK(hipMemcpy(&Output, d_Output, 4 * size, hipMemcpyDefault)); + for (int i = 0; i != warpSize; ++i) { + REQUIRE(Output[i] == Expected[i]); + } +} + +__global__ void ballot_1(int *Input, unsigned long long *Output) { + auto tid = threadIdx.x; + Output[tid] = __ballot_sync(AllThreads, Input[tid]); +} + +static void runTestBallot_1() { + const int size = 64; + int Input[size] = {0, 1, 0, 0, 1, 1, 1, 0, + 0, 1, 1, 1, 0, 0, 1, 0, + 1, 1, 1, 0, 0, 1, 0, 0, + 1, 0, 0, 1, 0, 0, 1, 1, + 0, 1, 0, 0, 1, 1, 1, 0, + 0, 1, 1, 1, 0, 0, 1, 0, + 1, 1, 1, 0, 0, 1, 0, 0, + 1, 0, 0, 1, 0, 0, 1, 1}; + unsigned long long Output[size]; + unsigned long long Expected[size] = { + 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, + 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, + 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, + 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, + 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, + 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, + 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, + 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, + 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, + 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, + 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, + 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, + 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, + 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, + 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, + 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72 + }; + + int* d_Input; + unsigned long long* d_Output; + HIP_CHECK(hipMalloc(&d_Input, 4 * size)); + HIP_CHECK(hipMalloc(&d_Output, 8 * size)); + + int warpSize = getWarpSize(); + + HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault)); + hipLaunchKernelGGL(ballot_1, 1, warpSize, 0, 0, d_Input, d_Output); + + HIP_CHECK(hipMemcpy(&Output, d_Output, 8 * size, hipMemcpyDefault)); + for (int i = 0; i != warpSize; ++i) { + REQUIRE(compareMaskEqual(Output, Expected, i, warpSize)); + } +} + +__global__ void ballot_2(int *Input, unsigned long long *Output) { + auto tid = threadIdx.x; + auto mask = __match_any_sync(AllThreads, tid / 12); + Output[tid] = __ballot_sync(mask, Input[tid]); +} + +static void runTestBallot_2() { + const int size = 64; + int Input[size] = {0, 1, 0, 0, 1, 1, 1, 0, + 0, 1, 1, 1, 0, 0, 1, 0, + 1, 1, 1, 0, 0, 1, 0, 0, + 1, 0, 0, 1, 0, 0, 1, 1, + 0, 1, 0, 0, 1, 1, 1, 0, + 0, 1, 1, 1, 0, 0, 1, 0, + 1, 1, 1, 0, 0, 1, 0, 0, + 1, 0, 0, 1, 0, 0, 1, 1}; + unsigned long long Output[size]; + unsigned long long Expected[size] = { + 0x0000000000000e72, 0x0000000000000e72, 0x0000000000000e72, 0x0000000000000e72, + 0x0000000000000e72, 0x0000000000000e72, 0x0000000000000e72, 0x0000000000000e72, + 0x0000000000000e72, 0x0000000000000e72, 0x0000000000000e72, 0x0000000000000e72, + 0x0000000000274000, 0x0000000000274000, 0x0000000000274000, 0x0000000000274000, + 0x0000000000274000, 0x0000000000274000, 0x0000000000274000, 0x0000000000274000, + 0x0000000000274000, 0x0000000000274000, 0x0000000000274000, 0x0000000000274000, + 0x00000002c9000000, 0x00000002c9000000, 0x00000002c9000000, 0x00000002c9000000, + 0x00000002c9000000, 0x00000002c9000000, 0x00000002c9000000, 0x00000002c9000000, + 0x00000002c9000000, 0x00000002c9000000, 0x00000002c9000000, 0x00000002c9000000, + 0x00004e7000000000, 0x00004e7000000000, 0x00004e7000000000, 0x00004e7000000000, + 0x00004e7000000000, 0x00004e7000000000, 0x00004e7000000000, 0x00004e7000000000, + 0x00004e7000000000, 0x00004e7000000000, 0x00004e7000000000, 0x00004e7000000000, + 0x0927000000000000, 0x0927000000000000, 0x0927000000000000, 0x0927000000000000, + 0x0927000000000000, 0x0927000000000000, 0x0927000000000000, 0x0927000000000000, + 0x0927000000000000, 0x0927000000000000, 0x0927000000000000, 0x0927000000000000, + 0xc000000000000000, 0xc000000000000000, 0xc000000000000000, 0xc000000000000000}; + + int* d_Input; + unsigned long long* d_Output; + HIP_CHECK(hipMalloc(&d_Input, 4 * size)); + HIP_CHECK(hipMalloc(&d_Output, 8 * size)); + + int warpSize = getWarpSize(); + + HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault)); + hipLaunchKernelGGL(ballot_2, 1, warpSize, 0, 0, d_Input, d_Output); + + HIP_CHECK(hipMemcpy(&Output, d_Output, 8 * size, hipMemcpyDefault)); + for (int i = 0; i != warpSize; ++i) { + REQUIRE(compareMaskEqual(Output, Expected, i, warpSize)); + } +} + +__global__ void ballot_3(int *Input, unsigned long long *Output) { + auto tid = threadIdx.x; + unsigned long long masks[2] = { Every5thBut9th, Every9thBit }; + + Output[tid] = -1; + if (tid % 5 == 0 || tid % 9 == 0) + Output[tid] = __ballot_sync(masks[tid % 9 == 0], Input[tid]); +} + +static void runTestBallot_3() { + const int size = 64; + int Input[size]; + std::fill_n(Input, size, 1); + + unsigned long long Output[size]; + unsigned long long Expected[size]; + + for (int i = 0; i != size; ++i) { + if (i % 9 == 0) { + Expected[i] = Every9thBit; + continue; + } + + if (i % 5 == 0) { + Expected[i] = Every5thBut9th; + continue; + } + + Expected[i] = -1; + } + + int* d_Input; + unsigned long long* d_Output; + HIP_CHECK(hipMalloc(&d_Input, 4 * size)); + HIP_CHECK(hipMalloc(&d_Output, 8 * size)); + + int warpSize = getWarpSize(); + + HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault)); + hipLaunchKernelGGL(ballot_3, 1, warpSize, 0, 0, d_Input, d_Output); + + HIP_CHECK(hipMemcpy(&Output, d_Output, 8 * size, hipMemcpyDefault)); + for (int i = 0; i != warpSize; ++i) { + REQUIRE(compareMaskEqual(Output, Expected, i, warpSize)); + } +} + +/** + * @addtogroup __vote_sync + * @{ + * @ingroup VoteSyncTest + * + * - `unsigned long long __any_sync(unsigned long long mask, int predicate)` + * - `unsigned long long __all_sync(unsigned long long mask, int predicate)` + * - `unsigned long long __ballot_sync(unsigned long long mask, int predicate)` + * + * Contains warp vote sync functions. + * @} + */ + +/** + * Test Description + * ------------------------ + * - Test cases to verify warp vote functions. + + * Test source + * ------------------------ + * - catch/unit/kernel/hipVoteSyncTests.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.6 + */ + +TEST_CASE("Unit_hipVoteSync_Any") { + runTestAny_1(); + runTestAny_2_w64(); + runTestAny_2_w32(); + runTestAny_3(); + runTestAny_4(); +} + +TEST_CASE("Unit_hipVoteSync_All") { + runTestAll_1_w64(); + runTestAll_1_w32(); + runTestAll_2(); + runTestAll_3(); + runTestAll_4(); +} + +TEST_CASE("Unit_hipVoteSync_Ballot") { + runTestBallot_1(); + runTestBallot_2(); + runTestBallot_3(); +} diff --git a/projects/hip-tests/catch/unit/warp/warp_common.hh b/projects/hip-tests/catch/unit/warp/warp_common.hh index 15818366ef..e0870492f7 100644 --- a/projects/hip-tests/catch/unit/warp/warp_common.hh +++ b/projects/hip-tests/catch/unit/warp/warp_common.hh @@ -19,11 +19,18 @@ THE SOFTWARE. #pragma once +#define HIP_ENABLE_WARP_SYNC_BUILTINS + #include #include #include -static __device__ bool deactivate_thread(const uint64_t* const active_masks) { +const unsigned long long Every5thBit = 0x1084210842108421; +const unsigned long long Every9thBit = 0x8040201008040201; +const unsigned long long Every5thBut9th = Every5thBit & ~Every9thBit; +const unsigned long long AllThreads = ~0; + +inline __device__ bool deactivate_thread(const uint64_t* const active_masks) { const auto warp = cooperative_groups::tiled_partition(cooperative_groups::this_thread_block(), warpSize); const auto block = cooperative_groups::this_thread_block(); @@ -34,17 +41,17 @@ static __device__ bool deactivate_thread(const uint64_t* const active_masks) { return !(active_masks[idx] & (static_cast(1) << warp.thread_rank())); } -static inline std::mt19937& GetRandomGenerator() { +inline std::mt19937& GetRandomGenerator() { static std::mt19937 mt(std::random_device{}()); return mt; } -template static inline T GenerateRandomInteger(const T min, const T max) { +template inline T GenerateRandomInteger(const T min, const T max) { std::uniform_int_distribution dist(min, max); return dist(GetRandomGenerator()); } -template static inline T GenerateRandomReal(const T min, const T max) { +template inline T GenerateRandomReal(const T min, const T max) { std::uniform_real_distribution dist(min, max); return dist(GetRandomGenerator()); } @@ -83,3 +90,78 @@ inline uint64_t get_active_mask(unsigned int warp_id, unsigned int warp_size) { } return active_mask; } + +template ::value, bool> = true> +inline T expandPrecision(int X) { return X; } + +template ::value, bool> = true> +inline T expandPrecision(int X) { + return X * 3.141592653589793115997963468544185161590576171875; +} + +template ::value, bool> = true> +inline __half expandPrecision(int X) { + return (__half)expandPrecision(X); +} + +template ::value, bool> = true> +inline __half2 expandPrecision(int X) { + __half H = expandPrecision(X); + return {H, H}; +} + +template ::value, bool> = true> +inline void expandPrecision(T* Array, int size) { + (void)Array; + (void)size; +} + +template ::value, bool> = true> +inline void expandPrecision(T *Array, int size) { + for (int i = 0; i != size; ++i) { + Array[i] *= 3.141592653589793115997963468544185161590576171875; + } +} + +template +inline void initializeInput(T *Input, int size) { + int Values[] = {0, -1, 2, 3, 4, 5, -6, 7, + 8, -9, 10, 11, 12, 13, -14, 15, + 16, 17, -18, 19, 20, -21, 22, 23, + 24, 25, 26, -27, 28, 29, 30, 31, + -32, 33, 34, 35, -36, 37, 38, -39, + 40, 41, 42, 43, -44, -45, 46, 47, + 48, 49, 50, -51, 52, 53, -54, 55, + 56, 57, -58, 59, 60, 61, 62, -63}; + + for (int i = 0; i != size; ++i) { + Input[i] = expandPrecision(Values[i]); + } +} + +template +inline void initializeExpected(T *Expected, int *Values, int size) { + for (int i = 0; i != size; ++i) { + Expected[i] = expandPrecision(Values[i]); + } +} + +template +inline bool compareEqual(T X, T Y) { return X == Y; } + +template <> +inline bool compareEqual(__half X, __half Y) { + return __half2float(X) == __half2float(Y); +} + +template <> +inline bool compareEqual(__half2 X, __half2 Y) { + return compareEqual(X.x, Y.x) && compareEqual(X.y, Y.y); +} + +inline bool compareMaskEqual(unsigned long long *Actual, unsigned long long *Expected, + int i, int warpSize) { + if (warpSize == 32) + return (unsigned)Actual[i] == (unsigned)Expected[i]; + return Actual[i] == Expected[i]; +}