SWDEV-453422: introduce warp sync tests
The following builtins are tested:
- __all_sync, __any_sync, __ballot_sync and __activemask
- __match_any_sync and __match_all_sync
- __shfl_sync, __shfl_up_sync, __shfl_down_sync, and __shfl_xor_sync
The tests for shfl (all variants) were manually validated on a CUDA setup.
NOTE:
- Unit_hipVoteSync_All temporarily disabled on Windows (SWDEV-452308).
- All new tests temporarily disabled on CUDA (SWDEV-453145).
Change-Id: I84b205a88aa24219d199c760793e2f19f0cf8f13
[ROCm/hip-tests commit: 53f0a9bd01]
Dieser Commit ist enthalten in:
committet von
Rakesh Roy
Ursprung
a27e1a5aa1
Commit
ec1f43cd05
@@ -1435,6 +1435,8 @@
|
||||
"=== SWDEV-453453 : Below tests failed in stress test on 22/03/24 ===",
|
||||
"Unit_hipDeviceGetGraphMemAttribute_Functional",
|
||||
"Unit_hipDeviceGetGraphMemAttribute_Functional_Multi_Device",
|
||||
"=== SWDEV-452308 : Windows failure, should be fixed for ROCm 6.3. ===",
|
||||
"Unit_hipVoteSync_All",
|
||||
#endif
|
||||
"End of json"
|
||||
]
|
||||
|
||||
@@ -170,6 +170,21 @@ static void initHipCtx(hipCtx_t* pcontext) {
|
||||
#define HIP_ARRAY hipArray_t
|
||||
#endif
|
||||
|
||||
static inline int getWarpSize() {
|
||||
#if HT_NVIDIA
|
||||
return 32;
|
||||
#elif HT_AMD
|
||||
int device = -1;
|
||||
int warpSize = -1;
|
||||
HIP_CHECK(hipGetDevice(&device));
|
||||
HIP_CHECK(hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, device));
|
||||
return warpSize;
|
||||
#else
|
||||
std::cout<<"Have to be either Nvidia or AMD platform, asserting"<<std::endl;
|
||||
assert(false);
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline bool IsGfx11() {
|
||||
#if HT_NVIDIA
|
||||
return false;
|
||||
|
||||
@@ -22,7 +22,6 @@
|
||||
set(TEST_SRC
|
||||
hipMemFaultStackAllocation.cc
|
||||
hipLaunchBounds.cc
|
||||
hipShflTests.cc
|
||||
hipDynamicShared.cc
|
||||
hipDynamicShared2.cc
|
||||
hipEmptyKernel.cc
|
||||
@@ -43,7 +42,6 @@ endif()
|
||||
# only for AMD
|
||||
if(HIP_PLATFORM MATCHES "amd")
|
||||
set(AMD_SRC
|
||||
hipShflUpDownTest.cc
|
||||
hipExtLaunchKernelGGL.cc
|
||||
)
|
||||
set(TEST_SRC ${TEST_SRC} ${AMD_SRC})
|
||||
|
||||
@@ -15,6 +15,7 @@ set(AMD_TEST_SRC
|
||||
hipRtcBfloat16.cc
|
||||
linker.cc
|
||||
shfl.cc
|
||||
shfl_sync.cc
|
||||
stdheaders.cc
|
||||
hiprtc_MathConstants_HeaderTst.cc
|
||||
hiprtc_VectorTypes_HeaderTst.cc
|
||||
|
||||
@@ -31,8 +31,7 @@ THE SOFTWARE.
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <vector>
|
||||
|
||||
static constexpr int n = 32;
|
||||
#include "shfl.hh"
|
||||
|
||||
static constexpr auto shfl {
|
||||
R"(
|
||||
@@ -63,45 +62,6 @@ __global__ void shflXorSum(T* a, int size) {
|
||||
}
|
||||
)"};
|
||||
|
||||
void getFactor(int& fact) { fact = 101; }
|
||||
void getFactor(__half& fact) { fact = 2.5; }
|
||||
|
||||
template <typename T> T sum(T* a) {
|
||||
T cpuSum = 0;
|
||||
T factor;
|
||||
getFactor(factor);
|
||||
for (int i = 0; i < n; i++) {
|
||||
a[i] = i + factor;
|
||||
cpuSum += a[i];
|
||||
}
|
||||
return cpuSum;
|
||||
}
|
||||
|
||||
template <typename T> bool compare(T gpuSum, T cpuSum) {
|
||||
if (gpuSum != cpuSum) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
template <> __half sum(__half* a) {
|
||||
__half cpuSum = 0;
|
||||
__half factor;
|
||||
getFactor(factor);
|
||||
for (int i = 0; i < n; i++) {
|
||||
a[i] = i + __half2float(factor);
|
||||
cpuSum = __half2float(cpuSum) + __half2float(a[i]);
|
||||
}
|
||||
return cpuSum;
|
||||
}
|
||||
|
||||
template <> bool compare(__half gpuSum, __half cpuSum) {
|
||||
if (__half2float(gpuSum) != __half2float(cpuSum)) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void runTestShfl(int option) {
|
||||
using namespace std;
|
||||
|
||||
@@ -0,0 +1,64 @@
|
||||
/*
|
||||
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
static constexpr int n = 32;
|
||||
|
||||
inline void getFactor(int& fact) { fact = 101; }
|
||||
inline void getFactor(__half& fact) { fact = 2.5; }
|
||||
|
||||
template <typename T> inline T sum(T* a) {
|
||||
T cpuSum = 0;
|
||||
T factor;
|
||||
getFactor(factor);
|
||||
for (int i = 0; i < n; i++) {
|
||||
a[i] = i + factor;
|
||||
cpuSum += a[i];
|
||||
}
|
||||
return cpuSum;
|
||||
}
|
||||
|
||||
template <typename T> inline bool compare(T gpuSum, T cpuSum) {
|
||||
if (gpuSum != cpuSum) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
template <> inline __half sum(__half* a) {
|
||||
__half cpuSum = 0;
|
||||
__half factor;
|
||||
getFactor(factor);
|
||||
for (int i = 0; i < n; i++) {
|
||||
a[i] = i + __half2float(factor);
|
||||
cpuSum = __half2float(cpuSum) + __half2float(a[i]);
|
||||
}
|
||||
return cpuSum;
|
||||
}
|
||||
|
||||
template <> inline bool compare(__half gpuSum, __half cpuSum) {
|
||||
if (__half2float(gpuSum) != __half2float(cpuSum)) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
@@ -0,0 +1,161 @@
|
||||
/*
|
||||
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
|
||||
#include <hip/hiprtc.h>
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hip/hip_fp16.h>
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
#include <memory>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <vector>
|
||||
#include "shfl.hh"
|
||||
|
||||
static constexpr auto shfl {
|
||||
R"(
|
||||
template <typename T>
|
||||
__global__ void shflUpSum(T* a, int size) {
|
||||
T val = a[threadIdx.x];
|
||||
auto all_threads = __activemask();
|
||||
for (unsigned int i = size / 2; i > 0; i /= 2) {
|
||||
val += __shfl_up_sync(all_threads, val, i, size);
|
||||
}
|
||||
a[threadIdx.x] = val;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void shflDownSum(T* a, int size) {
|
||||
T val = a[threadIdx.x];
|
||||
auto all_threads = __activemask();
|
||||
for (int i = size / 2; i > 0; i /= 2) {
|
||||
val += __shfl_down_sync(all_threads, val, i, size);
|
||||
}
|
||||
a[threadIdx.x] = val;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void shflXorSum(T* a, int size) {
|
||||
T val = a[threadIdx.x];
|
||||
auto all_threads = __activemask();
|
||||
for (int i = size/2; i > 0; i /= 2) {
|
||||
val += __shfl_xor_sync(all_threads, val, i, size);
|
||||
}
|
||||
a[threadIdx.x] = val;
|
||||
}
|
||||
)"};
|
||||
|
||||
template <typename T>
|
||||
void runTestShflSync(int option) {
|
||||
using namespace std;
|
||||
hiprtcProgram prog;
|
||||
hiprtcCreateProgram(&prog, // prog
|
||||
shfl, // buffer
|
||||
"shfl_sync.cu", // name
|
||||
0, nullptr, nullptr);
|
||||
|
||||
string str;
|
||||
switch(option) {
|
||||
case 1:
|
||||
str = "shflUpSum<__half>"; break;
|
||||
case 2:
|
||||
str = "shflDownSum<__half>"; break;
|
||||
case 3:
|
||||
str = "shflXorSum<__half>"; break;
|
||||
default:
|
||||
INFO("Options 1,2,3 are supported, but the passed option is: " << option);
|
||||
REQUIRE(false);
|
||||
}
|
||||
|
||||
hiprtcAddNameExpression(prog, str.c_str());
|
||||
const char* options[] = { "-DHIP_ENABLE_WARP_SYNC_BUILTINS" };
|
||||
hiprtcResult compileResult{hiprtcCompileProgram(prog, 1, options)};
|
||||
size_t logSize;
|
||||
HIPRTC_CHECK(hiprtcGetProgramLogSize(prog, &logSize));
|
||||
if (logSize) {
|
||||
string log(logSize, '\0');
|
||||
HIPRTC_CHECK(hiprtcGetProgramLog(prog, &log[0]));
|
||||
std::cout << log << '\n';
|
||||
}
|
||||
REQUIRE(compileResult == HIPRTC_SUCCESS);
|
||||
size_t codeSize;
|
||||
HIPRTC_CHECK(hiprtcGetCodeSize(prog, &codeSize));
|
||||
|
||||
vector<char> code(codeSize);
|
||||
HIPRTC_CHECK(hiprtcGetCode(prog, code.data()));
|
||||
|
||||
// Do hip malloc first so that we donot need to do a cuInit manually before calling hipModule APIs
|
||||
size_t bufferSize = n * sizeof(T);
|
||||
|
||||
T a[n];
|
||||
T cpuSum = sum(a);
|
||||
T* d_a;
|
||||
HIP_CHECK(hipMalloc(&d_a, bufferSize));
|
||||
|
||||
hipModule_t module;
|
||||
hipFunction_t kernel;
|
||||
HIP_CHECK(hipModuleLoadData(&module, code.data()));
|
||||
const char* name;
|
||||
hiprtcGetLoweredName(prog, str.c_str(), &name);
|
||||
HIP_CHECK(hipModuleGetFunction(&kernel, module, name));
|
||||
|
||||
HIP_CHECK(hipMemcpy(d_a, &a, bufferSize, hipMemcpyDefault));
|
||||
|
||||
struct {
|
||||
T* a_;
|
||||
int b_;
|
||||
} args{d_a, n};
|
||||
|
||||
auto size = sizeof(args);
|
||||
void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args, HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
|
||||
HIP_LAUNCH_PARAM_END};
|
||||
|
||||
HIP_CHECK(hipModuleLaunchKernel(kernel, 1, 1, 1, n, 1, 1, 0, nullptr, nullptr, config));
|
||||
|
||||
HIP_CHECK(hipMemcpy(&a, d_a, bufferSize, hipMemcpyDefault));
|
||||
bool result;
|
||||
switch (option) {
|
||||
case 1: //shflUpSum
|
||||
result = compare(a[n - 1], cpuSum); break;
|
||||
case 2: //shflDownSum
|
||||
case 3: //shflXorSum
|
||||
result = compare(a[0], cpuSum); break;
|
||||
}
|
||||
|
||||
if (result) {
|
||||
HIP_CHECK(hipFree(d_a));
|
||||
REQUIRE(false);
|
||||
}
|
||||
|
||||
HIP_CHECK(hipFree(d_a));
|
||||
HIP_CHECK(hipModuleUnload(module));
|
||||
HIPRTC_CHECK(hiprtcDestroyProgram(&prog));
|
||||
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hiprtc_half_shuffle_sync") {
|
||||
runTestShflSync<__half>(1);
|
||||
runTestShflSync<__half>(2);
|
||||
runTestShflSync<__half>(3);
|
||||
}
|
||||
@@ -11,6 +11,18 @@ if(HIP_PLATFORM MATCHES "amd")
|
||||
warp_shfl_xor.cc
|
||||
warp_shfl_up.cc
|
||||
warp_shfl_down.cc
|
||||
hipShflUpDownTest.cc
|
||||
hipShflTests.cc
|
||||
# FIXME: The tests for sync intrinsics are temporarily disabled on CUDA
|
||||
# because they depend on __match_any_sync, which is not available on older
|
||||
# NVIDIA devices. (SWDEV-453145)
|
||||
hipMatchSyncAllTests.cc
|
||||
hipMatchSyncAnyTests.cc
|
||||
hipShflSyncDownTests.cc
|
||||
hipShflSyncUpTests.cc
|
||||
hipShflSyncXorTests.cc
|
||||
hipShflSyncTests.cc
|
||||
hipVoteSyncTests.cc
|
||||
)
|
||||
endif()
|
||||
|
||||
|
||||
@@ -0,0 +1,281 @@
|
||||
/*
|
||||
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "warp_common.hh"
|
||||
#include <hip_test_common.hh>
|
||||
|
||||
template <typename T>
|
||||
__global__ void matchAll_1(T *Input, unsigned long long *Output, int *Predicate) {
|
||||
auto tid = threadIdx.x;
|
||||
|
||||
Output[tid] = __match_all_sync(AllThreads, Input[tid], &Predicate[tid]);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void runTestMatchAll_1() {
|
||||
const int size = 64;
|
||||
T Input[size] = {(T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
|
||||
(T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
|
||||
(T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
|
||||
(T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
|
||||
(T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
|
||||
(T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
|
||||
(T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
|
||||
(T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,};
|
||||
unsigned long long Output[size];
|
||||
unsigned long long Expected[size];
|
||||
std::fill_n(Expected, size, -1);
|
||||
|
||||
int Predicate[size];
|
||||
int ExpPredicate[size];
|
||||
std::fill_n(ExpPredicate, size, true);
|
||||
|
||||
expandPrecision(Input, size);
|
||||
|
||||
int warpSize = getWarpSize();
|
||||
|
||||
T* d_Input;
|
||||
unsigned long long* d_Output;
|
||||
int *d_Predicate;
|
||||
HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size));
|
||||
HIP_CHECK(hipMalloc(&d_Output, 8 * size));
|
||||
HIP_CHECK(hipMalloc(&d_Predicate, 4 * size));
|
||||
|
||||
HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault));
|
||||
hipLaunchKernelGGL(matchAll_1<T>, 1, warpSize, 0, 0, d_Input, d_Output, d_Predicate);
|
||||
|
||||
HIP_CHECK(hipMemcpy(&Output, d_Output, 8 * size, hipMemcpyDefault));
|
||||
for (int i = 0; i != warpSize; ++i) {
|
||||
REQUIRE(compareMaskEqual(Output, Expected, i, warpSize));
|
||||
}
|
||||
|
||||
HIP_CHECK(hipMemcpy(&Predicate, d_Predicate, 4 * size, hipMemcpyDefault));
|
||||
for (int i = 0; i != warpSize; ++i) {
|
||||
REQUIRE(Predicate[i] == ExpPredicate[i]);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void matchAll_2(T *Input, unsigned long long *Output, int *Predicate) {
|
||||
auto tid = threadIdx.x;
|
||||
|
||||
Output[tid] = __match_all_sync(AllThreads, Input[tid], &Predicate[tid]);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void runTestMatchAll_2() {
|
||||
const int size = 64;
|
||||
T Input[size] = {(T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
|
||||
(T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
|
||||
(T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
|
||||
(T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
|
||||
(T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
|
||||
(T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
|
||||
(T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
|
||||
(T)-5, (T)-5, (T)-5, (T)-500, (T)-5, (T)-5, (T)-5, (T)-5,};
|
||||
unsigned long long Output[size];
|
||||
unsigned long long Expected[size];
|
||||
|
||||
int warpSize = getWarpSize();
|
||||
|
||||
if (warpSize == 32)
|
||||
std::fill_n(Expected, size, -1);
|
||||
else
|
||||
std::fill_n(Expected, size, 0);
|
||||
|
||||
int Predicate[size];
|
||||
int ExpPredicate[size];
|
||||
if (warpSize == 32)
|
||||
std::fill_n(ExpPredicate, size, true);
|
||||
else
|
||||
std::fill_n(ExpPredicate, size, false);
|
||||
|
||||
expandPrecision(Input, size);
|
||||
|
||||
T* d_Input;
|
||||
unsigned long long* d_Output;
|
||||
int *d_Predicate;
|
||||
HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size));
|
||||
HIP_CHECK(hipMalloc(&d_Output, 8 * size));
|
||||
HIP_CHECK(hipMalloc(&d_Predicate, 4 * size));
|
||||
|
||||
HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault));
|
||||
hipLaunchKernelGGL(matchAll_2<T>, 1, warpSize, 0, 0, d_Input, d_Output, d_Predicate);
|
||||
|
||||
HIP_CHECK(hipMemcpy(&Output, d_Output, 8 * size, hipMemcpyDefault));
|
||||
for (int i = 0; i != warpSize; ++i) {
|
||||
REQUIRE(compareMaskEqual(Output, Expected, i, warpSize));
|
||||
}
|
||||
|
||||
HIP_CHECK(hipMemcpy(&Predicate, d_Predicate, 4 * size, hipMemcpyDefault));
|
||||
for (int i = 0; i != warpSize; ++i) {
|
||||
REQUIRE(Predicate[i] == ExpPredicate[i]);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void matchAll_3(T *Input, unsigned long long *Output, int *Predicate) {
|
||||
auto tid = threadIdx.x;
|
||||
// It's okay to use the non-sync__ match, because the purpose of the test is
|
||||
// to exercise the mask argument on the sync version.
|
||||
auto mask = __match_any_sync(AllThreads, tid / 12);
|
||||
|
||||
Output[tid] = __match_all_sync(mask, Input[tid], &Predicate[tid]);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void runTestMatchAll_3() {
|
||||
const int size = 64;
|
||||
T Input[size] = {(T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
|
||||
(T)-5, (T)-5, (T)-5, (T)-5, (T)-500, (T)-5, (T)-5, (T)-5,
|
||||
(T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
|
||||
(T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
|
||||
(T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-500, (T)-5, (T)-5,
|
||||
(T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
|
||||
(T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5, (T)-5,
|
||||
(T)-5, (T)-5, (T)-5, (T)-500, (T)-5, (T)-5, (T)-5, (T)-5,};
|
||||
unsigned long long Output[size];
|
||||
unsigned long long Expected[size] = {
|
||||
0xfff, 0xfff, 0xfff, 0xfff,
|
||||
0xfff, 0xfff, 0xfff, 0xfff,
|
||||
0xfff, 0xfff, 0xfff, 0xfff,
|
||||
0, 0, 0, 0,
|
||||
0, 0, 0, 0,
|
||||
0, 0, 0, 0,
|
||||
0xfff000000, 0xfff000000, 0xfff000000, 0xfff000000,
|
||||
0xfff000000, 0xfff000000, 0xfff000000, 0xfff000000,
|
||||
0xfff000000, 0xfff000000, 0xfff000000, 0xfff000000,
|
||||
0, 0, 0, 0,
|
||||
0, 0, 0, 0,
|
||||
0, 0, 0, 0,
|
||||
0, 0, 0, 0,
|
||||
0, 0, 0, 0,
|
||||
0, 0, 0, 0,
|
||||
0xf000000000000000, 0xf000000000000000, 0xf000000000000000, 0xf000000000000000
|
||||
};
|
||||
|
||||
int Predicate[size];
|
||||
int ExpPredicate[size] {
|
||||
true, true, true, true,
|
||||
true, true, true, true,
|
||||
true, true, true, true,
|
||||
false, false, false, false,
|
||||
false, false, false, false,
|
||||
false, false, false, false,
|
||||
true, true, true, true,
|
||||
true, true, true, true,
|
||||
true, true, true, true,
|
||||
false, false, false, false,
|
||||
false, false, false, false,
|
||||
false, false, false, false,
|
||||
false, false, false, false,
|
||||
false, false, false, false,
|
||||
false, false, false, false,
|
||||
true, true, true, true
|
||||
};
|
||||
|
||||
expandPrecision(Input, size);
|
||||
|
||||
int warpSize = getWarpSize();
|
||||
|
||||
T* d_Input;
|
||||
unsigned long long* d_Output;
|
||||
int *d_Predicate;
|
||||
HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size));
|
||||
HIP_CHECK(hipMalloc(&d_Output, 8 * size));
|
||||
HIP_CHECK(hipMalloc(&d_Predicate, 4 * size));
|
||||
|
||||
HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault));
|
||||
hipLaunchKernelGGL(matchAll_3<T>, 1, warpSize, 0, 0, d_Input, d_Output, d_Predicate);
|
||||
|
||||
HIP_CHECK(hipMemcpy(&Output, d_Output, 8 * size, hipMemcpyDefault));
|
||||
for (int i = 0; i != warpSize; ++i) {
|
||||
REQUIRE(compareMaskEqual(Output, Expected, i, warpSize));
|
||||
}
|
||||
|
||||
HIP_CHECK(hipMemcpy(&Predicate, d_Predicate, 4 * size, hipMemcpyDefault));
|
||||
for (int i = 0; i != warpSize; ++i) {
|
||||
REQUIRE(Predicate[i] == ExpPredicate[i]);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @addtogroup __match_sync
|
||||
* @{
|
||||
* @ingroup MatchSyncTest
|
||||
* `unsigned long long __match_all_sync(unsigned long long mask, T value, int *pred)` -
|
||||
* Contains warp __match sync functions.
|
||||
* @}
|
||||
*/
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Test case to verify __match_sync warp functions for different datatypes.
|
||||
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - catch/unit/kernel/hipMatchSyncAllTests.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.6
|
||||
*/
|
||||
|
||||
TEST_CASE("Unit_hipMatchSync_All") {
|
||||
SECTION("run test for int") {
|
||||
runTestMatchAll_1<int>();
|
||||
runTestMatchAll_2<int>();
|
||||
runTestMatchAll_3<int>();
|
||||
}
|
||||
SECTION("run test for unsigned int") {
|
||||
runTestMatchAll_1<unsigned int>();
|
||||
runTestMatchAll_2<unsigned int>();
|
||||
runTestMatchAll_3<unsigned int>();
|
||||
}
|
||||
SECTION("run test for long") {
|
||||
runTestMatchAll_1<long>();
|
||||
runTestMatchAll_2<long>();
|
||||
runTestMatchAll_3<long>();
|
||||
}
|
||||
SECTION("run test for unsigned long") {
|
||||
runTestMatchAll_1<unsigned long>();
|
||||
runTestMatchAll_2<unsigned long>();
|
||||
runTestMatchAll_3<unsigned long>();
|
||||
}
|
||||
SECTION("run test for long long") {
|
||||
runTestMatchAll_1<long long>();
|
||||
runTestMatchAll_2<long long>();
|
||||
runTestMatchAll_3<long long>();
|
||||
}
|
||||
SECTION("run test for unsigned long long") {
|
||||
runTestMatchAll_1<unsigned long long>();
|
||||
runTestMatchAll_2<unsigned long long>();
|
||||
runTestMatchAll_3<unsigned long long>();
|
||||
}
|
||||
SECTION("run test for float") {
|
||||
runTestMatchAll_1<float>();
|
||||
runTestMatchAll_2<float>();
|
||||
runTestMatchAll_3<float>();
|
||||
}
|
||||
SECTION("run test for double") {
|
||||
runTestMatchAll_1<double>();
|
||||
runTestMatchAll_2<double>();
|
||||
runTestMatchAll_3<double>();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,196 @@
|
||||
/*
|
||||
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "warp_common.hh"
|
||||
#include <hip_test_common.hh>
|
||||
|
||||
template <typename T>
|
||||
__global__ void matchAny_1(T *Input, unsigned long long *Output) {
|
||||
auto tid = threadIdx.x;
|
||||
Output[tid] = __match_any_sync(AllThreads, Input[tid]);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void runTestMatchAny_1() {
|
||||
const int size = 64;
|
||||
T Input[size] = {0, 1, (T)-2, (T)-3, 4, 5, 6, (T)-7,
|
||||
(T)-3, 4, 5, 6, (T)-7, 0, 1, (T)-2,
|
||||
4, 5, 6, (T)-7, 0, 1, (T)-2, (T)-3,
|
||||
6, (T)-7, 0, 1, (T)-2, (T)-3, 4, 5,
|
||||
0, 1, (T)-2, (T)-3, 4, 5, 6, (T)-7,
|
||||
(T)-3, 4, 5, 6, (T)-7, 0, 1, (T)-2,
|
||||
4, 5, 6, (T)-7, 0, 1, (T)-2, (T)-3,
|
||||
6, (T)-7, 0, 1, (T)-2, (T)-3, 4, 5};
|
||||
unsigned long long Output[size];
|
||||
unsigned long long Expected[size] = {
|
||||
0x0410200104102001, 0x0820400208204002, 0x1040800410408004, 0x2080010820800108,
|
||||
0x4001021040010210, 0x8002042080020420, 0x0104084001040840, 0x0208108002081080,
|
||||
0x2080010820800108, 0x4001021040010210, 0x8002042080020420, 0x0104084001040840,
|
||||
0x0208108002081080, 0x0410200104102001, 0x0820400208204002, 0x1040800410408004,
|
||||
0x4001021040010210, 0x8002042080020420, 0x0104084001040840, 0x0208108002081080,
|
||||
0x0410200104102001, 0x0820400208204002, 0x1040800410408004, 0x2080010820800108,
|
||||
0x0104084001040840, 0x0208108002081080, 0x0410200104102001, 0x0820400208204002,
|
||||
0x1040800410408004, 0x2080010820800108, 0x4001021040010210, 0x8002042080020420,
|
||||
0x0410200104102001, 0x0820400208204002, 0x1040800410408004, 0x2080010820800108,
|
||||
0x4001021040010210, 0x8002042080020420, 0x0104084001040840, 0x0208108002081080,
|
||||
0x2080010820800108, 0x4001021040010210, 0x8002042080020420, 0x0104084001040840,
|
||||
0x0208108002081080, 0x0410200104102001, 0x0820400208204002, 0x1040800410408004,
|
||||
0x4001021040010210, 0x8002042080020420, 0x0104084001040840, 0x0208108002081080,
|
||||
0x0410200104102001, 0x0820400208204002, 0x1040800410408004, 0x2080010820800108,
|
||||
0x0104084001040840, 0x0208108002081080, 0x0410200104102001, 0x0820400208204002,
|
||||
0x1040800410408004, 0x2080010820800108, 0x4001021040010210, 0x8002042080020420
|
||||
};
|
||||
|
||||
expandPrecision(Input, size);
|
||||
|
||||
int warpSize = getWarpSize();
|
||||
|
||||
T* d_Input;
|
||||
unsigned long long* d_Output;
|
||||
HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size));
|
||||
HIP_CHECK(hipMalloc(&d_Output, 8 * size));
|
||||
|
||||
HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault));
|
||||
hipLaunchKernelGGL(matchAny_1<T>, 1, warpSize, 0, 0, d_Input, d_Output);
|
||||
|
||||
HIP_CHECK(hipMemcpy(&Output, d_Output, 8 * size, hipMemcpyDefault));
|
||||
for (int i = 0; i != warpSize; ++i) {
|
||||
REQUIRE(compareMaskEqual(Output, Expected, i, warpSize));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void matchAny_2(T *Input, unsigned long long *Output) {
|
||||
auto tid = threadIdx.x;
|
||||
// It's okay to use the non-sync__ match, because the purpose of the test is
|
||||
// to exercise the mask argument on the sync version.
|
||||
auto mask = __match_any_sync(AllThreads, tid / 12);
|
||||
Output[tid] = __match_any_sync(mask, Input[tid]);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void runTestMatchAny_2() {
|
||||
const int size = 64;
|
||||
T Input[size] = {0, 1, (T)-2, (T)-3, 4, 5, 6, (T)-7,
|
||||
(T)-3, 4, 5, 6, (T)-7, 0, 1, (T)-2,
|
||||
4, 5, 6, (T)-7, 0, 1, (T)-2, (T)-3,
|
||||
6, (T)-7, 0, 1, (T)-2, (T)-3, 4, 5,
|
||||
0, 1, (T)-2, (T)-3, 4, 5, 6, (T)-7,
|
||||
(T)-3, 4, 5, 6, (T)-7, 0, 1, (T)-2,
|
||||
4, 5, 6, (T)-7, 0, 1, (T)-2, (T)-3,
|
||||
6, (T)-7, 0, 1, (T)-2, (T)-3, 4, 5};
|
||||
unsigned long long Output[size];
|
||||
unsigned long long Expected[size] = {
|
||||
0x0000000000000001, 0x0000000000000002, 0x0000000000000004, 0x0000000000000108,
|
||||
0x0000000000000210, 0x0000000000000420, 0x0000000000000840, 0x0000000000000080,
|
||||
0x0000000000000108, 0x0000000000000210, 0x0000000000000420, 0x0000000000000840,
|
||||
|
||||
0x0000000000081000, 0x0000000000102000, 0x0000000000204000, 0x0000000000408000,
|
||||
0x0000000000010000, 0x0000000000020000, 0x0000000000040000, 0x0000000000081000,
|
||||
0x0000000000102000, 0x0000000000204000, 0x0000000000408000, 0x0000000000800000,
|
||||
|
||||
0x0000000001000000, 0x0000000002000000, 0x0000000104000000, 0x0000000208000000,
|
||||
0x0000000410000000, 0x0000000820000000, 0x0000000040000000, 0x0000000080000000,
|
||||
0x0000000104000000, 0x0000000208000000, 0x0000000410000000, 0x0000000820000000,
|
||||
|
||||
0x0000021000000000, 0x0000042000000000, 0x0000084000000000, 0x0000108000000000,
|
||||
0x0000010000000000, 0x0000021000000000, 0x0000042000000000, 0x0000084000000000,
|
||||
0x0000108000000000, 0x0000200000000000, 0x0000400000000000, 0x0000800000000000,
|
||||
|
||||
0x0001000000000000, 0x0002000000000000, 0x0104000000000000, 0x0208000000000000,
|
||||
0x0410000000000000, 0x0820000000000000, 0x0040000000000000, 0x0080000000000000,
|
||||
0x0104000000000000, 0x0208000000000000, 0x0410000000000000, 0x0820000000000000,
|
||||
|
||||
0x1000000000000000, 0x2000000000000000, 0x4000000000000000, 0x8000000000000000
|
||||
};
|
||||
|
||||
expandPrecision(Input, size);
|
||||
|
||||
int warpSize = getWarpSize();
|
||||
|
||||
T* d_Input;
|
||||
unsigned long long* d_Output;
|
||||
HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size));
|
||||
HIP_CHECK(hipMalloc(&d_Output, 8 * size));
|
||||
|
||||
HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault));
|
||||
hipLaunchKernelGGL(matchAny_2<T>, 1, warpSize, 0, 0, d_Input, d_Output);
|
||||
|
||||
HIP_CHECK(hipMemcpy(&Output, d_Output, 8 * size, hipMemcpyDefault));
|
||||
for (int i = 0; i != warpSize; ++i) {
|
||||
REQUIRE(compareMaskEqual(Output, Expected, i, warpSize));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @addtogroup __match_sync
|
||||
* @{
|
||||
* @ingroup MatchSyncTest
|
||||
* `unsigned long long __match_any_sync(unsigned long long mask, T value, int *pred)` -
|
||||
* Contains warp __match sync functions.
|
||||
* @}
|
||||
*/
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Test case to verify __match_sync warp functions for different datatypes.
|
||||
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - catch/unit/kernel/hipMatchSyncAnyTests.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.6
|
||||
*/
|
||||
|
||||
TEST_CASE("Unit_hipMatchSync_Any") {
|
||||
SECTION("run test for int") {
|
||||
runTestMatchAny_1<int>();
|
||||
runTestMatchAny_2<int>();
|
||||
}
|
||||
SECTION("run test for unsigned int") {
|
||||
runTestMatchAny_1<unsigned int>();
|
||||
runTestMatchAny_2<unsigned int>();
|
||||
}
|
||||
SECTION("run test for long") {
|
||||
runTestMatchAny_1<long>();
|
||||
runTestMatchAny_2<long>();
|
||||
}
|
||||
SECTION("run test for unsigned long") {
|
||||
runTestMatchAny_1<unsigned long>();
|
||||
runTestMatchAny_2<unsigned long>();
|
||||
}
|
||||
SECTION("run test for long long") {
|
||||
runTestMatchAny_1<long long>();
|
||||
runTestMatchAny_2<long long>();
|
||||
}
|
||||
SECTION("run test for unsigned long long") {
|
||||
runTestMatchAny_1<unsigned long long>();
|
||||
runTestMatchAny_2<unsigned long long>();
|
||||
}
|
||||
SECTION("run test for float") {
|
||||
runTestMatchAny_1<float>();
|
||||
runTestMatchAny_2<float>();
|
||||
}
|
||||
SECTION("run test for double") {
|
||||
runTestMatchAny_1<double>();
|
||||
runTestMatchAny_2<double>();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,251 @@
|
||||
/*
|
||||
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "warp_common.hh"
|
||||
#include <hip_test_common.hh>
|
||||
|
||||
// For all threads in the warp, shfl the value "down" by three threads. To
|
||||
// account for the end of the warp, we set the delta to zero near the warp-32
|
||||
// boundary. This also works for warp-64 since it is a multiple.
|
||||
template <typename T>
|
||||
__global__ void shflDown_1(T* Input, T *Output) {
|
||||
auto tid = threadIdx.x;
|
||||
int srcLane = (tid % 32 + 3 < 32) ? 3 : 0;
|
||||
Output[tid] = __shfl_down_sync(AllThreads, Input[tid], srcLane);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void runTestShflDown_1() {
|
||||
const int size = 64;
|
||||
T Input[size];
|
||||
T Output[size];
|
||||
int Values[size] = {3, 4, 5, -6, 7, 8, -9, 10,
|
||||
11, 12, 13, -14, 15, 16, 17, -18,
|
||||
19, 20, -21, 22, 23, 24, 25, 26,
|
||||
-27, 28, 29, 30, 31, 29, 30, 31,
|
||||
35, -36, 37, 38, -39, 40, 41, 42,
|
||||
43, -44, -45, 46, 47, 48, 49, 50,
|
||||
-51, 52, 53, -54, 55, 56, 57, -58,
|
||||
59, 60, 61, 62, -63, 61, 62, -63};
|
||||
T Expected[size];
|
||||
|
||||
initializeInput(Input, size);
|
||||
initializeExpected(Expected, Values, size);
|
||||
|
||||
int warpSize = getWarpSize();
|
||||
|
||||
T* d_Input;
|
||||
T* d_Output;
|
||||
HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size));
|
||||
HIP_CHECK(hipMalloc(&d_Output, sizeof(T) * size));
|
||||
|
||||
HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault));
|
||||
hipLaunchKernelGGL(shflDown_1<T>, 1, warpSize, 0, 0, d_Input, d_Output);
|
||||
|
||||
HIP_CHECK(hipMemcpy(&Output, d_Output, sizeof(T) * size, hipMemcpyDefault));
|
||||
for (int i = 0; i != warpSize; ++i) {
|
||||
REQUIRE(compareEqual(Output[i], Expected[i]));
|
||||
}
|
||||
}
|
||||
|
||||
// Use the mask argument to divide the warp into groups of 12 threads, and then
|
||||
// shfl "down" by three threads. Account for the boundary within a group as well
|
||||
// as withing a warp-32.
|
||||
|
||||
template <typename T>
|
||||
__global__ void shflDown_2(T* Input, T *Output) {
|
||||
auto tid = threadIdx.x;
|
||||
auto mask = __match_any_sync(AllThreads, tid / 12);
|
||||
int srcLane = ((tid % 32 + 3 >= 32) || (tid % 12 + 3 >= 12)) ? 0 : 3;
|
||||
Output[tid] = __shfl_down_sync(mask, Input[tid], srcLane);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void runTestShflDown_2() {
|
||||
const int size = 64;
|
||||
T Input[size];
|
||||
T Output[size];
|
||||
int Values[size] = {3, 4, 5, -6, 7, 8, -9, 10,
|
||||
11, -9, 10, 11, 15, 16, 17, -18,
|
||||
19, 20, -21, 22, 23, -21, 22, 23,
|
||||
-27, 28, 29, 30, 31, 29, 30, 31,
|
||||
35, 33, 34, 35, -39, 40, 41, 42,
|
||||
43, -44, -45, 46, 47, -45, 46, 47,
|
||||
-51, 52, 53, -54, 55, 56, 57, -58,
|
||||
59, 57, -58, 59, -63, 61, 62, -63};
|
||||
T Expected[size];
|
||||
|
||||
initializeInput(Input, size);
|
||||
initializeExpected(Expected, Values, size);
|
||||
|
||||
int warpSize = getWarpSize();
|
||||
|
||||
T* d_Input;
|
||||
T* d_Output;
|
||||
HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size));
|
||||
HIP_CHECK(hipMalloc(&d_Output, sizeof(T) * size));
|
||||
|
||||
HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault));
|
||||
hipLaunchKernelGGL(shflDown_2<T>, 1, warpSize, 0, 0, d_Input, d_Output);
|
||||
|
||||
HIP_CHECK(hipMemcpy(&Output, d_Output, sizeof(T) * size, hipMemcpyDefault));
|
||||
for (int i = 0; i != warpSize; ++i) {
|
||||
REQUIRE(compareEqual(Output[i], Expected[i]));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void shflDown_3(T* Input, T *Output) {
|
||||
auto tid = threadIdx.x;
|
||||
auto mask = __match_any_sync(AllThreads, tid / 12);
|
||||
int srcLane = ((tid % 12 + 3 >= 12) || (tid % 8 + 3 >= 8)) ? 0 : 3;
|
||||
Output[tid] = __shfl_down_sync(mask, Input[tid], srcLane, 8);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void runTestShflDown_3() {
|
||||
const int size = 64;
|
||||
T Input[size];
|
||||
T Output[size];
|
||||
int Values[size] = {3, 4, 5, -6, 7,
|
||||
5, -6, 7, // cannot cross 8
|
||||
11, -9, 10, 11, // cannot cross 12
|
||||
15,
|
||||
13, -14, 15, // cannot cross 8
|
||||
19, 20, -21, 22,
|
||||
23, -21, 22, 23, // canot cross 12
|
||||
// pattern repeats
|
||||
-27, 28, 29, 30, 31,
|
||||
29, 30, 31,
|
||||
35, 33, 34, 35,
|
||||
-39,
|
||||
37, 38, -39,
|
||||
43, -44, -45, 46,
|
||||
47, -45, 46, 47,
|
||||
// pattern repeats
|
||||
-51, 52, 53, -54, 55,
|
||||
53, -54, 55,
|
||||
59, 57, -58, 59,
|
||||
-63,
|
||||
61, 62, -63};
|
||||
T Expected[size];
|
||||
|
||||
initializeInput(Input, size);
|
||||
initializeExpected(Expected, Values, size);
|
||||
|
||||
int warpSize = getWarpSize();
|
||||
|
||||
T* d_Input;
|
||||
T* d_Output;
|
||||
HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size));
|
||||
HIP_CHECK(hipMalloc(&d_Output, sizeof(T) * size));
|
||||
|
||||
HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault));
|
||||
hipLaunchKernelGGL(shflDown_3<T>, 1, warpSize, 0, 0, d_Input, d_Output);
|
||||
|
||||
HIP_CHECK(hipMemcpy(&Output, d_Output, sizeof(T) * size, hipMemcpyDefault));
|
||||
for (int i = 0; i != warpSize; ++i) {
|
||||
REQUIRE(compareEqual(Output[i], Expected[i]));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @addtogroup __shfl_sync
|
||||
* @{
|
||||
* @ingroup ShflSyncTest
|
||||
* `T __shfl_down_sync(unsigned long long mask, T var, int delta, int width=warpSize)` -
|
||||
* Contains warp __shfl sync functions.
|
||||
* @}
|
||||
*/
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Test case to verify __shfl_down_sync warp functions for different datatypes.
|
||||
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - catch/unit/kernel/hipShflSyncDownTests.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.6
|
||||
*/
|
||||
|
||||
TEST_CASE("Unit_hipShflSync_Down") {
|
||||
SECTION("run test for short") {
|
||||
runTestShflDown_1<short>();
|
||||
runTestShflDown_2<short>();
|
||||
runTestShflDown_3<short>();
|
||||
}
|
||||
SECTION("run test for unsigned short") {
|
||||
runTestShflDown_1<unsigned short>();
|
||||
runTestShflDown_2<unsigned short>();
|
||||
runTestShflDown_3<unsigned short>();
|
||||
}
|
||||
SECTION("run test for int") {
|
||||
runTestShflDown_1<int>();
|
||||
runTestShflDown_2<int>();
|
||||
runTestShflDown_3<int>();
|
||||
}
|
||||
SECTION("run test for unsigned int") {
|
||||
runTestShflDown_1<unsigned int>();
|
||||
runTestShflDown_2<unsigned int>();
|
||||
runTestShflDown_3<unsigned int>();
|
||||
}
|
||||
SECTION("run test for long") {
|
||||
runTestShflDown_1<long>();
|
||||
runTestShflDown_2<long>();
|
||||
runTestShflDown_3<long>();
|
||||
}
|
||||
SECTION("run test for unsigned long") {
|
||||
runTestShflDown_1<unsigned long>();
|
||||
runTestShflDown_2<unsigned long>();
|
||||
runTestShflDown_3<unsigned long>();
|
||||
}
|
||||
SECTION("run test for long long") {
|
||||
runTestShflDown_1<long long>();
|
||||
runTestShflDown_2<long long>();
|
||||
runTestShflDown_3<long long>();
|
||||
}
|
||||
SECTION("run test for unsigned long long") {
|
||||
runTestShflDown_1<unsigned long long>();
|
||||
runTestShflDown_2<unsigned long long>();
|
||||
runTestShflDown_3<unsigned long long>();
|
||||
}
|
||||
SECTION("run test for float") {
|
||||
runTestShflDown_1<float>();
|
||||
runTestShflDown_2<float>();
|
||||
runTestShflDown_3<float>();
|
||||
}
|
||||
SECTION("run test for double") {
|
||||
runTestShflDown_1<double>();
|
||||
runTestShflDown_2<double>();
|
||||
runTestShflDown_3<double>();
|
||||
}
|
||||
SECTION("run test for __half") {
|
||||
runTestShflDown_1<__half>();
|
||||
runTestShflDown_2<__half>();
|
||||
runTestShflDown_3<__half>();
|
||||
}
|
||||
SECTION("run test for __half2") {
|
||||
runTestShflDown_1<__half2>();
|
||||
runTestShflDown_2<__half2>();
|
||||
runTestShflDown_3<__half2>();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,176 @@
|
||||
/*
|
||||
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "warp_common.hh"
|
||||
#include <hip_test_common.hh>
|
||||
|
||||
template <typename T>
|
||||
__global__ void shfl_1(T *Input, T *Output) {
|
||||
int tid = threadIdx.x;
|
||||
// Creates groups consisting of every fourth thread.
|
||||
auto mask = __match_any_sync(AllThreads, tid % 4);
|
||||
int srcLane = tid % 4;
|
||||
|
||||
// Each group reads from the first active thread within that group.
|
||||
Output[tid] = __shfl_sync(mask, Input[tid], srcLane);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void runTestShfl_1() {
|
||||
const int size = 64;
|
||||
T Input[size];
|
||||
T Output[size];
|
||||
T Expected[size];
|
||||
int Values[size] = {0, -1, 2, 3, 0, -1, 2, 3,
|
||||
0, -1, 2, 3, 0, -1, 2, 3,
|
||||
0, -1, 2, 3, 0, -1, 2, 3,
|
||||
0, -1, 2, 3, 0, -1, 2, 3,
|
||||
0, -1, 2, 3, 0, -1, 2, 3,
|
||||
0, -1, 2, 3, 0, -1, 2, 3,
|
||||
0, -1, 2, 3, 0, -1, 2, 3,
|
||||
0, -1, 2, 3, 0, -1, 2, 3};
|
||||
|
||||
initializeInput(Input, size);
|
||||
initializeExpected(Expected, Values, size);
|
||||
|
||||
int warpSize = getWarpSize();
|
||||
|
||||
T* d_Input;
|
||||
T* d_Output;
|
||||
HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size));
|
||||
HIP_CHECK(hipMalloc(&d_Output, sizeof(T) * size));
|
||||
|
||||
HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault));
|
||||
hipLaunchKernelGGL(shfl_1<T>, 1, warpSize, 0, 0, d_Input, d_Output);
|
||||
|
||||
HIP_CHECK(hipMemcpy(&Output, d_Output, sizeof(T) * size, hipMemcpyDefault));
|
||||
for (int i = 0; i != warpSize; ++i) {
|
||||
REQUIRE(compareEqual(Output[i], Expected[i]));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void shfl_2(T *Input, T *Output) {
|
||||
int tid = threadIdx.x;
|
||||
auto mask = __match_any_sync(AllThreads, tid % 4);
|
||||
int srcLane = tid % 4;
|
||||
|
||||
// Each subgroup of eight reads from the first active thread within that
|
||||
// subgroup.
|
||||
Output[tid] = __shfl_sync(mask, Input[tid], srcLane, 8);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void runTestShfl_2() {
|
||||
const int size = 64;
|
||||
T Input[size];
|
||||
T Output[size];
|
||||
T Expected[size];
|
||||
int Values[size] = {0, -1, 2, 3, 0, -1, 2, 3,
|
||||
8, -9, 10, 11, 8, -9, 10, 11,
|
||||
16, 17, -18, 19, 16, 17, -18, 19,
|
||||
24, 25, 26, -27, 24, 25, 26, -27,
|
||||
-32, 33, 34, 35, -32, 33, 34, 35,
|
||||
40, 41, 42, 43, 40, 41, 42, 43,
|
||||
48, 49, 50, -51, 48, 49, 50, -51,
|
||||
56, 57, -58, 59, 56, 57, -58, 59};
|
||||
|
||||
initializeInput(Input, size);
|
||||
initializeExpected(Expected, Values, size);
|
||||
|
||||
int warpSize = getWarpSize();
|
||||
|
||||
T* d_Input;
|
||||
T* d_Output;
|
||||
HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size));
|
||||
HIP_CHECK(hipMalloc(&d_Output, sizeof(T) * size));
|
||||
|
||||
HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault));
|
||||
hipLaunchKernelGGL(shfl_2<T>, 1, warpSize, 0, 0, d_Input, d_Output);
|
||||
|
||||
HIP_CHECK(hipMemcpy(&Output, d_Output, sizeof(T) * size, hipMemcpyDefault));
|
||||
for (int i = 0; i != warpSize; ++i) {
|
||||
REQUIRE(compareEqual(Output[i], Expected[i]));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @addtogroup __shfl_sync
|
||||
* @{
|
||||
* @ingroup ShflSyncTest
|
||||
* `T __shfl_sync(unsigned long long mask, T var, int srcLane, int width=warpSize)` -
|
||||
* Contains warp __shfl sync functions.
|
||||
* @}
|
||||
*/
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Test case to verify __shfl_sync warp functions for different datatypes.
|
||||
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - catch/unit/kernel/hipShflSyncTests.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.6
|
||||
*/
|
||||
|
||||
TEST_CASE("Unit_hipShflSync") {
|
||||
SECTION("run test for short") {
|
||||
runTestShfl_1<short>();
|
||||
runTestShfl_2<short>();
|
||||
}
|
||||
SECTION("run test for unsigned short") {
|
||||
runTestShfl_1<unsigned short>();
|
||||
runTestShfl_2<unsigned short>();
|
||||
}
|
||||
SECTION("run test for int") {
|
||||
runTestShfl_1<int>();
|
||||
runTestShfl_2<int>();
|
||||
}
|
||||
SECTION("run test for unsigned int") {
|
||||
runTestShfl_1<unsigned int>();
|
||||
runTestShfl_2<unsigned int>();
|
||||
}
|
||||
SECTION("run test for long") {
|
||||
runTestShfl_1<long>();
|
||||
runTestShfl_2<long>();
|
||||
}
|
||||
SECTION("run test for unsigned long") {
|
||||
runTestShfl_1<unsigned long>();
|
||||
runTestShfl_2<unsigned long>();
|
||||
}
|
||||
SECTION("run test for long long") {
|
||||
runTestShfl_1<long long>();
|
||||
runTestShfl_2<long long>();
|
||||
}
|
||||
SECTION("run test for unsigned long long") {
|
||||
runTestShfl_1<unsigned long long>();
|
||||
runTestShfl_2<unsigned long long>();
|
||||
}
|
||||
SECTION("run test for float") {
|
||||
runTestShfl_1<float>();
|
||||
runTestShfl_2<float>();
|
||||
}
|
||||
SECTION("run test for double") {
|
||||
runTestShfl_1<double>();
|
||||
runTestShfl_2<double>();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,243 @@
|
||||
/*
|
||||
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "warp_common.hh"
|
||||
#include <hip_test_common.hh>
|
||||
|
||||
template <typename T>
|
||||
__global__ void shflUp_1(T* Input, T *Output) {
|
||||
auto tid = threadIdx.x;
|
||||
int srcLane = (tid > 3) ? 3 : 0;
|
||||
Output[tid] = __shfl_up_sync(AllThreads, Input[tid], srcLane);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void runTestShflUp_1() {
|
||||
const int size = 64;
|
||||
T Input[size];
|
||||
T Output[size];
|
||||
T Expected[size];
|
||||
int Values[] = {0, -1, 2, 3, -1, 2, 3, 4, 5, -6, 7, 8,
|
||||
-9, 10, 11, 12, 13, -14, 15, 16, 17, -18, 19, 20,
|
||||
-21, 22, 23, 24, 25, 26, -27, 28, 29, 30, 31, -32,
|
||||
33, 34, 35, -36, 37, 38, -39, 40, 41, 42, 43, -44,
|
||||
-45, 46, 47, 48, 49, 50, -51, 52, 53, -54, 55, 56,
|
||||
57, -58, 59, 60};
|
||||
|
||||
initializeInput(Input, size);
|
||||
initializeExpected(Expected, Values, size);
|
||||
|
||||
int warpSize = getWarpSize();
|
||||
|
||||
T* d_Input;
|
||||
T* d_Output;
|
||||
HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size));
|
||||
HIP_CHECK(hipMalloc(&d_Output, sizeof(T) * size));
|
||||
|
||||
HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault));
|
||||
hipLaunchKernelGGL(shflUp_1<T>, 1, warpSize, 0, 0, d_Input, d_Output);
|
||||
|
||||
HIP_CHECK(hipMemcpy(&Output, d_Output, sizeof(T) * size, hipMemcpyDefault));
|
||||
for (int i = 0; i != warpSize; ++i) {
|
||||
REQUIRE(compareEqual(Output[i], Expected[i]));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void shflUp_2(T* Input, T *Output) {
|
||||
auto tid = threadIdx.x;
|
||||
auto mask = __match_any_sync(AllThreads, tid / 12);
|
||||
int srcLane = (tid % 12) < 3 ? 0 : 3;
|
||||
Output[tid] = __shfl_up_sync(mask, Input[tid], srcLane);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void runTestShflUp_2() {
|
||||
const int size = 64;
|
||||
T Input[size];
|
||||
T Output[size];
|
||||
T Expected[size];
|
||||
int Values[size] = {0, -1, 2, 0, -1, 2, 3, 4, 5, -6, 7, 8,
|
||||
12, 13, -14, 12, 13, -14, 15, 16, 17, -18, 19, 20,
|
||||
24, 25, 26, 24, 25, 26, -27, 28, 29, 30, 31, -32,
|
||||
-36, 37, 38, -36, 37, 38, -39, 40, 41, 42, 43, -44,
|
||||
48, 49, 50, 48, 49, 50, -51, 52, 53, -54, 55, 56,
|
||||
60, 61, 62, 60};
|
||||
|
||||
initializeInput(Input, size);
|
||||
initializeExpected(Expected, Values, size);
|
||||
|
||||
int warpSize = getWarpSize();
|
||||
|
||||
T* d_Input;
|
||||
T* d_Output;
|
||||
HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size));
|
||||
HIP_CHECK(hipMalloc(&d_Output, sizeof(T) * size));
|
||||
|
||||
HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault));
|
||||
hipLaunchKernelGGL(shflUp_2<T>, 1, warpSize, 0, 0, d_Input, d_Output);
|
||||
|
||||
HIP_CHECK(hipMemcpy(&Output, d_Output, sizeof(T) * size, hipMemcpyDefault));
|
||||
for (int i = 0; i != warpSize; ++i) {
|
||||
REQUIRE(compareEqual(Output[i], Expected[i]));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void shflUp_3(T* Input, T *Output) {
|
||||
auto tid = threadIdx.x;
|
||||
auto mask = __match_any_sync(AllThreads, tid / 12);
|
||||
int srcLane = (tid % 12) < 3 ? 0 : 3;
|
||||
Output[tid] = __shfl_up_sync(mask, Input[tid], srcLane, 8);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void runTestShflUp_3() {
|
||||
const int size = 64;
|
||||
T Input[size];
|
||||
T Output[size];
|
||||
T Expected[size];
|
||||
int Values[size] = {0, -1, 2, // cannot cross mod-12
|
||||
0, -1, 2, 3, 4,
|
||||
8, -9, 10, // cannot cross mod-8
|
||||
8,
|
||||
12, 13, -14, // cannot cross mod-12
|
||||
12,
|
||||
16, 17, -18, // cannot cross mod-8
|
||||
16, 17, -18, 19, 20,
|
||||
// pattern repeats
|
||||
24, 25, 26,
|
||||
24, 25, 26, -27, 28,
|
||||
-32, 33, 34,
|
||||
-32,
|
||||
-36, 37, 38,
|
||||
-36,
|
||||
40, 41, 42,
|
||||
40, 41, 42, 43, -44,
|
||||
// pattern repeats
|
||||
48, 49, 50,
|
||||
48, 49, 50, -51, 52,
|
||||
56, 57, -58,
|
||||
56,
|
||||
60, 61, 62,
|
||||
60};
|
||||
|
||||
initializeInput(Input, size);
|
||||
initializeExpected(Expected, Values, size);
|
||||
|
||||
int warpSize = getWarpSize();
|
||||
|
||||
T* d_Input;
|
||||
T* d_Output;
|
||||
HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size));
|
||||
HIP_CHECK(hipMalloc(&d_Output, sizeof(T) * size));
|
||||
|
||||
HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault));
|
||||
hipLaunchKernelGGL(shflUp_3<T>, 1, warpSize, 0, 0, d_Input, d_Output);
|
||||
|
||||
HIP_CHECK(hipMemcpy(&Output, d_Output, sizeof(T) * size, hipMemcpyDefault));
|
||||
for (int i = 0; i != warpSize; ++i) {
|
||||
REQUIRE(compareEqual(Output[i], Expected[i]));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @addtogroup __shfl_sync
|
||||
* @{
|
||||
* @ingroup ShflSyncTest
|
||||
* `T __shfl_up_sync(unsigned long long mask, T var, int delta, int width=warpSize)` -
|
||||
* Contains warp __shfl sync functions.
|
||||
* @}
|
||||
*/
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Test case to verify __shfl_up_sync warp functions for different datatypes.
|
||||
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - catch/unit/kernel/hipShflSyncUpTests.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.6
|
||||
*/
|
||||
|
||||
TEST_CASE("Unit_hipShflSync_Up") {
|
||||
SECTION("run test for short") {
|
||||
runTestShflUp_1<short>();
|
||||
runTestShflUp_2<short>();
|
||||
runTestShflUp_3<short>();
|
||||
}
|
||||
SECTION("run test for unsigned short") {
|
||||
runTestShflUp_1<unsigned short>();
|
||||
runTestShflUp_2<unsigned short>();
|
||||
runTestShflUp_3<unsigned short>();
|
||||
}
|
||||
SECTION("run test for int") {
|
||||
runTestShflUp_1<int>();
|
||||
runTestShflUp_2<int>();
|
||||
runTestShflUp_3<int>();
|
||||
}
|
||||
SECTION("run test for unsigned int") {
|
||||
runTestShflUp_1<unsigned int>();
|
||||
runTestShflUp_2<unsigned int>();
|
||||
runTestShflUp_3<unsigned int>();
|
||||
}
|
||||
SECTION("run test for long") {
|
||||
runTestShflUp_1<long>();
|
||||
runTestShflUp_2<long>();
|
||||
runTestShflUp_3<long>();
|
||||
}
|
||||
SECTION("run test for unsigned long") {
|
||||
runTestShflUp_1<unsigned long>();
|
||||
runTestShflUp_2<unsigned long>();
|
||||
runTestShflUp_3<unsigned long>();
|
||||
}
|
||||
SECTION("run test for long long") {
|
||||
runTestShflUp_1<long long>();
|
||||
runTestShflUp_2<long long>();
|
||||
runTestShflUp_3<long long>();
|
||||
}
|
||||
SECTION("run test for unsigned long long") {
|
||||
runTestShflUp_1<unsigned long long>();
|
||||
runTestShflUp_2<unsigned long long>();
|
||||
runTestShflUp_3<unsigned long long>();
|
||||
}
|
||||
SECTION("run test for float") {
|
||||
runTestShflUp_1<float>();
|
||||
runTestShflUp_2<float>();
|
||||
runTestShflUp_3<float>();
|
||||
}
|
||||
SECTION("run test for double") {
|
||||
runTestShflUp_1<double>();
|
||||
runTestShflUp_2<double>();
|
||||
runTestShflUp_3<double>();
|
||||
}
|
||||
SECTION("run test for __half") {
|
||||
runTestShflUp_1<__half>();
|
||||
runTestShflUp_2<__half>();
|
||||
runTestShflUp_3<__half>();
|
||||
}
|
||||
SECTION("run test for __half2") {
|
||||
runTestShflUp_1<__half2>();
|
||||
runTestShflUp_2<__half2>();
|
||||
runTestShflUp_3<__half2>();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,234 @@
|
||||
/*
|
||||
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "warp_common.hh"
|
||||
#include <hip_test_common.hh>
|
||||
|
||||
template <typename T>
|
||||
__global__ void shflXor_1(T* Input, T *Output) {
|
||||
auto tid = threadIdx.x;
|
||||
Output[tid] = __shfl_xor_sync(AllThreads, Input[tid], 16);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void runTestShflXor_1() {
|
||||
const int size = 64;
|
||||
T Input[size];
|
||||
T Output[size];
|
||||
int Values[size] = {16, 17, -18, 19, 20, -21, 22, 23,
|
||||
24, 25, 26, -27, 28, 29, 30, 31,
|
||||
0, -1, 2, 3, 4, 5, -6, 7,
|
||||
8, -9, 10, 11, 12, 13, -14, 15,
|
||||
48, 49, 50, -51, 52, 53, -54, 55,
|
||||
56, 57, -58, 59, 60, 61, 62, -63,
|
||||
-32, 33, 34, 35, -36, 37, 38, -39,
|
||||
40, 41, 42, 43, -44, -45, 46, 47};
|
||||
T Expected[size];
|
||||
|
||||
initializeInput(Input, size);
|
||||
initializeExpected(Expected, Values, size);
|
||||
|
||||
int warpSize = getWarpSize();
|
||||
|
||||
T* d_Input;
|
||||
T* d_Output;
|
||||
HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size));
|
||||
HIP_CHECK(hipMalloc(&d_Output, sizeof(T) * size));
|
||||
|
||||
HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault));
|
||||
hipLaunchKernelGGL(shflXor_1<T>, 1, warpSize, 0, 0, d_Input, d_Output);
|
||||
|
||||
HIP_CHECK(hipMemcpy(&Output, d_Output, sizeof(T) * size, hipMemcpyDefault));
|
||||
for (int i = 0; i != warpSize; ++i) {
|
||||
REQUIRE(compareEqual(Output[i], Expected[i]));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void shflXor_2(T* Input, T *Output) {
|
||||
unsigned tid = threadIdx.x;
|
||||
auto mask = __match_any_sync(AllThreads, tid / 12);
|
||||
int laneMask = 4;
|
||||
int section = tid % 24;
|
||||
if (section > 7 && section < 16)
|
||||
laneMask = 0;
|
||||
Output[tid] = __shfl_xor_sync(mask, Input[tid], laneMask);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void runTestShflXor_2() {
|
||||
const int size = 64;
|
||||
T Input[size];
|
||||
T Output[size];
|
||||
int Values[size] = {4, 5, -6, 7, 0, -1, 2, 3,
|
||||
8, -9, 10, 11, 12, 13, -14, 15, // disabled around mid mod-24
|
||||
20, -21, 22, 23, 16, 17, -18, 19,
|
||||
28, 29, 30, 31, 24, 25, 26, -27,
|
||||
-32, 33, 34, 35, -36, 37, 38, -39, // disabled around mid mod-24
|
||||
-44, -45, 46, 47, 40, 41, 42, 43,
|
||||
52, 53, -54, 55, 48, 49, 50, -51,
|
||||
56, 57, -58, 59, 60, 61, 62, -63}; // disabled around mid mod-24
|
||||
T Expected[size];
|
||||
|
||||
initializeInput(Input, size);
|
||||
initializeExpected(Expected, Values, size);
|
||||
|
||||
|
||||
int warpSize = getWarpSize();
|
||||
|
||||
T* d_Input;
|
||||
T* d_Output;
|
||||
HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size));
|
||||
HIP_CHECK(hipMalloc(&d_Output, sizeof(T) * size));
|
||||
|
||||
HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault));
|
||||
hipLaunchKernelGGL(shflXor_2<T>, 1, warpSize, 0, 0, d_Input, d_Output);
|
||||
|
||||
HIP_CHECK(hipMemcpy(&Output, d_Output, sizeof(T) * size, hipMemcpyDefault));
|
||||
for (int i = 0; i != warpSize; ++i) {
|
||||
REQUIRE(compareEqual(Output[i], Expected[i]));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void shflXor_3(T* Input, T *Output) {
|
||||
auto tid = threadIdx.x;
|
||||
auto mask = __match_any_sync(AllThreads, tid / 16);
|
||||
Output[tid] = __shfl_xor_sync(mask, Input[tid], 4, 8);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void runTestShflXor_3() {
|
||||
const int size = 64;
|
||||
T Input[size];
|
||||
T Output[size];
|
||||
int Values[size] = {4, 5, -6, 7, 0, -1, 2, 3,
|
||||
12, 13, -14, 15, 8, -9, 10, 11,
|
||||
20, -21, 22, 23, 16, 17, -18, 19,
|
||||
28, 29, 30, 31, 24, 25, 26, -27,
|
||||
-36, 37, 38, -39, -32, 33, 34, 35,
|
||||
-44, -45, 46, 47, 40, 41, 42, 43,
|
||||
52, 53, -54, 55, 48, 49, 50, -51,
|
||||
60, 61, 62, -63, 56, 57, -58, 59};
|
||||
T Expected[size];
|
||||
|
||||
initializeInput(Input, size);
|
||||
initializeExpected(Expected, Values, size);
|
||||
|
||||
|
||||
int warpSize = getWarpSize();
|
||||
|
||||
T* d_Input;
|
||||
T* d_Output;
|
||||
HIP_CHECK(hipMalloc(&d_Input, sizeof(T) * size));
|
||||
HIP_CHECK(hipMalloc(&d_Output, sizeof(T) * size));
|
||||
|
||||
HIP_CHECK(hipMemcpy(d_Input, &Input, sizeof(T) * size, hipMemcpyDefault));
|
||||
hipLaunchKernelGGL(shflXor_3<T>, 1, warpSize, 0, 0, d_Input, d_Output);
|
||||
|
||||
HIP_CHECK(hipMemcpy(&Output, d_Output, sizeof(T) * size, hipMemcpyDefault));
|
||||
for (int i = 0; i != warpSize; ++i) {
|
||||
REQUIRE(compareEqual(Output[i], Expected[i]));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @addtogroup __shfl_sync
|
||||
* @{
|
||||
* @ingroup ShflSyncTest
|
||||
* `T __shfl_xor_sync(unsigned long long mask, T var, unsigned laneMask, int width=warpSize)` -
|
||||
* Contains warp __shfl sync functions.
|
||||
* @}
|
||||
*/
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Test case to verify __shfl_xor_sync warp functions for different datatypes.
|
||||
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - catch/unit/kernel/hipShflSyncXorTests.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.6
|
||||
*/
|
||||
|
||||
TEST_CASE("Unit_hipShflSync_Xor") {
|
||||
SECTION("run test for short") {
|
||||
runTestShflXor_1<short>();
|
||||
runTestShflXor_2<short>();
|
||||
runTestShflXor_3<short>();
|
||||
}
|
||||
SECTION("run test for unsigned short") {
|
||||
runTestShflXor_1<unsigned short>();
|
||||
runTestShflXor_2<unsigned short>();
|
||||
runTestShflXor_3<unsigned short>();
|
||||
}
|
||||
SECTION("run test for int") {
|
||||
runTestShflXor_1<int>();
|
||||
runTestShflXor_2<int>();
|
||||
runTestShflXor_3<int>();
|
||||
}
|
||||
SECTION("run test for unsigned int") {
|
||||
runTestShflXor_1<unsigned int>();
|
||||
runTestShflXor_2<unsigned int>();
|
||||
runTestShflXor_3<unsigned int>();
|
||||
}
|
||||
SECTION("run test for long") {
|
||||
runTestShflXor_1<long>();
|
||||
runTestShflXor_2<long>();
|
||||
runTestShflXor_3<long>();
|
||||
}
|
||||
SECTION("run test for unsigned long") {
|
||||
runTestShflXor_1<unsigned long>();
|
||||
runTestShflXor_2<unsigned long>();
|
||||
runTestShflXor_3<unsigned long>();
|
||||
}
|
||||
SECTION("run test for long long") {
|
||||
runTestShflXor_1<long long>();
|
||||
runTestShflXor_2<long long>();
|
||||
runTestShflXor_3<long long>();
|
||||
}
|
||||
SECTION("run test for unsigned long long") {
|
||||
runTestShflXor_1<unsigned long long>();
|
||||
runTestShflXor_2<unsigned long long>();
|
||||
runTestShflXor_3<unsigned long long>();
|
||||
}
|
||||
SECTION("run test for float") {
|
||||
runTestShflXor_1<float>();
|
||||
runTestShflXor_2<float>();
|
||||
runTestShflXor_3<float>();
|
||||
}
|
||||
SECTION("run test for double") {
|
||||
runTestShflXor_1<double>();
|
||||
runTestShflXor_2<double>();
|
||||
runTestShflXor_3<double>();
|
||||
}
|
||||
SECTION("run test for __half") {
|
||||
runTestShflXor_1<__half>();
|
||||
runTestShflXor_2<__half>();
|
||||
runTestShflXor_3<__half>();
|
||||
}
|
||||
SECTION("run test for __half2") {
|
||||
runTestShflXor_1<__half2>();
|
||||
runTestShflXor_2<__half2>();
|
||||
runTestShflXor_3<__half2>();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,595 @@
|
||||
/*
|
||||
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "warp_common.hh"
|
||||
#include <hip_test_common.hh>
|
||||
|
||||
__global__ void any_1(int *Input, int *Output) {
|
||||
auto tid = threadIdx.x;
|
||||
Output[tid] = __any_sync(AllThreads, Input[tid]);
|
||||
}
|
||||
|
||||
static void runTestAny_1() {
|
||||
const int size = 64;
|
||||
int Input[size] = {0, };
|
||||
int Output[size];
|
||||
int Expected[size] = {0, };
|
||||
|
||||
int* d_Input;
|
||||
int* d_Output;
|
||||
HIP_CHECK(hipMalloc(&d_Input, 4 * size));
|
||||
HIP_CHECK(hipMalloc(&d_Output, 4 * size));
|
||||
|
||||
int warpSize = getWarpSize();
|
||||
|
||||
HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault));
|
||||
hipLaunchKernelGGL(any_1, 1, warpSize, 0, 0, d_Input, d_Output);
|
||||
|
||||
HIP_CHECK(hipMemcpy(&Output, d_Output, 4 * size, hipMemcpyDefault));
|
||||
for (int i = 0; i != warpSize; ++i) {
|
||||
REQUIRE(Output[i] == Expected[i]);
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void any_2(int *Input, int *Output) {
|
||||
auto tid = threadIdx.x;
|
||||
Output[tid] = __any_sync(AllThreads, Input[tid]);
|
||||
}
|
||||
|
||||
static void runTestAny_2_w64() {
|
||||
const int size = 64;
|
||||
int Input[size] = {0, };
|
||||
int Output[size];
|
||||
int Expected[size] = {0, };
|
||||
|
||||
Input[60] = 1;
|
||||
|
||||
int warpSize = getWarpSize();
|
||||
if (warpSize == 64)
|
||||
std::fill_n(Expected, size, 1);
|
||||
|
||||
int* d_Input;
|
||||
int* d_Output;
|
||||
HIP_CHECK(hipMalloc(&d_Input, 4 * size));
|
||||
HIP_CHECK(hipMalloc(&d_Output, 4 * size));
|
||||
|
||||
HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault));
|
||||
hipLaunchKernelGGL(any_2, 1, warpSize, 0, 0, d_Input, d_Output);
|
||||
|
||||
HIP_CHECK(hipMemcpy(&Output, d_Output, 4 * size, hipMemcpyDefault));
|
||||
for (int i = 0; i != warpSize; ++i) {
|
||||
REQUIRE(Output[i] == Expected[i]);
|
||||
}
|
||||
}
|
||||
|
||||
static void runTestAny_2_w32() {
|
||||
const int size = 64;
|
||||
int Input[size] = {0, };
|
||||
int Output[size];
|
||||
int Expected[size] = {0, };
|
||||
|
||||
Input[30] = 1;
|
||||
std::fill_n(Expected, size, 1);
|
||||
|
||||
int* d_Input;
|
||||
int* d_Output;
|
||||
HIP_CHECK(hipMalloc(&d_Input, 4 * size));
|
||||
HIP_CHECK(hipMalloc(&d_Output, 4 * size));
|
||||
|
||||
int warpSize = getWarpSize();
|
||||
|
||||
HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault));
|
||||
hipLaunchKernelGGL(any_2, 1, warpSize, 0, 0, d_Input, d_Output);
|
||||
|
||||
HIP_CHECK(hipMemcpy(&Output, d_Output, 4 * size, hipMemcpyDefault));
|
||||
for (int i = 0; i != warpSize; ++i) {
|
||||
REQUIRE(Output[i] == Expected[i]);
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void any_3(int *Input, int *Output) {
|
||||
auto tid = threadIdx.x;
|
||||
auto mask = __match_any_sync(AllThreads, tid/12);
|
||||
Output[tid] = __any_sync(mask, Input[tid]);
|
||||
}
|
||||
|
||||
static void runTestAny_3() {
|
||||
const int size = 64;
|
||||
int Input[size] = {0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 1, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 1, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 1, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 1, 0, 0};
|
||||
|
||||
int Output[size];
|
||||
int Expected[size] = {0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0,
|
||||
1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0,
|
||||
1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1,
|
||||
1, 1, 1, 1};
|
||||
int* d_Input;
|
||||
int* d_Output;
|
||||
HIP_CHECK(hipMalloc(&d_Input, 4 * size));
|
||||
HIP_CHECK(hipMalloc(&d_Output, 4 * size));
|
||||
|
||||
int warpSize = getWarpSize();
|
||||
|
||||
HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault));
|
||||
hipLaunchKernelGGL(any_3, 1, warpSize, 0, 0, d_Input, d_Output);
|
||||
|
||||
HIP_CHECK(hipMemcpy(&Output, d_Output, 4 * size, hipMemcpyDefault));
|
||||
for (int i = 0; i != warpSize; ++i) {
|
||||
REQUIRE(Output[i] == Expected[i]);
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void any_4(int *Input, int *Output) {
|
||||
auto tid = threadIdx.x;
|
||||
unsigned long long masks[2] = { Every5thBut9th, Every9thBit };
|
||||
|
||||
Output[tid] = -1;
|
||||
if (tid % 5 == 0 || tid % 9 == 0)
|
||||
Output[tid] = __any_sync(masks[tid % 9 == 0], Input[tid]);
|
||||
}
|
||||
|
||||
static void runTestAny_4() {
|
||||
const int size = 64;
|
||||
int Input[size] = {0, };
|
||||
Input[5] = 1;
|
||||
|
||||
int Output[size];
|
||||
int Expected[size];
|
||||
|
||||
for (int i = 0; i != size; ++i) {
|
||||
if (i % 9 == 0) {
|
||||
Expected[i] = 0;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (i % 5 == 0) {
|
||||
Expected[i] = 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
Expected[i] = -1;
|
||||
}
|
||||
|
||||
int* d_Input;
|
||||
int* d_Output;
|
||||
HIP_CHECK(hipMalloc(&d_Input, 4 * size));
|
||||
HIP_CHECK(hipMalloc(&d_Output, 4 * size));
|
||||
|
||||
int warpSize = getWarpSize();
|
||||
|
||||
HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault));
|
||||
hipLaunchKernelGGL(any_4, 1, warpSize, 0, 0, d_Input, d_Output);
|
||||
|
||||
HIP_CHECK(hipMemcpy(&Output, d_Output, 4 * size, hipMemcpyDefault));
|
||||
for (int i = 0; i != warpSize; ++i) {
|
||||
REQUIRE(Output[i] == Expected[i]);
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void all_1(int *Input, int *Output) {
|
||||
auto tid = threadIdx.x;
|
||||
Output[tid] = __all_sync(AllThreads, Input[tid]);
|
||||
}
|
||||
|
||||
static void runTestAll_1_w64() {
|
||||
const int size = 64;
|
||||
int Input[size] = {1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 0, 1, 1};
|
||||
|
||||
int Output[size];
|
||||
int Expected[size] = {0, };
|
||||
|
||||
int warpSize = getWarpSize();
|
||||
if (warpSize == 32)
|
||||
std::fill_n(Expected, size, 1);
|
||||
|
||||
int* d_Input;
|
||||
int* d_Output;
|
||||
HIP_CHECK(hipMalloc(&d_Input, 4 * size));
|
||||
HIP_CHECK(hipMalloc(&d_Output, 4 * size));
|
||||
|
||||
HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault));
|
||||
hipLaunchKernelGGL(all_1, 1, warpSize, 0, 0, d_Input, d_Output);
|
||||
|
||||
HIP_CHECK(hipMemcpy(&Output, d_Output, 4 * size, hipMemcpyDefault));
|
||||
for (int i = 0; i != warpSize; ++i) {
|
||||
REQUIRE(Output[i] == Expected[i]);
|
||||
}
|
||||
}
|
||||
|
||||
static void runTestAll_1_w32() {
|
||||
const int size = 64;
|
||||
int Input[size] = {1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 0, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1};
|
||||
|
||||
int Output[size];
|
||||
int Expected[size] = {0, };
|
||||
|
||||
int warpSize = getWarpSize();
|
||||
|
||||
int* d_Input;
|
||||
int* d_Output;
|
||||
HIP_CHECK(hipMalloc(&d_Input, 4 * size));
|
||||
HIP_CHECK(hipMalloc(&d_Output, 4 * size));
|
||||
|
||||
HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault));
|
||||
hipLaunchKernelGGL(all_1, 1, warpSize, 0, 0, d_Input, d_Output);
|
||||
|
||||
HIP_CHECK(hipMemcpy(&Output, d_Output, 4 * size, hipMemcpyDefault));
|
||||
for (int i = 0; i != warpSize; ++i) {
|
||||
REQUIRE(Output[i] == Expected[i]);
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void all_2(int *Input, int *Output) {
|
||||
auto tid = threadIdx.x;
|
||||
Output[tid] = __all_sync(AllThreads, Input[tid]);
|
||||
}
|
||||
|
||||
static void runTestAll_2() {
|
||||
const int size = 64;
|
||||
int Input[size] = {1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1};
|
||||
|
||||
int Output[size];
|
||||
int Expected[size] = {1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1};
|
||||
|
||||
int* d_Input;
|
||||
int* d_Output;
|
||||
HIP_CHECK(hipMalloc(&d_Input, 4 * size));
|
||||
HIP_CHECK(hipMalloc(&d_Output, 4 * size));
|
||||
|
||||
int warpSize = getWarpSize();
|
||||
|
||||
HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault));
|
||||
hipLaunchKernelGGL(all_2, 1, warpSize, 0, 0, d_Input, d_Output);
|
||||
|
||||
HIP_CHECK(hipMemcpy(&Output, d_Output, 4 * size, hipMemcpyDefault));
|
||||
for (int i = 0; i != warpSize; ++i) {
|
||||
REQUIRE(Output[i] == Expected[i]);
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void all_3(int *Input, int *Output) {
|
||||
auto tid = threadIdx.x;
|
||||
auto mask = __match_any_sync(AllThreads, tid/12);
|
||||
Output[tid] = __all_sync(mask, Input[tid]);
|
||||
}
|
||||
|
||||
static void runTestAll_3() {
|
||||
const int size = 64;
|
||||
int Input[size] = {1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 0, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 0, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 0, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 0, 1, 1};
|
||||
|
||||
int Output[size];
|
||||
int Expected[size] = {1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0,
|
||||
1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0,
|
||||
0, 0, 0, 0};
|
||||
int* d_Input;
|
||||
int* d_Output;
|
||||
HIP_CHECK(hipMalloc(&d_Input, 4 * size));
|
||||
HIP_CHECK(hipMalloc(&d_Output, 4 * size));
|
||||
|
||||
int warpSize = getWarpSize();
|
||||
|
||||
HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault));
|
||||
hipLaunchKernelGGL(all_3, 1, warpSize, 0, 0, d_Input, d_Output);
|
||||
|
||||
HIP_CHECK(hipMemcpy(&Output, d_Output, 4 * size, hipMemcpyDefault));
|
||||
for (int i = 0; i != warpSize; ++i) {
|
||||
REQUIRE(Output[i] == Expected[i]);
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void all_4(int *Input, int *Output) {
|
||||
auto tid = threadIdx.x;
|
||||
unsigned long long masks[2] = { Every5thBut9th, Every9thBit };
|
||||
|
||||
Output[tid] = -1;
|
||||
if (tid % 5 == 0 || tid % 9 == 0)
|
||||
Output[tid] = __all_sync(masks[tid % 9 == 0], Input[tid]);
|
||||
}
|
||||
|
||||
static void runTestAll_4() {
|
||||
const int size = 64;
|
||||
int Input[size];
|
||||
std::fill_n(Input, size, 1);
|
||||
Input[5] = 0;
|
||||
|
||||
int Output[size];
|
||||
int Expected[size];
|
||||
|
||||
for (int i = 0; i != size; ++i) {
|
||||
if (i % 9 == 0) {
|
||||
Expected[i] = 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (i % 5 == 0) {
|
||||
Expected[i] = 0;
|
||||
continue;
|
||||
}
|
||||
|
||||
Expected[i] = -1;
|
||||
}
|
||||
|
||||
int* d_Input;
|
||||
int* d_Output;
|
||||
HIP_CHECK(hipMalloc(&d_Input, 4 * size));
|
||||
HIP_CHECK(hipMalloc(&d_Output, 4 * size));
|
||||
|
||||
int warpSize = getWarpSize();
|
||||
|
||||
HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault));
|
||||
hipLaunchKernelGGL(all_4, 1, warpSize, 0, 0, d_Input, d_Output);
|
||||
|
||||
HIP_CHECK(hipMemcpy(&Output, d_Output, 4 * size, hipMemcpyDefault));
|
||||
for (int i = 0; i != warpSize; ++i) {
|
||||
REQUIRE(Output[i] == Expected[i]);
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void ballot_1(int *Input, unsigned long long *Output) {
|
||||
auto tid = threadIdx.x;
|
||||
Output[tid] = __ballot_sync(AllThreads, Input[tid]);
|
||||
}
|
||||
|
||||
static void runTestBallot_1() {
|
||||
const int size = 64;
|
||||
int Input[size] = {0, 1, 0, 0, 1, 1, 1, 0,
|
||||
0, 1, 1, 1, 0, 0, 1, 0,
|
||||
1, 1, 1, 0, 0, 1, 0, 0,
|
||||
1, 0, 0, 1, 0, 0, 1, 1,
|
||||
0, 1, 0, 0, 1, 1, 1, 0,
|
||||
0, 1, 1, 1, 0, 0, 1, 0,
|
||||
1, 1, 1, 0, 0, 1, 0, 0,
|
||||
1, 0, 0, 1, 0, 0, 1, 1};
|
||||
unsigned long long Output[size];
|
||||
unsigned long long Expected[size] = {
|
||||
0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72,
|
||||
0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72,
|
||||
0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72,
|
||||
0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72,
|
||||
0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72,
|
||||
0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72,
|
||||
0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72,
|
||||
0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72,
|
||||
0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72,
|
||||
0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72,
|
||||
0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72,
|
||||
0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72,
|
||||
0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72,
|
||||
0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72,
|
||||
0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72,
|
||||
0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72, 0xc9274e72c9274e72
|
||||
};
|
||||
|
||||
int* d_Input;
|
||||
unsigned long long* d_Output;
|
||||
HIP_CHECK(hipMalloc(&d_Input, 4 * size));
|
||||
HIP_CHECK(hipMalloc(&d_Output, 8 * size));
|
||||
|
||||
int warpSize = getWarpSize();
|
||||
|
||||
HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault));
|
||||
hipLaunchKernelGGL(ballot_1, 1, warpSize, 0, 0, d_Input, d_Output);
|
||||
|
||||
HIP_CHECK(hipMemcpy(&Output, d_Output, 8 * size, hipMemcpyDefault));
|
||||
for (int i = 0; i != warpSize; ++i) {
|
||||
REQUIRE(compareMaskEqual(Output, Expected, i, warpSize));
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void ballot_2(int *Input, unsigned long long *Output) {
|
||||
auto tid = threadIdx.x;
|
||||
auto mask = __match_any_sync(AllThreads, tid / 12);
|
||||
Output[tid] = __ballot_sync(mask, Input[tid]);
|
||||
}
|
||||
|
||||
static void runTestBallot_2() {
|
||||
const int size = 64;
|
||||
int Input[size] = {0, 1, 0, 0, 1, 1, 1, 0,
|
||||
0, 1, 1, 1, 0, 0, 1, 0,
|
||||
1, 1, 1, 0, 0, 1, 0, 0,
|
||||
1, 0, 0, 1, 0, 0, 1, 1,
|
||||
0, 1, 0, 0, 1, 1, 1, 0,
|
||||
0, 1, 1, 1, 0, 0, 1, 0,
|
||||
1, 1, 1, 0, 0, 1, 0, 0,
|
||||
1, 0, 0, 1, 0, 0, 1, 1};
|
||||
unsigned long long Output[size];
|
||||
unsigned long long Expected[size] = {
|
||||
0x0000000000000e72, 0x0000000000000e72, 0x0000000000000e72, 0x0000000000000e72,
|
||||
0x0000000000000e72, 0x0000000000000e72, 0x0000000000000e72, 0x0000000000000e72,
|
||||
0x0000000000000e72, 0x0000000000000e72, 0x0000000000000e72, 0x0000000000000e72,
|
||||
0x0000000000274000, 0x0000000000274000, 0x0000000000274000, 0x0000000000274000,
|
||||
0x0000000000274000, 0x0000000000274000, 0x0000000000274000, 0x0000000000274000,
|
||||
0x0000000000274000, 0x0000000000274000, 0x0000000000274000, 0x0000000000274000,
|
||||
0x00000002c9000000, 0x00000002c9000000, 0x00000002c9000000, 0x00000002c9000000,
|
||||
0x00000002c9000000, 0x00000002c9000000, 0x00000002c9000000, 0x00000002c9000000,
|
||||
0x00000002c9000000, 0x00000002c9000000, 0x00000002c9000000, 0x00000002c9000000,
|
||||
0x00004e7000000000, 0x00004e7000000000, 0x00004e7000000000, 0x00004e7000000000,
|
||||
0x00004e7000000000, 0x00004e7000000000, 0x00004e7000000000, 0x00004e7000000000,
|
||||
0x00004e7000000000, 0x00004e7000000000, 0x00004e7000000000, 0x00004e7000000000,
|
||||
0x0927000000000000, 0x0927000000000000, 0x0927000000000000, 0x0927000000000000,
|
||||
0x0927000000000000, 0x0927000000000000, 0x0927000000000000, 0x0927000000000000,
|
||||
0x0927000000000000, 0x0927000000000000, 0x0927000000000000, 0x0927000000000000,
|
||||
0xc000000000000000, 0xc000000000000000, 0xc000000000000000, 0xc000000000000000};
|
||||
|
||||
int* d_Input;
|
||||
unsigned long long* d_Output;
|
||||
HIP_CHECK(hipMalloc(&d_Input, 4 * size));
|
||||
HIP_CHECK(hipMalloc(&d_Output, 8 * size));
|
||||
|
||||
int warpSize = getWarpSize();
|
||||
|
||||
HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault));
|
||||
hipLaunchKernelGGL(ballot_2, 1, warpSize, 0, 0, d_Input, d_Output);
|
||||
|
||||
HIP_CHECK(hipMemcpy(&Output, d_Output, 8 * size, hipMemcpyDefault));
|
||||
for (int i = 0; i != warpSize; ++i) {
|
||||
REQUIRE(compareMaskEqual(Output, Expected, i, warpSize));
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void ballot_3(int *Input, unsigned long long *Output) {
|
||||
auto tid = threadIdx.x;
|
||||
unsigned long long masks[2] = { Every5thBut9th, Every9thBit };
|
||||
|
||||
Output[tid] = -1;
|
||||
if (tid % 5 == 0 || tid % 9 == 0)
|
||||
Output[tid] = __ballot_sync(masks[tid % 9 == 0], Input[tid]);
|
||||
}
|
||||
|
||||
static void runTestBallot_3() {
|
||||
const int size = 64;
|
||||
int Input[size];
|
||||
std::fill_n(Input, size, 1);
|
||||
|
||||
unsigned long long Output[size];
|
||||
unsigned long long Expected[size];
|
||||
|
||||
for (int i = 0; i != size; ++i) {
|
||||
if (i % 9 == 0) {
|
||||
Expected[i] = Every9thBit;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (i % 5 == 0) {
|
||||
Expected[i] = Every5thBut9th;
|
||||
continue;
|
||||
}
|
||||
|
||||
Expected[i] = -1;
|
||||
}
|
||||
|
||||
int* d_Input;
|
||||
unsigned long long* d_Output;
|
||||
HIP_CHECK(hipMalloc(&d_Input, 4 * size));
|
||||
HIP_CHECK(hipMalloc(&d_Output, 8 * size));
|
||||
|
||||
int warpSize = getWarpSize();
|
||||
|
||||
HIP_CHECK(hipMemcpy(d_Input, &Input, 4 * size, hipMemcpyDefault));
|
||||
hipLaunchKernelGGL(ballot_3, 1, warpSize, 0, 0, d_Input, d_Output);
|
||||
|
||||
HIP_CHECK(hipMemcpy(&Output, d_Output, 8 * size, hipMemcpyDefault));
|
||||
for (int i = 0; i != warpSize; ++i) {
|
||||
REQUIRE(compareMaskEqual(Output, Expected, i, warpSize));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @addtogroup __vote_sync
|
||||
* @{
|
||||
* @ingroup VoteSyncTest
|
||||
*
|
||||
* - `unsigned long long __any_sync(unsigned long long mask, int predicate)`
|
||||
* - `unsigned long long __all_sync(unsigned long long mask, int predicate)`
|
||||
* - `unsigned long long __ballot_sync(unsigned long long mask, int predicate)`
|
||||
*
|
||||
* Contains warp vote sync functions.
|
||||
* @}
|
||||
*/
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Test cases to verify warp vote functions.
|
||||
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - catch/unit/kernel/hipVoteSyncTests.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.6
|
||||
*/
|
||||
|
||||
TEST_CASE("Unit_hipVoteSync_Any") {
|
||||
runTestAny_1();
|
||||
runTestAny_2_w64();
|
||||
runTestAny_2_w32();
|
||||
runTestAny_3();
|
||||
runTestAny_4();
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipVoteSync_All") {
|
||||
runTestAll_1_w64();
|
||||
runTestAll_1_w32();
|
||||
runTestAll_2();
|
||||
runTestAll_3();
|
||||
runTestAll_4();
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipVoteSync_Ballot") {
|
||||
runTestBallot_1();
|
||||
runTestBallot_2();
|
||||
runTestBallot_3();
|
||||
}
|
||||
@@ -19,11 +19,18 @@ THE SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#define HIP_ENABLE_WARP_SYNC_BUILTINS
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
#include <hip/hip_fp16.h>
|
||||
|
||||
static __device__ bool deactivate_thread(const uint64_t* const active_masks) {
|
||||
const unsigned long long Every5thBit = 0x1084210842108421;
|
||||
const unsigned long long Every9thBit = 0x8040201008040201;
|
||||
const unsigned long long Every5thBut9th = Every5thBit & ~Every9thBit;
|
||||
const unsigned long long AllThreads = ~0;
|
||||
|
||||
inline __device__ bool deactivate_thread(const uint64_t* const active_masks) {
|
||||
const auto warp =
|
||||
cooperative_groups::tiled_partition(cooperative_groups::this_thread_block(), warpSize);
|
||||
const auto block = cooperative_groups::this_thread_block();
|
||||
@@ -34,17 +41,17 @@ static __device__ bool deactivate_thread(const uint64_t* const active_masks) {
|
||||
return !(active_masks[idx] & (static_cast<uint64_t>(1) << warp.thread_rank()));
|
||||
}
|
||||
|
||||
static inline std::mt19937& GetRandomGenerator() {
|
||||
inline std::mt19937& GetRandomGenerator() {
|
||||
static std::mt19937 mt(std::random_device{}());
|
||||
return mt;
|
||||
}
|
||||
|
||||
template <typename T> static inline T GenerateRandomInteger(const T min, const T max) {
|
||||
template <typename T> inline T GenerateRandomInteger(const T min, const T max) {
|
||||
std::uniform_int_distribution<T> dist(min, max);
|
||||
return dist(GetRandomGenerator());
|
||||
}
|
||||
|
||||
template <typename T> static inline T GenerateRandomReal(const T min, const T max) {
|
||||
template <typename T> inline T GenerateRandomReal(const T min, const T max) {
|
||||
std::uniform_real_distribution<T> dist(min, max);
|
||||
return dist(GetRandomGenerator());
|
||||
}
|
||||
@@ -83,3 +90,78 @@ inline uint64_t get_active_mask(unsigned int warp_id, unsigned int warp_size) {
|
||||
}
|
||||
return active_mask;
|
||||
}
|
||||
|
||||
template <typename T, std::enable_if_t<std::is_integral<T>::value, bool> = true>
|
||||
inline T expandPrecision(int X) { return X; }
|
||||
|
||||
template <typename T, std::enable_if_t<std::is_floating_point<T>::value, bool> = true>
|
||||
inline T expandPrecision(int X) {
|
||||
return X * 3.141592653589793115997963468544185161590576171875;
|
||||
}
|
||||
|
||||
template <typename T, std::enable_if_t<std::is_same<T, __half>::value, bool> = true>
|
||||
inline __half expandPrecision(int X) {
|
||||
return (__half)expandPrecision<float>(X);
|
||||
}
|
||||
|
||||
template <typename T, std::enable_if_t<std::is_same<T, __half2>::value, bool> = true>
|
||||
inline __half2 expandPrecision(int X) {
|
||||
__half H = expandPrecision<float>(X);
|
||||
return {H, H};
|
||||
}
|
||||
|
||||
template <typename T, std::enable_if_t<std::is_integral<T>::value, bool> = true>
|
||||
inline void expandPrecision(T* Array, int size) {
|
||||
(void)Array;
|
||||
(void)size;
|
||||
}
|
||||
|
||||
template <typename T, std::enable_if_t<std::is_floating_point<T>::value, bool> = true>
|
||||
inline void expandPrecision(T *Array, int size) {
|
||||
for (int i = 0; i != size; ++i) {
|
||||
Array[i] *= 3.141592653589793115997963468544185161590576171875;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void initializeInput(T *Input, int size) {
|
||||
int Values[] = {0, -1, 2, 3, 4, 5, -6, 7,
|
||||
8, -9, 10, 11, 12, 13, -14, 15,
|
||||
16, 17, -18, 19, 20, -21, 22, 23,
|
||||
24, 25, 26, -27, 28, 29, 30, 31,
|
||||
-32, 33, 34, 35, -36, 37, 38, -39,
|
||||
40, 41, 42, 43, -44, -45, 46, 47,
|
||||
48, 49, 50, -51, 52, 53, -54, 55,
|
||||
56, 57, -58, 59, 60, 61, 62, -63};
|
||||
|
||||
for (int i = 0; i != size; ++i) {
|
||||
Input[i] = expandPrecision<T>(Values[i]);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void initializeExpected(T *Expected, int *Values, int size) {
|
||||
for (int i = 0; i != size; ++i) {
|
||||
Expected[i] = expandPrecision<T>(Values[i]);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline bool compareEqual(T X, T Y) { return X == Y; }
|
||||
|
||||
template <>
|
||||
inline bool compareEqual(__half X, __half Y) {
|
||||
return __half2float(X) == __half2float(Y);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline bool compareEqual(__half2 X, __half2 Y) {
|
||||
return compareEqual(X.x, Y.x) && compareEqual(X.y, Y.y);
|
||||
}
|
||||
|
||||
inline bool compareMaskEqual(unsigned long long *Actual, unsigned long long *Expected,
|
||||
int i, int warpSize) {
|
||||
if (warpSize == 32)
|
||||
return (unsigned)Actual[i] == (unsigned)Expected[i];
|
||||
return Actual[i] == Expected[i];
|
||||
}
|
||||
|
||||
In neuem Issue referenzieren
Einen Benutzer sperren