SWDEV-289409 - Batch 2 of device tests
Change-Id: I3976b5565bf040e77196ca797afb3f71f0e81d6c
[ROCm/hip-tests commit: 2498b5ebe4]
Этот коммит содержится в:
коммит произвёл
Jatin Chaudhary
родитель
702dc14c79
Коммит
f9a0c3a0d6
@@ -7,11 +7,18 @@ set(TEST_SRC
|
||||
ffs.cc
|
||||
funnelshift.cc
|
||||
brev.cc
|
||||
popc.cc
|
||||
ldg.cc
|
||||
syncthreadsand.cc
|
||||
syncthreadscount.cc
|
||||
syncthreadsor.cc
|
||||
threadfence_system.cc
|
||||
)
|
||||
|
||||
# AMD only tests
|
||||
set(AMD_TEST_SRC
|
||||
vectorTypesDevice.cc
|
||||
mbcnt.cc
|
||||
bitExtract.cc
|
||||
bitInsert.cc
|
||||
floatTM.cc
|
||||
|
||||
@@ -0,0 +1,250 @@
|
||||
/*
|
||||
Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
|
||||
|
||||
constexpr int WIDTH = 16;
|
||||
constexpr int HEIGHT = 16;
|
||||
constexpr int NUM = WIDTH * HEIGHT;
|
||||
|
||||
constexpr int THREADS_PER_BLOCK_X = 8;
|
||||
constexpr int THREADS_PER_BLOCK_Y = 8;
|
||||
|
||||
template <typename T>
|
||||
__global__ void vectoradd_float(T* a, const T* bm, int width, int height)
|
||||
|
||||
{
|
||||
int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
int i = y * width + x;
|
||||
if (i < (width * height)) {
|
||||
a[i] = __ldg(&bm[i]);
|
||||
}
|
||||
}
|
||||
|
||||
int2 make_vector2(int a) { return make_int2(a, a); }
|
||||
|
||||
char2 make_vector2(signed char a) { return make_char2(a, a); }
|
||||
|
||||
char4 make_vector4(signed char a) { return make_char4(a, a, a, a); }
|
||||
|
||||
short2 make_vector2(short a) { return make_short2(a, a); }
|
||||
|
||||
ushort2 make_vector2(unsigned short a) { return make_ushort2(a, a); }
|
||||
|
||||
short4 make_vector4(short a) { return make_short4(a, a, a, a); }
|
||||
|
||||
int4 make_vector4(int a) { return make_int4(a, a, a, a); }
|
||||
|
||||
uint2 make_vector2(unsigned int a) { return make_uint2(a, a); }
|
||||
|
||||
uint4 make_vector4(unsigned int a) { return make_uint4(a, a, a, a); }
|
||||
|
||||
float2 make_vector2(float a) { return make_float2(a, a); }
|
||||
|
||||
float4 make_vector4(float a) { return make_float4(a, a, a, a); }
|
||||
|
||||
uchar2 make_vector2(unsigned char a) { return make_uchar2(a, a); }
|
||||
|
||||
uchar4 make_vector4(unsigned char a) { return make_uchar4(a, a, a, a); }
|
||||
|
||||
double2 make_vector2(double a) { return make_double2(a, a); }
|
||||
|
||||
|
||||
template <typename T, typename U> int dataTypesRun() {
|
||||
T* hostA;
|
||||
T* hostB;
|
||||
|
||||
T* deviceA;
|
||||
T* deviceB;
|
||||
|
||||
int i;
|
||||
int errors;
|
||||
|
||||
hostA = (T*)malloc(NUM * sizeof(T));
|
||||
hostB = (T*)malloc(NUM * sizeof(T));
|
||||
|
||||
// initialize the input data
|
||||
for (i = 0; i < NUM; i++) {
|
||||
hostB[i] = (U)i;
|
||||
}
|
||||
|
||||
HIP_CHECK(hipMalloc((void**)&deviceA, NUM * sizeof(T)));
|
||||
HIP_CHECK(hipMalloc((void**)&deviceB, NUM * sizeof(T)));
|
||||
|
||||
HIP_CHECK(hipMemcpy(deviceB, hostB, NUM * sizeof(T), hipMemcpyHostToDevice));
|
||||
|
||||
|
||||
hipLaunchKernelGGL(vectoradd_float,
|
||||
dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
|
||||
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA,
|
||||
static_cast<const T*>(deviceB), WIDTH, HEIGHT);
|
||||
|
||||
|
||||
HIP_CHECK(hipMemcpy(hostA, deviceA, NUM * sizeof(T), hipMemcpyDeviceToHost));
|
||||
|
||||
// verify the results
|
||||
errors = 0;
|
||||
for (i = 0; i < NUM; i++) {
|
||||
if (hostA[i] != (hostB[i])) {
|
||||
errors++;
|
||||
}
|
||||
}
|
||||
|
||||
HIP_CHECK(hipFree(deviceA));
|
||||
HIP_CHECK(hipFree(deviceB));
|
||||
|
||||
free(hostA);
|
||||
free(hostB);
|
||||
|
||||
return errors;
|
||||
}
|
||||
|
||||
|
||||
template <typename T, typename U> int dataTypesRun2() {
|
||||
T* hostA;
|
||||
T* hostB;
|
||||
|
||||
T* deviceA;
|
||||
T* deviceB;
|
||||
|
||||
int i;
|
||||
int errors;
|
||||
|
||||
hostA = (T*)malloc(NUM * sizeof(T));
|
||||
hostB = (T*)malloc(NUM * sizeof(T));
|
||||
|
||||
// initialize the input data
|
||||
for (i = 0; i < NUM; i++) {
|
||||
hostB[i] = make_vector2((U)i);
|
||||
}
|
||||
|
||||
HIP_CHECK(hipMalloc((void**)&deviceA, NUM * sizeof(T)));
|
||||
HIP_CHECK(hipMalloc((void**)&deviceB, NUM * sizeof(T)));
|
||||
|
||||
HIP_CHECK(hipMemcpy(deviceB, hostB, NUM * sizeof(T), hipMemcpyHostToDevice));
|
||||
hipLaunchKernelGGL(vectoradd_float,
|
||||
dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
|
||||
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA,
|
||||
static_cast<const T*>(deviceB), WIDTH, HEIGHT);
|
||||
|
||||
|
||||
HIP_CHECK(hipMemcpy(hostA, deviceA, NUM * sizeof(T), hipMemcpyDeviceToHost));
|
||||
|
||||
// verify the results
|
||||
errors = 0;
|
||||
for (i = 0; i < NUM; i++) {
|
||||
if (hostA[i].x != (hostB[i].x) && hostA[i].y != (hostB[i].y)) {
|
||||
errors++;
|
||||
}
|
||||
}
|
||||
|
||||
HIP_CHECK(hipFree(deviceA));
|
||||
HIP_CHECK(hipFree(deviceB));
|
||||
|
||||
free(hostA);
|
||||
free(hostB);
|
||||
|
||||
return errors;
|
||||
}
|
||||
|
||||
|
||||
template <typename T, typename U> int dataTypesRun4() {
|
||||
T* hostA;
|
||||
T* hostB;
|
||||
|
||||
T* deviceA;
|
||||
T* deviceB;
|
||||
|
||||
int i;
|
||||
int errors;
|
||||
|
||||
hostA = (T*)malloc(NUM * sizeof(T));
|
||||
hostB = (T*)malloc(NUM * sizeof(T));
|
||||
|
||||
// initialize the input data
|
||||
for (i = 0; i < NUM; i++) {
|
||||
hostB[i] = make_vector4((U)i);
|
||||
}
|
||||
|
||||
HIP_CHECK(hipMalloc((void**)&deviceA, NUM * sizeof(T)));
|
||||
HIP_CHECK(hipMalloc((void**)&deviceB, NUM * sizeof(T)));
|
||||
|
||||
HIP_CHECK(hipMemcpy(deviceB, hostB, NUM * sizeof(T), hipMemcpyHostToDevice));
|
||||
|
||||
|
||||
hipLaunchKernelGGL(vectoradd_float,
|
||||
dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
|
||||
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA,
|
||||
static_cast<const T*>(deviceB), WIDTH, HEIGHT);
|
||||
|
||||
|
||||
HIP_CHECK(hipMemcpy(hostA, deviceA, NUM * sizeof(T), hipMemcpyDeviceToHost));
|
||||
|
||||
// verify the results
|
||||
errors = 0;
|
||||
for (i = 0; i < NUM; i++) {
|
||||
if (hostA[i].x != (hostB[i].x) && hostA[i].y != (hostB[i].y) && hostA[i].z != (hostB[i].z) &&
|
||||
hostA[i].w != (hostB[i].w)) {
|
||||
errors++;
|
||||
}
|
||||
}
|
||||
|
||||
HIP_CHECK(hipFree(deviceA));
|
||||
HIP_CHECK(hipFree(deviceB));
|
||||
|
||||
free(hostA);
|
||||
free(hostB);
|
||||
|
||||
return errors;
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_ldg") {
|
||||
using namespace std;
|
||||
|
||||
int errors;
|
||||
|
||||
errors = dataTypesRun<char, char>() | dataTypesRun<short, short>() | dataTypesRun<int, int>() |
|
||||
dataTypesRun<long, long>() | dataTypesRun<long long, long long>() |
|
||||
dataTypesRun<signed char, signed char>() | dataTypesRun<unsigned char, unsigned char>() |
|
||||
dataTypesRun<unsigned short, unsigned short>() | dataTypesRun<unsigned int, unsigned int>() |
|
||||
dataTypesRun<unsigned long, unsigned long>() |
|
||||
dataTypesRun<unsigned long long, unsigned long long>() | dataTypesRun<float, float>() |
|
||||
dataTypesRun<double, double>();
|
||||
|
||||
REQUIRE(errors == 0);
|
||||
|
||||
errors = dataTypesRun2<int2, int>() | dataTypesRun2<short2, short>() |
|
||||
dataTypesRun2<ushort2, unsigned short>() | dataTypesRun2<char2, signed char>() |
|
||||
dataTypesRun2<uchar2, unsigned char>() | dataTypesRun2<uint2, unsigned int>() |
|
||||
dataTypesRun2<float2, float>() | dataTypesRun2<double2, double>();
|
||||
|
||||
REQUIRE(errors == 0);
|
||||
|
||||
errors = dataTypesRun4<int4, int>() | dataTypesRun4<char4, signed char>() |
|
||||
dataTypesRun4<uchar4, unsigned char>() | dataTypesRun4<short4, short>() |
|
||||
dataTypesRun4<uint4, unsigned int>() | dataTypesRun4<float4, float>();
|
||||
|
||||
REQUIRE(errors == 0);
|
||||
}
|
||||
@@ -0,0 +1,103 @@
|
||||
/*
|
||||
Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
|
||||
#include <stdio.h>
|
||||
#include <algorithm>
|
||||
#include <stdlib.h>
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hip/device_functions.h>
|
||||
|
||||
__global__ void HIP_kernel(unsigned int* mbcnt_lo, unsigned int* mbcnt_hi, unsigned int* lane_id) {
|
||||
int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
mbcnt_lo[x] = __builtin_amdgcn_mbcnt_lo(0xFFFFFFFF, 0);
|
||||
mbcnt_hi[x] = __builtin_amdgcn_mbcnt_hi(0xFFFFFFFF, 0);
|
||||
lane_id[x] = __lane_id();
|
||||
}
|
||||
|
||||
|
||||
TEST_CASE("Unit_mbcnt") {
|
||||
using namespace std;
|
||||
unsigned int* device_mbcnt_lo;
|
||||
unsigned int* device_mbcnt_hi;
|
||||
unsigned int* device_lane_id;
|
||||
|
||||
hipDeviceProp_t devProp;
|
||||
HIP_CHECK(hipGetDeviceProperties(&devProp, 0));
|
||||
INFO("System minor : " << devProp.minor);
|
||||
INFO("System major : " << devProp.major);
|
||||
INFO("agent prop name : " << devProp.name);
|
||||
|
||||
INFO("hip Device prop succeeded");
|
||||
|
||||
constexpr unsigned int num_waves_per_block = 2;
|
||||
const unsigned int wave_size = devProp.warpSize;
|
||||
const unsigned int num_threads_per_block = wave_size * num_waves_per_block;
|
||||
const unsigned int num_blocks = 2;
|
||||
const unsigned int num_threads = num_threads_per_block * num_blocks;
|
||||
const size_t buffer_size = num_threads * sizeof(unsigned int);
|
||||
|
||||
HIP_CHECK(hipMalloc((void**)&device_mbcnt_lo, buffer_size));
|
||||
HIP_CHECK(hipMalloc((void**)&device_mbcnt_hi, buffer_size));
|
||||
HIP_CHECK(hipMalloc((void**)&device_lane_id, buffer_size));
|
||||
|
||||
hipLaunchKernelGGL(HIP_kernel, dim3(num_blocks), dim3(num_threads_per_block), 0, 0,
|
||||
device_mbcnt_lo, device_mbcnt_hi, device_lane_id);
|
||||
|
||||
unsigned int* host_mbcnt_lo = (unsigned int*)malloc(buffer_size);
|
||||
unsigned int* host_mbcnt_hi = (unsigned int*)malloc(buffer_size);
|
||||
unsigned int* host_lane_id = (unsigned int*)malloc(buffer_size);
|
||||
|
||||
HIP_CHECK(hipMemcpy(host_mbcnt_lo, device_mbcnt_lo, buffer_size, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipMemcpy(host_mbcnt_hi, device_mbcnt_hi, buffer_size, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipMemcpy(host_lane_id, device_lane_id, buffer_size, hipMemcpyDeviceToHost));
|
||||
|
||||
// verify the results
|
||||
int mbcnt_lo_errors = 0;
|
||||
int mbcnt_hi_errors = 0;
|
||||
int lane_id_errors = 0;
|
||||
for (unsigned int i = 0; i < num_threads; i++) {
|
||||
unsigned int this_lane_id = i % wave_size;
|
||||
unsigned int this_mbcnt_lo = this_lane_id >= 32 ? 32 : this_lane_id;
|
||||
unsigned int this_mbcnt_hi = this_lane_id < 32 ? 0 : (this_lane_id - 32);
|
||||
|
||||
if (host_mbcnt_lo[i] != this_mbcnt_lo) mbcnt_lo_errors++;
|
||||
|
||||
if (host_mbcnt_hi[i] != this_mbcnt_hi) mbcnt_hi_errors++;
|
||||
|
||||
if (host_lane_id[i] != this_lane_id) lane_id_errors++;
|
||||
}
|
||||
|
||||
|
||||
HIP_CHECK(hipFree(device_mbcnt_lo));
|
||||
HIP_CHECK(hipFree(device_mbcnt_hi));
|
||||
HIP_CHECK(hipFree(device_lane_id));
|
||||
|
||||
free(host_mbcnt_lo);
|
||||
free(host_mbcnt_hi);
|
||||
free(host_lane_id);
|
||||
|
||||
REQUIRE(mbcnt_lo_errors == 0);
|
||||
REQUIRE(mbcnt_hi_errors == 0);
|
||||
REQUIRE(lane_id_errors == 0);
|
||||
}
|
||||
@@ -0,0 +1,136 @@
|
||||
/*
|
||||
Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <cstdio>
|
||||
|
||||
constexpr int WIDTH = 16;
|
||||
constexpr int HEIGHT = 16;
|
||||
constexpr int NUM = WIDTH * HEIGHT;
|
||||
|
||||
constexpr int THREADS_PER_BLOCK_X = 8;
|
||||
constexpr int THREADS_PER_BLOCK_Y = 8;
|
||||
|
||||
// CPU implementation of popcount
|
||||
template <typename T> unsigned int popcountCPU(T value) {
|
||||
unsigned int ret = 0;
|
||||
while (value) {
|
||||
if (value & 0x1) ++ret;
|
||||
value >>= 1;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
__global__ void HIP_kernel(unsigned int* a, unsigned int* b, unsigned int* c,
|
||||
unsigned long long int* d, int width, int height) {
|
||||
int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
int i = y * width + x;
|
||||
if (i < (width * height)) {
|
||||
a[i] = __popc(b[i]);
|
||||
c[i] = __popcll(d[i]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
TEST_CASE("Unit_popc") {
|
||||
using namespace std;
|
||||
unsigned int* hostA;
|
||||
unsigned int* hostB;
|
||||
unsigned int* hostC;
|
||||
unsigned long long int* hostD;
|
||||
|
||||
unsigned int* deviceA;
|
||||
unsigned int* deviceB;
|
||||
unsigned int* deviceC;
|
||||
unsigned long long int* deviceD;
|
||||
|
||||
hipDeviceProp_t devProp;
|
||||
HIP_CHECK(hipGetDeviceProperties(&devProp, 0));
|
||||
INFO("System minor : " << devProp.minor);
|
||||
INFO("System major : " << devProp.major);
|
||||
INFO("agent prop name : " << devProp.name);
|
||||
|
||||
INFO("hip Device prop succeeded");
|
||||
|
||||
|
||||
int i;
|
||||
int errors;
|
||||
|
||||
hostA = (unsigned int*)malloc(NUM * sizeof(unsigned int));
|
||||
hostB = (unsigned int*)malloc(NUM * sizeof(unsigned int));
|
||||
hostC = (unsigned int*)malloc(NUM * sizeof(unsigned int));
|
||||
hostD = (unsigned long long int*)malloc(NUM * sizeof(unsigned long long int));
|
||||
|
||||
// initialize the input data
|
||||
for (i = 0; i < NUM; i++) {
|
||||
hostB[i] = i;
|
||||
hostD[i] = 1099511627776 - i;
|
||||
}
|
||||
|
||||
HIP_CHECK(hipMalloc((void**)&deviceA, NUM * sizeof(unsigned int)));
|
||||
HIP_CHECK(hipMalloc((void**)&deviceB, NUM * sizeof(unsigned int)));
|
||||
HIP_CHECK(hipMalloc((void**)&deviceC, NUM * sizeof(unsigned int)));
|
||||
HIP_CHECK(hipMalloc((void**)&deviceD, NUM * sizeof(unsigned long long int)));
|
||||
|
||||
HIP_CHECK(hipMemcpy(deviceB, hostB, NUM * sizeof(unsigned int), hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipMemcpy(deviceD, hostD, NUM * sizeof(unsigned long long int), hipMemcpyHostToDevice));
|
||||
|
||||
|
||||
hipLaunchKernelGGL(HIP_kernel, dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
|
||||
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA, deviceB,
|
||||
deviceC, deviceD, WIDTH, HEIGHT);
|
||||
|
||||
|
||||
HIP_CHECK(hipMemcpy(hostA, deviceA, NUM * sizeof(unsigned int), hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipMemcpy(hostC, deviceC, NUM * sizeof(unsigned int), hipMemcpyDeviceToHost));
|
||||
|
||||
// verify the results
|
||||
errors = 0;
|
||||
for (i = 0; i < NUM; i++) {
|
||||
if (hostA[i] != popcountCPU(hostB[i])) {
|
||||
errors++;
|
||||
}
|
||||
}
|
||||
|
||||
CHECK(errors == 0);
|
||||
|
||||
for (i = 0; i < NUM; i++) {
|
||||
if (hostC[i] != popcountCPU(hostD[i])) {
|
||||
errors++;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
HIP_CHECK(hipFree(deviceA));
|
||||
HIP_CHECK(hipFree(deviceB));
|
||||
HIP_CHECK(hipFree(deviceC));
|
||||
HIP_CHECK(hipFree(deviceD));
|
||||
|
||||
free(hostA);
|
||||
free(hostB);
|
||||
free(hostC);
|
||||
free(hostD);
|
||||
|
||||
REQUIRE(errors == 0);
|
||||
}
|
||||
@@ -0,0 +1,127 @@
|
||||
/*
|
||||
Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
|
||||
static __global__ void kernel_syncthreads_and(int* syncTestD, int* allThreadsZeroD,
|
||||
int* allThreadsOneD, int* oneThreadZeroD,
|
||||
int* allThreadsMinusOneD) {
|
||||
int blockSize = blockDim.x;
|
||||
int predicate = 10;
|
||||
// First block index starts with 0, and second block index starts
|
||||
// with blockSize
|
||||
int i = (blockIdx.x == 0) ? threadIdx.x : blockSize + threadIdx.x;
|
||||
|
||||
// At very first, we need to ensure work-group level syncronization
|
||||
// properly happened, don't bother about predicate testing for now.
|
||||
// Thread 0 and thread 1 writes to shared memory. After call to api,
|
||||
// every thread reads shared memory, and store product for verification
|
||||
__shared__ int sm[2];
|
||||
if (threadIdx.x == 0)
|
||||
sm[0] = 10;
|
||||
else if (threadIdx.x == 1)
|
||||
sm[1] = 20;
|
||||
__syncthreads_and(predicate);
|
||||
syncTestD[i] = sm[0] * sm[1];
|
||||
|
||||
// All threads pass 0 as predicate value, result should be 0
|
||||
predicate = 0;
|
||||
allThreadsZeroD[i] = __syncthreads_and(predicate);
|
||||
|
||||
// All threads pass 1 as predicate value, result should be 1
|
||||
predicate = 1;
|
||||
allThreadsOneD[i] = __syncthreads_and(predicate);
|
||||
|
||||
// Thread 0 pass 0, and all other threads 1 as predicate value,
|
||||
// result should be 0
|
||||
predicate = (threadIdx.x == 0) ? 0 : 1;
|
||||
oneThreadZeroD[i] = __syncthreads_and(predicate);
|
||||
|
||||
// All threads pass -1 as predicate value, result should be 1
|
||||
predicate = -1;
|
||||
allThreadsMinusOneD[i] = __syncthreads_and(predicate);
|
||||
}
|
||||
|
||||
static void test_syncthreads_and(int blockSize) {
|
||||
int nBytes = sizeof(int) * 2 * blockSize;
|
||||
int *syncTestD, *syncTestH;
|
||||
int *allThreadsZeroD, *allThreadsZeroH;
|
||||
int *allThreadsOneD, *allThreadsOneH;
|
||||
int *oneThreadZeroD, *oneThreadZeroH;
|
||||
int *allThreadsMinusOneD, *allThreadsMinusOneH;
|
||||
|
||||
// Allocate device memory
|
||||
HIP_CHECK(hipMalloc((void**)&syncTestD, nBytes));
|
||||
HIP_CHECK(hipMalloc((void**)&allThreadsZeroD, nBytes));
|
||||
HIP_CHECK(hipMalloc((void**)&allThreadsOneD, nBytes));
|
||||
HIP_CHECK(hipMalloc((void**)&oneThreadZeroD, nBytes));
|
||||
HIP_CHECK(hipMalloc((void**)&allThreadsMinusOneD, nBytes));
|
||||
|
||||
// Allocate host memory
|
||||
HIP_CHECK(hipHostMalloc((void**)&syncTestH, nBytes));
|
||||
HIP_CHECK(hipHostMalloc((void**)&allThreadsZeroH, nBytes));
|
||||
HIP_CHECK(hipHostMalloc((void**)&allThreadsOneH, nBytes));
|
||||
HIP_CHECK(hipHostMalloc((void**)&oneThreadZeroH, nBytes));
|
||||
HIP_CHECK(hipHostMalloc((void**)&allThreadsMinusOneH, nBytes));
|
||||
|
||||
// Launch Kernel
|
||||
hipLaunchKernelGGL(kernel_syncthreads_and, 2, blockSize, 0, 0, syncTestD, allThreadsZeroD,
|
||||
allThreadsOneD, oneThreadZeroD, allThreadsMinusOneD);
|
||||
|
||||
// Copy result from device to host
|
||||
HIP_CHECK(hipMemcpy(syncTestH, syncTestD, nBytes, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipMemcpy(allThreadsZeroH, allThreadsZeroD, nBytes, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipMemcpy(allThreadsOneH, allThreadsOneD, nBytes, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipMemcpy(oneThreadZeroH, oneThreadZeroD, nBytes, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipMemcpy(allThreadsMinusOneH, allThreadsMinusOneD, nBytes, hipMemcpyDeviceToHost));
|
||||
|
||||
// Validate results for both blocks together
|
||||
for (int i = 0; i < 2 * blockSize; ++i) {
|
||||
REQUIRE(syncTestH[i] == 200);
|
||||
REQUIRE(allThreadsZeroH[i] == 0);
|
||||
REQUIRE(allThreadsOneH[i] == 1);
|
||||
REQUIRE(oneThreadZeroH[i] == 0);
|
||||
REQUIRE(allThreadsMinusOneH[i] == 1);
|
||||
}
|
||||
|
||||
// Free device memory
|
||||
HIP_CHECK(hipFree(syncTestD));
|
||||
HIP_CHECK(hipFree(allThreadsZeroD));
|
||||
HIP_CHECK(hipFree(allThreadsOneD));
|
||||
HIP_CHECK(hipFree(oneThreadZeroD));
|
||||
HIP_CHECK(hipFree(allThreadsMinusOneD));
|
||||
|
||||
// Free host memory
|
||||
HIP_CHECK(hipHostFree(syncTestH));
|
||||
HIP_CHECK(hipHostFree(allThreadsZeroH));
|
||||
HIP_CHECK(hipHostFree(allThreadsOneH));
|
||||
HIP_CHECK(hipHostFree(oneThreadZeroH));
|
||||
HIP_CHECK(hipHostFree(allThreadsMinusOneH));
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_syncthreads_and") {
|
||||
int blockSizes[] = {10, 40, 70, 130, 240, 723, 32, 64, 128, 256, 512, 1024};
|
||||
for (unsigned long i = 0; i < (sizeof(blockSizes) / sizeof(blockSizes[0])); ++i)
|
||||
test_syncthreads_and(blockSizes[i]);
|
||||
}
|
||||
@@ -0,0 +1,138 @@
|
||||
/*
|
||||
Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
|
||||
static __global__ void kernel_syncthreads_count(int* syncTestD, int* allThreadsZeroD,
|
||||
int* allThreadsOneD, int* oddThreadsOneD,
|
||||
int* allThreadsMinusOneD, int* allThreadsIdD) {
|
||||
int blockSize = blockDim.x;
|
||||
int predicate = 10;
|
||||
// First block index starts with 0, and second block index starts
|
||||
// with blockSize
|
||||
int i = (blockIdx.x == 0) ? threadIdx.x : blockSize + threadIdx.x;
|
||||
|
||||
// At very first, we need to ensure work-group level syncronization
|
||||
// properly happened, don't bother about predicate testing for now.
|
||||
// Thread 0 and thread 1 writes to shared memory. After call to api,
|
||||
// every thread reads shared memory, and store sum for verification
|
||||
__shared__ int sm[2];
|
||||
if (threadIdx.x == 0)
|
||||
sm[0] = 10;
|
||||
else if (threadIdx.x == 1)
|
||||
sm[1] = 20;
|
||||
__syncthreads_count(predicate);
|
||||
syncTestD[i] = sm[0] + sm[1];
|
||||
|
||||
// All threads pass 0 as predicate value, result should be 0
|
||||
predicate = 0;
|
||||
allThreadsZeroD[i] = __syncthreads_count(predicate);
|
||||
|
||||
// All threads pass 1 as predicate value, result should be blockSize
|
||||
predicate = 1;
|
||||
allThreadsOneD[i] = __syncthreads_count(predicate);
|
||||
|
||||
// Odd numbered threads pass 1, and even numbered threads pass 0, as
|
||||
// predicate value, result should be blockSize / 2
|
||||
predicate = threadIdx.x % 2;
|
||||
oddThreadsOneD[i] = __syncthreads_count(predicate);
|
||||
|
||||
// All threads pass -1 as predicate value, result should blockSize
|
||||
predicate = -1;
|
||||
allThreadsMinusOneD[i] = __syncthreads_count(predicate);
|
||||
|
||||
// Each thread pass its ID as predicate value, result should be blockSize - 1
|
||||
predicate = threadIdx.x;
|
||||
allThreadsIdD[i] = __syncthreads_count(predicate);
|
||||
}
|
||||
|
||||
void test_syncthreads_count(int blockSize) {
|
||||
int nBytes = sizeof(int) * 2 * blockSize;
|
||||
int *syncTestD, *syncTestH;
|
||||
int *allThreadsZeroD, *allThreadsZeroH;
|
||||
int *allThreadsOneD, *allThreadsOneH;
|
||||
int *oddThreadsOneD, *oddThreadsOneH;
|
||||
int *allThreadsMinusOneD, *allThreadsMinusOneH;
|
||||
int *allThreadsIdD, *allThreadsIdH;
|
||||
|
||||
// Allocate device memory
|
||||
HIP_CHECK(hipMalloc((void**)&syncTestD, nBytes));
|
||||
HIP_CHECK(hipMalloc((void**)&allThreadsZeroD, nBytes));
|
||||
HIP_CHECK(hipMalloc((void**)&allThreadsOneD, nBytes));
|
||||
HIP_CHECK(hipMalloc((void**)&oddThreadsOneD, nBytes));
|
||||
HIP_CHECK(hipMalloc((void**)&allThreadsMinusOneD, nBytes));
|
||||
HIP_CHECK(hipMalloc((void**)&allThreadsIdD, nBytes));
|
||||
|
||||
// Allocate host memory
|
||||
HIP_CHECK(hipHostMalloc((void**)&syncTestH, nBytes));
|
||||
HIP_CHECK(hipHostMalloc((void**)&allThreadsZeroH, nBytes));
|
||||
HIP_CHECK(hipHostMalloc((void**)&allThreadsOneH, nBytes));
|
||||
HIP_CHECK(hipHostMalloc((void**)&oddThreadsOneH, nBytes));
|
||||
HIP_CHECK(hipHostMalloc((void**)&allThreadsMinusOneH, nBytes));
|
||||
HIP_CHECK(hipHostMalloc((void**)&allThreadsIdH, nBytes));
|
||||
|
||||
// Launch Kernel
|
||||
hipLaunchKernelGGL(kernel_syncthreads_count, 2, blockSize, 0, 0, syncTestD, allThreadsZeroD,
|
||||
allThreadsOneD, oddThreadsOneD, allThreadsMinusOneD, allThreadsIdD);
|
||||
|
||||
// Copy result from device to host
|
||||
HIP_CHECK(hipMemcpy(syncTestH, syncTestD, nBytes, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipMemcpy(allThreadsZeroH, allThreadsZeroD, nBytes, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipMemcpy(allThreadsOneH, allThreadsOneD, nBytes, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipMemcpy(oddThreadsOneH, oddThreadsOneD, nBytes, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipMemcpy(allThreadsMinusOneH, allThreadsMinusOneD, nBytes, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipMemcpy(allThreadsIdH, allThreadsIdD, nBytes, hipMemcpyDeviceToHost));
|
||||
|
||||
// Validate results for both the blocks together
|
||||
for (int i = 0; i < 2 * blockSize; ++i) {
|
||||
REQUIRE(syncTestH[i] == 30);
|
||||
REQUIRE(allThreadsZeroH[i] == 0);
|
||||
REQUIRE(allThreadsOneH[i] == blockSize);
|
||||
REQUIRE(oddThreadsOneH[i] == blockSize / 2);
|
||||
REQUIRE(allThreadsMinusOneH[i] == blockSize);
|
||||
REQUIRE(allThreadsIdH[i] == (blockSize - 1));
|
||||
}
|
||||
|
||||
// Free device memory
|
||||
HIP_CHECK(hipFree(syncTestD));
|
||||
HIP_CHECK(hipFree(allThreadsZeroD));
|
||||
HIP_CHECK(hipFree(allThreadsOneD));
|
||||
HIP_CHECK(hipFree(oddThreadsOneD));
|
||||
HIP_CHECK(hipFree(allThreadsMinusOneD));
|
||||
HIP_CHECK(hipFree(allThreadsIdD));
|
||||
|
||||
// Free host memory
|
||||
HIP_CHECK(hipHostFree(syncTestH));
|
||||
HIP_CHECK(hipHostFree(allThreadsZeroH));
|
||||
HIP_CHECK(hipHostFree(allThreadsOneH));
|
||||
HIP_CHECK(hipHostFree(oddThreadsOneH));
|
||||
HIP_CHECK(hipHostFree(allThreadsMinusOneH));
|
||||
HIP_CHECK(hipHostFree(allThreadsIdH));
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_syncthreads_count") {
|
||||
int blockSizes[] = {10, 40, 70, 130, 240, 723, 32, 64, 128, 256, 512, 1024};
|
||||
for (unsigned long i = 0; i < (sizeof(blockSizes) / sizeof(blockSizes[0])); ++i)
|
||||
test_syncthreads_count(blockSizes[i]);
|
||||
}
|
||||
@@ -0,0 +1,128 @@
|
||||
/*
|
||||
Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
|
||||
static __global__ void kernel_syncthreads_or(int* syncTestD, int* allThreadsZeroD,
|
||||
int* allThreadsOneD, int* oneThreadOneD,
|
||||
int* allThreadsMinusOneD) {
|
||||
int blockSize = blockDim.x;
|
||||
int predicate = 10;
|
||||
// First block index starts with 0, and second block index starts
|
||||
// with blockSize
|
||||
int i = (blockIdx.x == 0) ? threadIdx.x : blockSize + threadIdx.x;
|
||||
|
||||
// At very first, we need to ensure work-group level syncronization
|
||||
// properly happened, don't bother about predicate testing for now.
|
||||
// Thread 0 and thread 1 writes to shared memory. After call to api,
|
||||
// every thread reads shared memory, and store subtraction for verification
|
||||
__shared__ int sm[2];
|
||||
if (threadIdx.x == 0)
|
||||
sm[0] = 10;
|
||||
else if (threadIdx.x == 1)
|
||||
sm[1] = 20;
|
||||
__syncthreads_or(predicate);
|
||||
syncTestD[i] = sm[1] - sm[0];
|
||||
|
||||
// All threads pass 0 as predicate value, result should be 0
|
||||
predicate = 0;
|
||||
allThreadsZeroD[i] = __syncthreads_or(predicate);
|
||||
|
||||
// All threads pass 1 as predicate value, result should be 1
|
||||
predicate = 1;
|
||||
allThreadsOneD[i] = __syncthreads_or(predicate);
|
||||
|
||||
// Thread 0 pass 1, and all other threads 0 as predicate value,
|
||||
// result should be 1
|
||||
predicate = (threadIdx.x == 0) ? 1 : 0;
|
||||
oneThreadOneD[i] = __syncthreads_or(predicate);
|
||||
|
||||
// All threads pass -1 as predicate value, result should be 1
|
||||
predicate = -1;
|
||||
allThreadsMinusOneD[i] = __syncthreads_or(predicate);
|
||||
}
|
||||
|
||||
static void test_syncthreads_or(int blockSize) {
|
||||
int nBytes = sizeof(int) * 2 * blockSize;
|
||||
int *syncTestD, *syncTestH;
|
||||
int *allThreadsZeroD, *allThreadsZeroH;
|
||||
int *allThreadsOneD, *allThreadsOneH;
|
||||
int *oneThreadOneD, *oneThreadOneH;
|
||||
int *allThreadsMinusOneD, *allThreadsMinusOneH;
|
||||
|
||||
// Allocate device memory
|
||||
HIP_CHECK(hipMalloc((void**)&syncTestD, nBytes));
|
||||
HIP_CHECK(hipMalloc((void**)&allThreadsZeroD, nBytes));
|
||||
HIP_CHECK(hipMalloc((void**)&allThreadsOneD, nBytes));
|
||||
HIP_CHECK(hipMalloc((void**)&oneThreadOneD, nBytes));
|
||||
HIP_CHECK(hipMalloc((void**)&allThreadsMinusOneD, nBytes));
|
||||
|
||||
// Allocate host memory
|
||||
HIP_CHECK(hipHostMalloc((void**)&syncTestH, nBytes));
|
||||
HIP_CHECK(hipHostMalloc((void**)&allThreadsZeroH, nBytes));
|
||||
HIP_CHECK(hipHostMalloc((void**)&allThreadsOneH, nBytes));
|
||||
HIP_CHECK(hipHostMalloc((void**)&oneThreadOneH, nBytes));
|
||||
HIP_CHECK(hipHostMalloc((void**)&allThreadsMinusOneH, nBytes));
|
||||
|
||||
// Launch Kernel
|
||||
hipLaunchKernelGGL(kernel_syncthreads_or, 2, blockSize, 0, 0, syncTestD, allThreadsZeroD,
|
||||
allThreadsOneD, oneThreadOneD, allThreadsMinusOneD);
|
||||
|
||||
// Copy result from device to host
|
||||
HIP_CHECK(hipMemcpy(syncTestH, syncTestD, nBytes, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipMemcpy(allThreadsZeroH, allThreadsZeroD, nBytes, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipMemcpy(allThreadsOneH, allThreadsOneD, nBytes, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipMemcpy(oneThreadOneH, oneThreadOneD, nBytes, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipMemcpy(allThreadsMinusOneH, allThreadsMinusOneD, nBytes, hipMemcpyDeviceToHost));
|
||||
|
||||
// Validate results for both blocks together
|
||||
for (int i = 0; i < 2 * blockSize; ++i) {
|
||||
REQUIRE(syncTestH[i] == 10);
|
||||
REQUIRE(allThreadsZeroH[i] == 0);
|
||||
REQUIRE(allThreadsOneH[i] == 1);
|
||||
REQUIRE(oneThreadOneH[i] == 1);
|
||||
REQUIRE(allThreadsMinusOneH[i] == 1);
|
||||
}
|
||||
|
||||
// Free device memory
|
||||
HIP_CHECK(hipFree(syncTestD));
|
||||
HIP_CHECK(hipFree(allThreadsZeroD));
|
||||
HIP_CHECK(hipFree(allThreadsOneD));
|
||||
HIP_CHECK(hipFree(oneThreadOneD));
|
||||
HIP_CHECK(hipFree(allThreadsMinusOneD));
|
||||
|
||||
// Free host memory
|
||||
HIP_CHECK(hipHostFree(syncTestH));
|
||||
HIP_CHECK(hipHostFree(allThreadsZeroH));
|
||||
HIP_CHECK(hipHostFree(allThreadsOneH));
|
||||
HIP_CHECK(hipHostFree(oneThreadOneH));
|
||||
HIP_CHECK(hipHostFree(allThreadsMinusOneH));
|
||||
}
|
||||
|
||||
|
||||
TEST_CASE("Unit_syncthreads_or") {
|
||||
int blockSizes[] = {10, 40, 70, 130, 240, 723, 32, 64, 128, 256, 512, 1024};
|
||||
for (unsigned long i = 0; i < (sizeof(blockSizes) / sizeof(blockSizes[0])); ++i)
|
||||
test_syncthreads_or(blockSizes[i]);
|
||||
}
|
||||
@@ -0,0 +1,112 @@
|
||||
/*
|
||||
Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
#include <atomic>
|
||||
#include <thread>
|
||||
|
||||
__host__ __device__ void fence_system() {
|
||||
#ifdef __HIP_DEVICE_COMPILE__
|
||||
__threadfence_system();
|
||||
#else
|
||||
std::atomic_thread_fence(std::memory_order_seq_cst);
|
||||
#endif
|
||||
}
|
||||
|
||||
__host__ __device__ void round_robin(const int id, const int num_dev, const int num_iter,
|
||||
volatile int* data, volatile int* flag) {
|
||||
for (int i = 0; i < num_iter; i++) {
|
||||
while (*flag % num_dev != id) fence_system(); // invalid the cache for read
|
||||
|
||||
(*data)++;
|
||||
fence_system(); // make sure the store to data is sequenced before the store to flag
|
||||
(*flag)++;
|
||||
fence_system(); // invalid the cache to flush out flag
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void gpu_round_robin(const int id, const int num_dev, const int num_iter,
|
||||
volatile int* data, volatile int* flag) {
|
||||
round_robin(id, num_dev, num_iter, data, flag);
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_threadfence_system") {
|
||||
int num_gpus = 0;
|
||||
HIP_CHECK(hipGetDeviceCount(&num_gpus));
|
||||
REQUIRE(num_gpus > 0);
|
||||
|
||||
volatile int* data;
|
||||
if (hipHostMalloc(&data, sizeof(int), hipHostMallocCoherent) != hipSuccess) {
|
||||
SUCCEED("Memory allocation failed. Skip test. Is SVM atomic supported?");
|
||||
}
|
||||
|
||||
constexpr int init_data = 1000;
|
||||
*data = init_data;
|
||||
|
||||
volatile int* flag;
|
||||
if (hipHostMalloc(&flag, sizeof(int), hipHostMallocCoherent) != hipSuccess) {
|
||||
SUCCEED("Memory allocation failed. Skip test. Is SVM atomic supported?");
|
||||
}
|
||||
*flag = 0;
|
||||
|
||||
// number of rounds per device
|
||||
constexpr int num_iter = 1000;
|
||||
|
||||
// one CPU thread + 1 kernel/GPU
|
||||
const int num_dev = num_gpus + 1;
|
||||
|
||||
int next_id = 0;
|
||||
std::vector<std::thread> threads;
|
||||
|
||||
// create a CPU thread for the round_robin
|
||||
threads.push_back(std::thread(round_robin, next_id++, num_dev, num_iter, data, flag));
|
||||
|
||||
// run one thread per GPU
|
||||
dim3 dim_block(1, 1, 1);
|
||||
dim3 dim_grid(1, 1, 1);
|
||||
|
||||
// launch one kernel per device for the round robin
|
||||
for (; next_id < num_dev; ++next_id) {
|
||||
threads.push_back(std::thread([=]() {
|
||||
HIP_CHECK(hipSetDevice(next_id - 1));
|
||||
hipLaunchKernelGGL(gpu_round_robin, dim_grid, dim_block, 0, 0x0, next_id, num_dev, num_iter,
|
||||
data, flag);
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
}));
|
||||
}
|
||||
|
||||
for (auto& t : threads) {
|
||||
t.join();
|
||||
}
|
||||
|
||||
int expected_data = init_data + num_dev * num_iter;
|
||||
int expected_flag = num_dev * num_iter;
|
||||
|
||||
bool passed = *data == expected_data && *flag == expected_flag;
|
||||
|
||||
HIP_CHECK(hipHostFree((void*)data));
|
||||
HIP_CHECK(hipHostFree((void*)flag));
|
||||
|
||||
REQUIRE(passed == true);
|
||||
}
|
||||
Ссылка в новой задаче
Block a user