diff --git a/catch/unit/deviceLib/CMakeLists.txt b/catch/unit/deviceLib/CMakeLists.txt index 357a68e4a3..ce11801786 100644 --- a/catch/unit/deviceLib/CMakeLists.txt +++ b/catch/unit/deviceLib/CMakeLists.txt @@ -7,11 +7,18 @@ set(TEST_SRC ffs.cc funnelshift.cc brev.cc + popc.cc + ldg.cc + syncthreadsand.cc + syncthreadscount.cc + syncthreadsor.cc + threadfence_system.cc ) # AMD only tests set(AMD_TEST_SRC vectorTypesDevice.cc + mbcnt.cc bitExtract.cc bitInsert.cc floatTM.cc diff --git a/catch/unit/deviceLib/ldg.cc b/catch/unit/deviceLib/ldg.cc new file mode 100644 index 0000000000..61456ee940 --- /dev/null +++ b/catch/unit/deviceLib/ldg.cc @@ -0,0 +1,250 @@ +/* +Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include + + +constexpr int WIDTH = 16; +constexpr int HEIGHT = 16; +constexpr int NUM = WIDTH * HEIGHT; + +constexpr int THREADS_PER_BLOCK_X = 8; +constexpr int THREADS_PER_BLOCK_Y = 8; + +template +__global__ void vectoradd_float(T* a, const T* bm, int width, int height) + +{ + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; + + int i = y * width + x; + if (i < (width * height)) { + a[i] = __ldg(&bm[i]); + } +} + +int2 make_vector2(int a) { return make_int2(a, a); } + +char2 make_vector2(signed char a) { return make_char2(a, a); } + +char4 make_vector4(signed char a) { return make_char4(a, a, a, a); } + +short2 make_vector2(short a) { return make_short2(a, a); } + +ushort2 make_vector2(unsigned short a) { return make_ushort2(a, a); } + +short4 make_vector4(short a) { return make_short4(a, a, a, a); } + +int4 make_vector4(int a) { return make_int4(a, a, a, a); } + +uint2 make_vector2(unsigned int a) { return make_uint2(a, a); } + +uint4 make_vector4(unsigned int a) { return make_uint4(a, a, a, a); } + +float2 make_vector2(float a) { return make_float2(a, a); } + +float4 make_vector4(float a) { return make_float4(a, a, a, a); } + +uchar2 make_vector2(unsigned char a) { return make_uchar2(a, a); } + +uchar4 make_vector4(unsigned char a) { return make_uchar4(a, a, a, a); } + +double2 make_vector2(double a) { return make_double2(a, a); } + + +template int dataTypesRun() { + T* hostA; + T* hostB; + + T* deviceA; + T* deviceB; + + int i; + int errors; + + hostA = (T*)malloc(NUM * sizeof(T)); + hostB = (T*)malloc(NUM * sizeof(T)); + + // initialize the input data + for (i = 0; i < NUM; i++) { + hostB[i] = (U)i; + } + + HIP_CHECK(hipMalloc((void**)&deviceA, NUM * sizeof(T))); + HIP_CHECK(hipMalloc((void**)&deviceB, NUM * sizeof(T))); + + HIP_CHECK(hipMemcpy(deviceB, hostB, NUM * sizeof(T), hipMemcpyHostToDevice)); + + + hipLaunchKernelGGL(vectoradd_float, + dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA, + static_cast(deviceB), WIDTH, HEIGHT); + + + HIP_CHECK(hipMemcpy(hostA, deviceA, NUM * sizeof(T), hipMemcpyDeviceToHost)); + + // verify the results + errors = 0; + for (i = 0; i < NUM; i++) { + if (hostA[i] != (hostB[i])) { + errors++; + } + } + + HIP_CHECK(hipFree(deviceA)); + HIP_CHECK(hipFree(deviceB)); + + free(hostA); + free(hostB); + + return errors; +} + + +template int dataTypesRun2() { + T* hostA; + T* hostB; + + T* deviceA; + T* deviceB; + + int i; + int errors; + + hostA = (T*)malloc(NUM * sizeof(T)); + hostB = (T*)malloc(NUM * sizeof(T)); + + // initialize the input data + for (i = 0; i < NUM; i++) { + hostB[i] = make_vector2((U)i); + } + + HIP_CHECK(hipMalloc((void**)&deviceA, NUM * sizeof(T))); + HIP_CHECK(hipMalloc((void**)&deviceB, NUM * sizeof(T))); + + HIP_CHECK(hipMemcpy(deviceB, hostB, NUM * sizeof(T), hipMemcpyHostToDevice)); + hipLaunchKernelGGL(vectoradd_float, + dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA, + static_cast(deviceB), WIDTH, HEIGHT); + + + HIP_CHECK(hipMemcpy(hostA, deviceA, NUM * sizeof(T), hipMemcpyDeviceToHost)); + + // verify the results + errors = 0; + for (i = 0; i < NUM; i++) { + if (hostA[i].x != (hostB[i].x) && hostA[i].y != (hostB[i].y)) { + errors++; + } + } + + HIP_CHECK(hipFree(deviceA)); + HIP_CHECK(hipFree(deviceB)); + + free(hostA); + free(hostB); + + return errors; +} + + +template int dataTypesRun4() { + T* hostA; + T* hostB; + + T* deviceA; + T* deviceB; + + int i; + int errors; + + hostA = (T*)malloc(NUM * sizeof(T)); + hostB = (T*)malloc(NUM * sizeof(T)); + + // initialize the input data + for (i = 0; i < NUM; i++) { + hostB[i] = make_vector4((U)i); + } + + HIP_CHECK(hipMalloc((void**)&deviceA, NUM * sizeof(T))); + HIP_CHECK(hipMalloc((void**)&deviceB, NUM * sizeof(T))); + + HIP_CHECK(hipMemcpy(deviceB, hostB, NUM * sizeof(T), hipMemcpyHostToDevice)); + + + hipLaunchKernelGGL(vectoradd_float, + dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA, + static_cast(deviceB), WIDTH, HEIGHT); + + + HIP_CHECK(hipMemcpy(hostA, deviceA, NUM * sizeof(T), hipMemcpyDeviceToHost)); + + // verify the results + errors = 0; + for (i = 0; i < NUM; i++) { + if (hostA[i].x != (hostB[i].x) && hostA[i].y != (hostB[i].y) && hostA[i].z != (hostB[i].z) && + hostA[i].w != (hostB[i].w)) { + errors++; + } + } + + HIP_CHECK(hipFree(deviceA)); + HIP_CHECK(hipFree(deviceB)); + + free(hostA); + free(hostB); + + return errors; +} + +TEST_CASE("Unit_ldg") { + using namespace std; + + int errors; + + errors = dataTypesRun() | dataTypesRun() | dataTypesRun() | + dataTypesRun() | dataTypesRun() | + dataTypesRun() | dataTypesRun() | + dataTypesRun() | dataTypesRun() | + dataTypesRun() | + dataTypesRun() | dataTypesRun() | + dataTypesRun(); + + REQUIRE(errors == 0); + + errors = dataTypesRun2() | dataTypesRun2() | + dataTypesRun2() | dataTypesRun2() | + dataTypesRun2() | dataTypesRun2() | + dataTypesRun2() | dataTypesRun2(); + + REQUIRE(errors == 0); + + errors = dataTypesRun4() | dataTypesRun4() | + dataTypesRun4() | dataTypesRun4() | + dataTypesRun4() | dataTypesRun4(); + + REQUIRE(errors == 0); +} \ No newline at end of file diff --git a/catch/unit/deviceLib/mbcnt.cc b/catch/unit/deviceLib/mbcnt.cc new file mode 100644 index 0000000000..eeeb0f1c54 --- /dev/null +++ b/catch/unit/deviceLib/mbcnt.cc @@ -0,0 +1,103 @@ +/* +Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include + +#include +#include +#include +#include +#include + +__global__ void HIP_kernel(unsigned int* mbcnt_lo, unsigned int* mbcnt_hi, unsigned int* lane_id) { + int x = blockDim.x * blockIdx.x + threadIdx.x; + mbcnt_lo[x] = __builtin_amdgcn_mbcnt_lo(0xFFFFFFFF, 0); + mbcnt_hi[x] = __builtin_amdgcn_mbcnt_hi(0xFFFFFFFF, 0); + lane_id[x] = __lane_id(); +} + + +TEST_CASE("Unit_mbcnt") { + using namespace std; + unsigned int* device_mbcnt_lo; + unsigned int* device_mbcnt_hi; + unsigned int* device_lane_id; + + hipDeviceProp_t devProp; + HIP_CHECK(hipGetDeviceProperties(&devProp, 0)); + INFO("System minor : " << devProp.minor); + INFO("System major : " << devProp.major); + INFO("agent prop name : " << devProp.name); + + INFO("hip Device prop succeeded"); + + constexpr unsigned int num_waves_per_block = 2; + const unsigned int wave_size = devProp.warpSize; + const unsigned int num_threads_per_block = wave_size * num_waves_per_block; + const unsigned int num_blocks = 2; + const unsigned int num_threads = num_threads_per_block * num_blocks; + const size_t buffer_size = num_threads * sizeof(unsigned int); + + HIP_CHECK(hipMalloc((void**)&device_mbcnt_lo, buffer_size)); + HIP_CHECK(hipMalloc((void**)&device_mbcnt_hi, buffer_size)); + HIP_CHECK(hipMalloc((void**)&device_lane_id, buffer_size)); + + hipLaunchKernelGGL(HIP_kernel, dim3(num_blocks), dim3(num_threads_per_block), 0, 0, + device_mbcnt_lo, device_mbcnt_hi, device_lane_id); + + unsigned int* host_mbcnt_lo = (unsigned int*)malloc(buffer_size); + unsigned int* host_mbcnt_hi = (unsigned int*)malloc(buffer_size); + unsigned int* host_lane_id = (unsigned int*)malloc(buffer_size); + + HIP_CHECK(hipMemcpy(host_mbcnt_lo, device_mbcnt_lo, buffer_size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(host_mbcnt_hi, device_mbcnt_hi, buffer_size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(host_lane_id, device_lane_id, buffer_size, hipMemcpyDeviceToHost)); + + // verify the results + int mbcnt_lo_errors = 0; + int mbcnt_hi_errors = 0; + int lane_id_errors = 0; + for (unsigned int i = 0; i < num_threads; i++) { + unsigned int this_lane_id = i % wave_size; + unsigned int this_mbcnt_lo = this_lane_id >= 32 ? 32 : this_lane_id; + unsigned int this_mbcnt_hi = this_lane_id < 32 ? 0 : (this_lane_id - 32); + + if (host_mbcnt_lo[i] != this_mbcnt_lo) mbcnt_lo_errors++; + + if (host_mbcnt_hi[i] != this_mbcnt_hi) mbcnt_hi_errors++; + + if (host_lane_id[i] != this_lane_id) lane_id_errors++; + } + + + HIP_CHECK(hipFree(device_mbcnt_lo)); + HIP_CHECK(hipFree(device_mbcnt_hi)); + HIP_CHECK(hipFree(device_lane_id)); + + free(host_mbcnt_lo); + free(host_mbcnt_hi); + free(host_lane_id); + + REQUIRE(mbcnt_lo_errors == 0); + REQUIRE(mbcnt_hi_errors == 0); + REQUIRE(lane_id_errors == 0); +} \ No newline at end of file diff --git a/catch/unit/deviceLib/popc.cc b/catch/unit/deviceLib/popc.cc new file mode 100644 index 0000000000..e84d715ede --- /dev/null +++ b/catch/unit/deviceLib/popc.cc @@ -0,0 +1,136 @@ +/* +Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include + +constexpr int WIDTH = 16; +constexpr int HEIGHT = 16; +constexpr int NUM = WIDTH * HEIGHT; + +constexpr int THREADS_PER_BLOCK_X = 8; +constexpr int THREADS_PER_BLOCK_Y = 8; + +// CPU implementation of popcount +template unsigned int popcountCPU(T value) { + unsigned int ret = 0; + while (value) { + if (value & 0x1) ++ret; + value >>= 1; + } + return ret; +} + +__global__ void HIP_kernel(unsigned int* a, unsigned int* b, unsigned int* c, + unsigned long long int* d, int width, int height) { + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; + + int i = y * width + x; + if (i < (width * height)) { + a[i] = __popc(b[i]); + c[i] = __popcll(d[i]); + } +} + + +TEST_CASE("Unit_popc") { + using namespace std; + unsigned int* hostA; + unsigned int* hostB; + unsigned int* hostC; + unsigned long long int* hostD; + + unsigned int* deviceA; + unsigned int* deviceB; + unsigned int* deviceC; + unsigned long long int* deviceD; + + hipDeviceProp_t devProp; + HIP_CHECK(hipGetDeviceProperties(&devProp, 0)); + INFO("System minor : " << devProp.minor); + INFO("System major : " << devProp.major); + INFO("agent prop name : " << devProp.name); + + INFO("hip Device prop succeeded"); + + + int i; + int errors; + + hostA = (unsigned int*)malloc(NUM * sizeof(unsigned int)); + hostB = (unsigned int*)malloc(NUM * sizeof(unsigned int)); + hostC = (unsigned int*)malloc(NUM * sizeof(unsigned int)); + hostD = (unsigned long long int*)malloc(NUM * sizeof(unsigned long long int)); + + // initialize the input data + for (i = 0; i < NUM; i++) { + hostB[i] = i; + hostD[i] = 1099511627776 - i; + } + + HIP_CHECK(hipMalloc((void**)&deviceA, NUM * sizeof(unsigned int))); + HIP_CHECK(hipMalloc((void**)&deviceB, NUM * sizeof(unsigned int))); + HIP_CHECK(hipMalloc((void**)&deviceC, NUM * sizeof(unsigned int))); + HIP_CHECK(hipMalloc((void**)&deviceD, NUM * sizeof(unsigned long long int))); + + HIP_CHECK(hipMemcpy(deviceB, hostB, NUM * sizeof(unsigned int), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(deviceD, hostD, NUM * sizeof(unsigned long long int), hipMemcpyHostToDevice)); + + + hipLaunchKernelGGL(HIP_kernel, dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA, deviceB, + deviceC, deviceD, WIDTH, HEIGHT); + + + HIP_CHECK(hipMemcpy(hostA, deviceA, NUM * sizeof(unsigned int), hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(hostC, deviceC, NUM * sizeof(unsigned int), hipMemcpyDeviceToHost)); + + // verify the results + errors = 0; + for (i = 0; i < NUM; i++) { + if (hostA[i] != popcountCPU(hostB[i])) { + errors++; + } + } + + CHECK(errors == 0); + + for (i = 0; i < NUM; i++) { + if (hostC[i] != popcountCPU(hostD[i])) { + errors++; + } + } + + + HIP_CHECK(hipFree(deviceA)); + HIP_CHECK(hipFree(deviceB)); + HIP_CHECK(hipFree(deviceC)); + HIP_CHECK(hipFree(deviceD)); + + free(hostA); + free(hostB); + free(hostC); + free(hostD); + + REQUIRE(errors == 0); +} \ No newline at end of file diff --git a/catch/unit/deviceLib/syncthreadsand.cc b/catch/unit/deviceLib/syncthreadsand.cc new file mode 100644 index 0000000000..83c2b21f8f --- /dev/null +++ b/catch/unit/deviceLib/syncthreadsand.cc @@ -0,0 +1,127 @@ +/* +Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include + + +static __global__ void kernel_syncthreads_and(int* syncTestD, int* allThreadsZeroD, + int* allThreadsOneD, int* oneThreadZeroD, + int* allThreadsMinusOneD) { + int blockSize = blockDim.x; + int predicate = 10; + // First block index starts with 0, and second block index starts + // with blockSize + int i = (blockIdx.x == 0) ? threadIdx.x : blockSize + threadIdx.x; + + // At very first, we need to ensure work-group level syncronization + // properly happened, don't bother about predicate testing for now. + // Thread 0 and thread 1 writes to shared memory. After call to api, + // every thread reads shared memory, and store product for verification + __shared__ int sm[2]; + if (threadIdx.x == 0) + sm[0] = 10; + else if (threadIdx.x == 1) + sm[1] = 20; + __syncthreads_and(predicate); + syncTestD[i] = sm[0] * sm[1]; + + // All threads pass 0 as predicate value, result should be 0 + predicate = 0; + allThreadsZeroD[i] = __syncthreads_and(predicate); + + // All threads pass 1 as predicate value, result should be 1 + predicate = 1; + allThreadsOneD[i] = __syncthreads_and(predicate); + + // Thread 0 pass 0, and all other threads 1 as predicate value, + // result should be 0 + predicate = (threadIdx.x == 0) ? 0 : 1; + oneThreadZeroD[i] = __syncthreads_and(predicate); + + // All threads pass -1 as predicate value, result should be 1 + predicate = -1; + allThreadsMinusOneD[i] = __syncthreads_and(predicate); +} + +static void test_syncthreads_and(int blockSize) { + int nBytes = sizeof(int) * 2 * blockSize; + int *syncTestD, *syncTestH; + int *allThreadsZeroD, *allThreadsZeroH; + int *allThreadsOneD, *allThreadsOneH; + int *oneThreadZeroD, *oneThreadZeroH; + int *allThreadsMinusOneD, *allThreadsMinusOneH; + + // Allocate device memory + HIP_CHECK(hipMalloc((void**)&syncTestD, nBytes)); + HIP_CHECK(hipMalloc((void**)&allThreadsZeroD, nBytes)); + HIP_CHECK(hipMalloc((void**)&allThreadsOneD, nBytes)); + HIP_CHECK(hipMalloc((void**)&oneThreadZeroD, nBytes)); + HIP_CHECK(hipMalloc((void**)&allThreadsMinusOneD, nBytes)); + + // Allocate host memory + HIP_CHECK(hipHostMalloc((void**)&syncTestH, nBytes)); + HIP_CHECK(hipHostMalloc((void**)&allThreadsZeroH, nBytes)); + HIP_CHECK(hipHostMalloc((void**)&allThreadsOneH, nBytes)); + HIP_CHECK(hipHostMalloc((void**)&oneThreadZeroH, nBytes)); + HIP_CHECK(hipHostMalloc((void**)&allThreadsMinusOneH, nBytes)); + + // Launch Kernel + hipLaunchKernelGGL(kernel_syncthreads_and, 2, blockSize, 0, 0, syncTestD, allThreadsZeroD, + allThreadsOneD, oneThreadZeroD, allThreadsMinusOneD); + + // Copy result from device to host + HIP_CHECK(hipMemcpy(syncTestH, syncTestD, nBytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(allThreadsZeroH, allThreadsZeroD, nBytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(allThreadsOneH, allThreadsOneD, nBytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(oneThreadZeroH, oneThreadZeroD, nBytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(allThreadsMinusOneH, allThreadsMinusOneD, nBytes, hipMemcpyDeviceToHost)); + + // Validate results for both blocks together + for (int i = 0; i < 2 * blockSize; ++i) { + REQUIRE(syncTestH[i] == 200); + REQUIRE(allThreadsZeroH[i] == 0); + REQUIRE(allThreadsOneH[i] == 1); + REQUIRE(oneThreadZeroH[i] == 0); + REQUIRE(allThreadsMinusOneH[i] == 1); + } + + // Free device memory + HIP_CHECK(hipFree(syncTestD)); + HIP_CHECK(hipFree(allThreadsZeroD)); + HIP_CHECK(hipFree(allThreadsOneD)); + HIP_CHECK(hipFree(oneThreadZeroD)); + HIP_CHECK(hipFree(allThreadsMinusOneD)); + + // Free host memory + HIP_CHECK(hipHostFree(syncTestH)); + HIP_CHECK(hipHostFree(allThreadsZeroH)); + HIP_CHECK(hipHostFree(allThreadsOneH)); + HIP_CHECK(hipHostFree(oneThreadZeroH)); + HIP_CHECK(hipHostFree(allThreadsMinusOneH)); +} + +TEST_CASE("Unit_syncthreads_and") { + int blockSizes[] = {10, 40, 70, 130, 240, 723, 32, 64, 128, 256, 512, 1024}; + for (unsigned long i = 0; i < (sizeof(blockSizes) / sizeof(blockSizes[0])); ++i) + test_syncthreads_and(blockSizes[i]); +} diff --git a/catch/unit/deviceLib/syncthreadscount.cc b/catch/unit/deviceLib/syncthreadscount.cc new file mode 100644 index 0000000000..ec8140ccac --- /dev/null +++ b/catch/unit/deviceLib/syncthreadscount.cc @@ -0,0 +1,138 @@ +/* +Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include + + +static __global__ void kernel_syncthreads_count(int* syncTestD, int* allThreadsZeroD, + int* allThreadsOneD, int* oddThreadsOneD, + int* allThreadsMinusOneD, int* allThreadsIdD) { + int blockSize = blockDim.x; + int predicate = 10; + // First block index starts with 0, and second block index starts + // with blockSize + int i = (blockIdx.x == 0) ? threadIdx.x : blockSize + threadIdx.x; + + // At very first, we need to ensure work-group level syncronization + // properly happened, don't bother about predicate testing for now. + // Thread 0 and thread 1 writes to shared memory. After call to api, + // every thread reads shared memory, and store sum for verification + __shared__ int sm[2]; + if (threadIdx.x == 0) + sm[0] = 10; + else if (threadIdx.x == 1) + sm[1] = 20; + __syncthreads_count(predicate); + syncTestD[i] = sm[0] + sm[1]; + + // All threads pass 0 as predicate value, result should be 0 + predicate = 0; + allThreadsZeroD[i] = __syncthreads_count(predicate); + + // All threads pass 1 as predicate value, result should be blockSize + predicate = 1; + allThreadsOneD[i] = __syncthreads_count(predicate); + + // Odd numbered threads pass 1, and even numbered threads pass 0, as + // predicate value, result should be blockSize / 2 + predicate = threadIdx.x % 2; + oddThreadsOneD[i] = __syncthreads_count(predicate); + + // All threads pass -1 as predicate value, result should blockSize + predicate = -1; + allThreadsMinusOneD[i] = __syncthreads_count(predicate); + + // Each thread pass its ID as predicate value, result should be blockSize - 1 + predicate = threadIdx.x; + allThreadsIdD[i] = __syncthreads_count(predicate); +} + +void test_syncthreads_count(int blockSize) { + int nBytes = sizeof(int) * 2 * blockSize; + int *syncTestD, *syncTestH; + int *allThreadsZeroD, *allThreadsZeroH; + int *allThreadsOneD, *allThreadsOneH; + int *oddThreadsOneD, *oddThreadsOneH; + int *allThreadsMinusOneD, *allThreadsMinusOneH; + int *allThreadsIdD, *allThreadsIdH; + + // Allocate device memory + HIP_CHECK(hipMalloc((void**)&syncTestD, nBytes)); + HIP_CHECK(hipMalloc((void**)&allThreadsZeroD, nBytes)); + HIP_CHECK(hipMalloc((void**)&allThreadsOneD, nBytes)); + HIP_CHECK(hipMalloc((void**)&oddThreadsOneD, nBytes)); + HIP_CHECK(hipMalloc((void**)&allThreadsMinusOneD, nBytes)); + HIP_CHECK(hipMalloc((void**)&allThreadsIdD, nBytes)); + + // Allocate host memory + HIP_CHECK(hipHostMalloc((void**)&syncTestH, nBytes)); + HIP_CHECK(hipHostMalloc((void**)&allThreadsZeroH, nBytes)); + HIP_CHECK(hipHostMalloc((void**)&allThreadsOneH, nBytes)); + HIP_CHECK(hipHostMalloc((void**)&oddThreadsOneH, nBytes)); + HIP_CHECK(hipHostMalloc((void**)&allThreadsMinusOneH, nBytes)); + HIP_CHECK(hipHostMalloc((void**)&allThreadsIdH, nBytes)); + + // Launch Kernel + hipLaunchKernelGGL(kernel_syncthreads_count, 2, blockSize, 0, 0, syncTestD, allThreadsZeroD, + allThreadsOneD, oddThreadsOneD, allThreadsMinusOneD, allThreadsIdD); + + // Copy result from device to host + HIP_CHECK(hipMemcpy(syncTestH, syncTestD, nBytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(allThreadsZeroH, allThreadsZeroD, nBytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(allThreadsOneH, allThreadsOneD, nBytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(oddThreadsOneH, oddThreadsOneD, nBytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(allThreadsMinusOneH, allThreadsMinusOneD, nBytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(allThreadsIdH, allThreadsIdD, nBytes, hipMemcpyDeviceToHost)); + + // Validate results for both the blocks together + for (int i = 0; i < 2 * blockSize; ++i) { + REQUIRE(syncTestH[i] == 30); + REQUIRE(allThreadsZeroH[i] == 0); + REQUIRE(allThreadsOneH[i] == blockSize); + REQUIRE(oddThreadsOneH[i] == blockSize / 2); + REQUIRE(allThreadsMinusOneH[i] == blockSize); + REQUIRE(allThreadsIdH[i] == (blockSize - 1)); + } + + // Free device memory + HIP_CHECK(hipFree(syncTestD)); + HIP_CHECK(hipFree(allThreadsZeroD)); + HIP_CHECK(hipFree(allThreadsOneD)); + HIP_CHECK(hipFree(oddThreadsOneD)); + HIP_CHECK(hipFree(allThreadsMinusOneD)); + HIP_CHECK(hipFree(allThreadsIdD)); + + // Free host memory + HIP_CHECK(hipHostFree(syncTestH)); + HIP_CHECK(hipHostFree(allThreadsZeroH)); + HIP_CHECK(hipHostFree(allThreadsOneH)); + HIP_CHECK(hipHostFree(oddThreadsOneH)); + HIP_CHECK(hipHostFree(allThreadsMinusOneH)); + HIP_CHECK(hipHostFree(allThreadsIdH)); +} + +TEST_CASE("Unit_syncthreads_count") { + int blockSizes[] = {10, 40, 70, 130, 240, 723, 32, 64, 128, 256, 512, 1024}; + for (unsigned long i = 0; i < (sizeof(blockSizes) / sizeof(blockSizes[0])); ++i) + test_syncthreads_count(blockSizes[i]); +} diff --git a/catch/unit/deviceLib/syncthreadsor.cc b/catch/unit/deviceLib/syncthreadsor.cc new file mode 100644 index 0000000000..5b67432357 --- /dev/null +++ b/catch/unit/deviceLib/syncthreadsor.cc @@ -0,0 +1,128 @@ +/* +Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include + + +static __global__ void kernel_syncthreads_or(int* syncTestD, int* allThreadsZeroD, + int* allThreadsOneD, int* oneThreadOneD, + int* allThreadsMinusOneD) { + int blockSize = blockDim.x; + int predicate = 10; + // First block index starts with 0, and second block index starts + // with blockSize + int i = (blockIdx.x == 0) ? threadIdx.x : blockSize + threadIdx.x; + + // At very first, we need to ensure work-group level syncronization + // properly happened, don't bother about predicate testing for now. + // Thread 0 and thread 1 writes to shared memory. After call to api, + // every thread reads shared memory, and store subtraction for verification + __shared__ int sm[2]; + if (threadIdx.x == 0) + sm[0] = 10; + else if (threadIdx.x == 1) + sm[1] = 20; + __syncthreads_or(predicate); + syncTestD[i] = sm[1] - sm[0]; + + // All threads pass 0 as predicate value, result should be 0 + predicate = 0; + allThreadsZeroD[i] = __syncthreads_or(predicate); + + // All threads pass 1 as predicate value, result should be 1 + predicate = 1; + allThreadsOneD[i] = __syncthreads_or(predicate); + + // Thread 0 pass 1, and all other threads 0 as predicate value, + // result should be 1 + predicate = (threadIdx.x == 0) ? 1 : 0; + oneThreadOneD[i] = __syncthreads_or(predicate); + + // All threads pass -1 as predicate value, result should be 1 + predicate = -1; + allThreadsMinusOneD[i] = __syncthreads_or(predicate); +} + +static void test_syncthreads_or(int blockSize) { + int nBytes = sizeof(int) * 2 * blockSize; + int *syncTestD, *syncTestH; + int *allThreadsZeroD, *allThreadsZeroH; + int *allThreadsOneD, *allThreadsOneH; + int *oneThreadOneD, *oneThreadOneH; + int *allThreadsMinusOneD, *allThreadsMinusOneH; + + // Allocate device memory + HIP_CHECK(hipMalloc((void**)&syncTestD, nBytes)); + HIP_CHECK(hipMalloc((void**)&allThreadsZeroD, nBytes)); + HIP_CHECK(hipMalloc((void**)&allThreadsOneD, nBytes)); + HIP_CHECK(hipMalloc((void**)&oneThreadOneD, nBytes)); + HIP_CHECK(hipMalloc((void**)&allThreadsMinusOneD, nBytes)); + + // Allocate host memory + HIP_CHECK(hipHostMalloc((void**)&syncTestH, nBytes)); + HIP_CHECK(hipHostMalloc((void**)&allThreadsZeroH, nBytes)); + HIP_CHECK(hipHostMalloc((void**)&allThreadsOneH, nBytes)); + HIP_CHECK(hipHostMalloc((void**)&oneThreadOneH, nBytes)); + HIP_CHECK(hipHostMalloc((void**)&allThreadsMinusOneH, nBytes)); + + // Launch Kernel + hipLaunchKernelGGL(kernel_syncthreads_or, 2, blockSize, 0, 0, syncTestD, allThreadsZeroD, + allThreadsOneD, oneThreadOneD, allThreadsMinusOneD); + + // Copy result from device to host + HIP_CHECK(hipMemcpy(syncTestH, syncTestD, nBytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(allThreadsZeroH, allThreadsZeroD, nBytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(allThreadsOneH, allThreadsOneD, nBytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(oneThreadOneH, oneThreadOneD, nBytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(allThreadsMinusOneH, allThreadsMinusOneD, nBytes, hipMemcpyDeviceToHost)); + + // Validate results for both blocks together + for (int i = 0; i < 2 * blockSize; ++i) { + REQUIRE(syncTestH[i] == 10); + REQUIRE(allThreadsZeroH[i] == 0); + REQUIRE(allThreadsOneH[i] == 1); + REQUIRE(oneThreadOneH[i] == 1); + REQUIRE(allThreadsMinusOneH[i] == 1); + } + + // Free device memory + HIP_CHECK(hipFree(syncTestD)); + HIP_CHECK(hipFree(allThreadsZeroD)); + HIP_CHECK(hipFree(allThreadsOneD)); + HIP_CHECK(hipFree(oneThreadOneD)); + HIP_CHECK(hipFree(allThreadsMinusOneD)); + + // Free host memory + HIP_CHECK(hipHostFree(syncTestH)); + HIP_CHECK(hipHostFree(allThreadsZeroH)); + HIP_CHECK(hipHostFree(allThreadsOneH)); + HIP_CHECK(hipHostFree(oneThreadOneH)); + HIP_CHECK(hipHostFree(allThreadsMinusOneH)); +} + + +TEST_CASE("Unit_syncthreads_or") { + int blockSizes[] = {10, 40, 70, 130, 240, 723, 32, 64, 128, 256, 512, 1024}; + for (unsigned long i = 0; i < (sizeof(blockSizes) / sizeof(blockSizes[0])); ++i) + test_syncthreads_or(blockSizes[i]); +} diff --git a/catch/unit/deviceLib/threadfence_system.cc b/catch/unit/deviceLib/threadfence_system.cc new file mode 100644 index 0000000000..a90c511ed7 --- /dev/null +++ b/catch/unit/deviceLib/threadfence_system.cc @@ -0,0 +1,112 @@ +/* +Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include + +#include +#include + +__host__ __device__ void fence_system() { +#ifdef __HIP_DEVICE_COMPILE__ + __threadfence_system(); +#else + std::atomic_thread_fence(std::memory_order_seq_cst); +#endif +} + +__host__ __device__ void round_robin(const int id, const int num_dev, const int num_iter, + volatile int* data, volatile int* flag) { + for (int i = 0; i < num_iter; i++) { + while (*flag % num_dev != id) fence_system(); // invalid the cache for read + + (*data)++; + fence_system(); // make sure the store to data is sequenced before the store to flag + (*flag)++; + fence_system(); // invalid the cache to flush out flag + } +} + +__global__ void gpu_round_robin(const int id, const int num_dev, const int num_iter, + volatile int* data, volatile int* flag) { + round_robin(id, num_dev, num_iter, data, flag); +} + +TEST_CASE("Unit_threadfence_system") { + int num_gpus = 0; + HIP_CHECK(hipGetDeviceCount(&num_gpus)); + REQUIRE(num_gpus > 0); + + volatile int* data; + if (hipHostMalloc(&data, sizeof(int), hipHostMallocCoherent) != hipSuccess) { + SUCCEED("Memory allocation failed. Skip test. Is SVM atomic supported?"); + } + + constexpr int init_data = 1000; + *data = init_data; + + volatile int* flag; + if (hipHostMalloc(&flag, sizeof(int), hipHostMallocCoherent) != hipSuccess) { + SUCCEED("Memory allocation failed. Skip test. Is SVM atomic supported?"); + } + *flag = 0; + + // number of rounds per device + constexpr int num_iter = 1000; + + // one CPU thread + 1 kernel/GPU + const int num_dev = num_gpus + 1; + + int next_id = 0; + std::vector threads; + + // create a CPU thread for the round_robin + threads.push_back(std::thread(round_robin, next_id++, num_dev, num_iter, data, flag)); + + // run one thread per GPU + dim3 dim_block(1, 1, 1); + dim3 dim_grid(1, 1, 1); + + // launch one kernel per device for the round robin + for (; next_id < num_dev; ++next_id) { + threads.push_back(std::thread([=]() { + HIP_CHECK(hipSetDevice(next_id - 1)); + hipLaunchKernelGGL(gpu_round_robin, dim_grid, dim_block, 0, 0x0, next_id, num_dev, num_iter, + data, flag); + HIP_CHECK(hipDeviceSynchronize()); + })); + } + + for (auto& t : threads) { + t.join(); + } + + int expected_data = init_data + num_dev * num_iter; + int expected_flag = num_dev * num_iter; + + bool passed = *data == expected_data && *flag == expected_flag; + + HIP_CHECK(hipHostFree((void*)data)); + HIP_CHECK(hipHostFree((void*)flag)); + + REQUIRE(passed == true); +}