From 089ab3b9474a7643f4bd85493335def68f8db4d8 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Sat, 2 Jun 2018 12:27:17 +0100 Subject: [PATCH 1/3] Switch the atomic implementation to use Clang builtins. --- include/hip/hcc_detail/device_functions.h | 3 +- include/hip/hcc_detail/hip_atomic.h | 265 ++++++++++++++++ include/hip/hcc_detail/hip_runtime.h | 77 +---- include/hip/hip_runtime.h | 2 +- src/device_util.cpp | 123 -------- tests/src/deviceLib/hipSimpleAtomicsTest.cpp | 316 +++++++++++-------- 6 files changed, 447 insertions(+), 339 deletions(-) create mode 100644 include/hip/hcc_detail/hip_atomic.h diff --git a/include/hip/hcc_detail/device_functions.h b/include/hip/hcc_detail/device_functions.h index 28d874b27a..20a365ebbe 100644 --- a/include/hip/hcc_detail/device_functions.h +++ b/include/hip/hcc_detail/device_functions.h @@ -23,7 +23,8 @@ THE SOFTWARE. #ifndef HIP_INCLUDE_HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H #define HIP_INCLUDE_HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H -#include +#include "host_defines.h" + #include diff --git a/include/hip/hcc_detail/hip_atomic.h b/include/hip/hcc_detail/hip_atomic.h new file mode 100644 index 0000000000..4af1794ba0 --- /dev/null +++ b/include/hip/hcc_detail/hip_atomic.h @@ -0,0 +1,265 @@ +#pragma once + +#include "device_functions.h" + +__device__ +inline +int atomicCAS(int* address, int compare, int val) +{ + __atomic_compare_exchange_n( + address, &compare, val, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED); + + return compare; +} +__device__ +inline +unsigned int atomicCAS( + unsigned int* address, unsigned int compare, unsigned int val) +{ + __atomic_compare_exchange_n( + address, &compare, val, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED); + + return compare; +} +__device__ +inline +unsigned long long atomicCAS( + unsigned long long* address, + unsigned long long compare, + unsigned long long val) +{ + __atomic_compare_exchange_n( + address, &compare, val, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED); + + return compare; +} + +__device__ +inline +int atomicAdd(int* address, int val) +{ + return __atomic_fetch_add(address, val, __ATOMIC_RELAXED); +} +__device__ +inline +unsigned int atomicAdd(unsigned int* address, unsigned int val) +{ + return __atomic_fetch_add(address, val, __ATOMIC_RELAXED); +} +__device__ +inline +unsigned long long atomicAdd( + unsigned long long* address, unsigned long long val) +{ + return __atomic_fetch_add(address, val, __ATOMIC_RELAXED); +} +__device__ +inline +float atomicAdd(float* address, float val) +{ + unsigned int* uaddr{reinterpret_cast(uaddr)}; + unsigned int old{__atomic_load_n(uaddr, __ATOMIC_RELAXED)}; + unsigned int r; + + do { + r = old; + old = atomicCAS(uaddr, r, __float_as_uint(val + __uint_as_float(r))); + } while (r != old); + + return __uint_as_float(r); +} +__device__ +inline +double atomicAdd(double* address, double val) +{ + unsigned long long* uaddr{reinterpret_cast(uaddr)}; + unsigned long long old{__atomic_load_n(uaddr, __ATOMIC_RELAXED)}; + unsigned long long r; + + do { + r = old; + old = atomicCAS( + uaddr, r, __double_as_longlong(val + __longlong_as_double(r))); + } while (r != old); + + return __longlong_as_double(r); +} + +__device__ +inline +int atomicSub(int* address, int val) +{ + return __atomic_fetch_sub(address, val, __ATOMIC_RELAXED); +} +__device__ +inline +unsigned int atomicSub(unsigned int* address, unsigned int val) +{ + return __atomic_fetch_sub(address, val, __ATOMIC_RELAXED); +} + +__device__ +inline +int atomicExch(int* address, int val) +{ + return __atomic_exchange_n(address, val, __ATOMIC_RELAXED); +} +__device__ +inline +unsigned int atomicExch(unsigned int* address, unsigned int val) +{ + return __atomic_exchange_n(address, val, __ATOMIC_RELAXED); +} +__device__ +inline +unsigned int atomicExch(unsigned long long* address, unsigned long long val) +{ + return __atomic_exchange_n(address, val, __ATOMIC_RELAXED); +} +__device__ +inline +float atomicExch(float* address, float val) +{ + return __uint_as_float(__atomic_exchange_n( + reinterpret_cast(address), + __float_as_uint(val), + __ATOMIC_RELAXED)); +} + +__device__ +inline +int atomicMin(int* address, int val) +{ + return __sync_fetch_and_min(address, val); +} +__device__ +inline +unsigned int atomicMin(unsigned int* address, unsigned int val) +{ + return __sync_fetch_and_umin(address, val); +} +__device__ +inline +unsigned long long atomicMin( + unsigned long long* address, unsigned long long val) +{ + unsigned long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)}; + while (val < tmp) { tmp = atomicCAS(address, tmp, val); } + + return tmp; +} + +__device__ +inline +int atomicMax(int* address, int val) +{ + return __sync_fetch_and_max(address, val); +} +__device__ +inline +unsigned int atomicMax(unsigned int* address, unsigned int val) +{ + return __sync_fetch_and_umax(address, val); +} +__device__ +inline +unsigned long long atomicMax( + unsigned long long* address, unsigned long long val) +{ + unsigned long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)}; + while (tmp < val) { tmp = atomicCAS(address, tmp, val); } + + return tmp; +} + +__device__ +inline +unsigned int atomicInc(unsigned int* address, unsigned int val) +{ + extern unsigned int __builtin_amdgcn_atomic_inc( + unsigned int*, + unsigned int, + unsigned int, + unsigned int, + bool) __asm("llvm.amdgcn.atomic.inc.i32.p0i32"); + + return __builtin_amdgcn_atomic_inc( + address, val, __ATOMIC_RELAXED, 1 /* Device scope */, false); +} + +__device__ +inline +unsigned int atomicDec(unsigned int* address, unsigned int val) +{ + extern unsigned int __builtin_amdgcn_atomic_dec( + unsigned int*, + unsigned int, + unsigned int, + unsigned int, + bool) __asm("llvm.amdgcn.atomic.dec.i32.p0i32"); + + return __builtin_amdgcn_atomic_dec( + address, val, __ATOMIC_RELAXED, 1 /* Device scope */, false); +} + +__device__ +inline +int atomicAnd(int* address, int val) +{ + return __atomic_fetch_and(address, val, __ATOMIC_RELAXED); +} +__device__ +inline +unsigned int atomicAnd(unsigned int* address, unsigned int val) +{ + return __atomic_fetch_and(address, val, __ATOMIC_RELAXED); +} +__device__ +inline +unsigned long long atomicAnd( + unsigned long long* address, unsigned long long val) +{ + return __atomic_fetch_and(address, val, __ATOMIC_RELAXED); +} + +__device__ +inline +int atomicOr(int* address, int val) +{ + return __atomic_fetch_or(address, val, __ATOMIC_RELAXED); +} +__device__ +inline +unsigned int atomicOr(unsigned int* address, unsigned int val) +{ + return __atomic_fetch_or(address, val, __ATOMIC_RELAXED); +} +__device__ +inline +unsigned long long atomicOr( + unsigned long long* address, unsigned long long val) +{ + return __atomic_fetch_or(address, val, __ATOMIC_RELAXED); +} + +__device__ +inline +int atomicXor(int* address, int val) +{ + return __atomic_fetch_xor(address, val, __ATOMIC_RELAXED); +} +__device__ +inline +unsigned int atomicXor(unsigned int* address, unsigned int val) +{ + return __atomic_fetch_xor(address, val, __ATOMIC_RELAXED); +} +__device__ +inline +unsigned long long atomicXor( + unsigned long long* address, unsigned long long val) +{ + return __atomic_fetch_xor(address, val, __ATOMIC_RELAXED); +} + +// TODO: add scoped atomics i.e. atomic{*}_system && atomic{*}_block. \ No newline at end of file diff --git a/include/hip/hcc_detail/hip_runtime.h b/include/hip/hcc_detail/hip_runtime.h index 92f06e9174..c77b4b85bd 100644 --- a/include/hip/hcc_detail/hip_runtime.h +++ b/include/hip/hcc_detail/hip_runtime.h @@ -88,6 +88,7 @@ extern int HIP_TRACE_API; #ifdef __cplusplus #include #endif +#include #include #include #include @@ -175,82 +176,6 @@ __device__ clock_t clock(); // abort __device__ void abort(); -// atomicAdd() -__device__ int atomicAdd(int* address, int val); -__device__ unsigned int atomicAdd(unsigned int* address, unsigned int val); - -__device__ unsigned long long int atomicAdd(unsigned long long int* address, - unsigned long long int val); - -__device__ float atomicAdd(float* address, float val); - - -// atomicSub() -__device__ int atomicSub(int* address, int val); - -__device__ unsigned int atomicSub(unsigned int* address, unsigned int val); - - -// atomicExch() -__device__ int atomicExch(int* address, int val); - -__device__ unsigned int atomicExch(unsigned int* address, unsigned int val); - -__device__ unsigned long long int atomicExch(unsigned long long int* address, - unsigned long long int val); - -__device__ float atomicExch(float* address, float val); - - -// atomicMin() -__device__ int atomicMin(int* address, int val); -__device__ unsigned int atomicMin(unsigned int* address, unsigned int val); -__device__ unsigned long long int atomicMin(unsigned long long int* address, - unsigned long long int val); - - -// atomicMax() -__device__ int atomicMax(int* address, int val); -__device__ unsigned int atomicMax(unsigned int* address, unsigned int val); -__device__ unsigned long long int atomicMax(unsigned long long int* address, - unsigned long long int val); - - -// atomicCAS() -__device__ int atomicCAS(int* address, int compare, int val); -__device__ unsigned int atomicCAS(unsigned int* address, unsigned int compare, unsigned int val); -__device__ unsigned long long int atomicCAS(unsigned long long int* address, - unsigned long long int compare, - unsigned long long int val); - - -// atomicAnd() -__device__ int atomicAnd(int* address, int val); -__device__ unsigned int atomicAnd(unsigned int* address, unsigned int val); -__device__ unsigned long long int atomicAnd(unsigned long long int* address, - unsigned long long int val); - - -// atomicOr() -__device__ int atomicOr(int* address, int val); -__device__ unsigned int atomicOr(unsigned int* address, unsigned int val); -__device__ unsigned long long int atomicOr(unsigned long long int* address, - unsigned long long int val); - - -// atomicXor() -__device__ int atomicXor(int* address, int val); -__device__ unsigned int atomicXor(unsigned int* address, unsigned int val); -__device__ unsigned long long int atomicXor(unsigned long long int* address, - unsigned long long int val); - -// atomicInc() -__device__ unsigned int atomicInc(unsigned int* address, unsigned int val); - - -// atomicDec() -__device__ unsigned int atomicDec(unsigned int* address, unsigned int val); - // warp vote function __all __any __ballot __device__ int __all(int input); __device__ int __any(int input); diff --git a/include/hip/hip_runtime.h b/include/hip/hip_runtime.h index 157fc88a43..937ba61ecf 100644 --- a/include/hip/hip_runtime.h +++ b/include/hip/hip_runtime.h @@ -64,4 +64,4 @@ THE SOFTWARE. #include #include -#endif +#endif \ No newline at end of file diff --git a/src/device_util.cpp b/src/device_util.cpp index 5ce014b2b9..11f992a510 100644 --- a/src/device_util.cpp +++ b/src/device_util.cpp @@ -761,129 +761,6 @@ __device__ clock_t clock() { return (clock_t)hc::__cycle_u64(); }; // abort __device__ void abort() { return hc::abort(); } -// atomicAdd() -__device__ int atomicAdd(int* address, int val) { return hc::atomic_fetch_add(address, val); } -__device__ unsigned int atomicAdd(unsigned int* address, unsigned int val) { - return hc::atomic_fetch_add(address, val); -} -__device__ unsigned long long int atomicAdd(unsigned long long int* address, - unsigned long long int val) { - return (long long int)hc::atomic_fetch_add((uint64_t*)address, (uint64_t)val); -} -__device__ float atomicAdd(float* address, float val) { return hc::atomic_fetch_add(address, val); } - -// atomicSub() -__device__ int atomicSub(int* address, int val) { return hc::atomic_fetch_sub(address, val); } -__device__ unsigned int atomicSub(unsigned int* address, unsigned int val) { - return hc::atomic_fetch_sub(address, val); -} - -// atomicExch() -__device__ int atomicExch(int* address, int val) { return hc::atomic_exchange(address, val); } -__device__ unsigned int atomicExch(unsigned int* address, unsigned int val) { - return hc::atomic_exchange(address, val); -} -__device__ unsigned long long int atomicExch(unsigned long long int* address, - unsigned long long int val) { - return (long long int)hc::atomic_exchange((uint64_t*)address, (uint64_t)val); -} -__device__ float atomicExch(float* address, float val) { return hc::atomic_exchange(address, val); } - -// atomicMin() -__device__ int atomicMin(int* address, int val) { return hc::atomic_fetch_min(address, val); } -__device__ unsigned int atomicMin(unsigned int* address, unsigned int val) { - return hc::atomic_fetch_min(address, val); -} -__device__ unsigned long long int atomicMin(unsigned long long int* address, - unsigned long long int val) { - return (long long int)hc::atomic_fetch_min((uint64_t*)address, (uint64_t)val); -} - -// atomicMax() -__device__ int atomicMax(int* address, int val) { return hc::atomic_fetch_max(address, val); } -__device__ unsigned int atomicMax(unsigned int* address, unsigned int val) { - return hc::atomic_fetch_max(address, val); -} -__device__ unsigned long long int atomicMax(unsigned long long int* address, - unsigned long long int val) { - return (long long int)hc::atomic_fetch_max((uint64_t*)address, (uint64_t)val); -} - -// atomicCAS() -template -__device__ T atomicCAS_impl(T* address, T compare, T val) { - // the implementation assumes the atomic is lock-free and - // has the same size as the non-atmoic equivalent type - static_assert(sizeof(T) == sizeof(std::atomic), - "size mismatch between atomic and non-atomic types"); - - union { - T* address; - std::atomic* atomic_address; - } u; - u.address = address; - - T expected = compare; - - // hcc should generate a system scope atomic CAS - std::atomic_compare_exchange_weak_explicit( - u.atomic_address, &expected, val, std::memory_order_acq_rel, std::memory_order_relaxed); - return expected; -} - -__device__ int atomicCAS(int* address, int compare, int val) { - return atomicCAS_impl(address, compare, val); -} -__device__ unsigned int atomicCAS(unsigned int* address, unsigned int compare, unsigned int val) { - return atomicCAS_impl(address, compare, val); -} -__device__ unsigned long long int atomicCAS(unsigned long long int* address, - unsigned long long int compare, - unsigned long long int val) { - return atomicCAS_impl(address, compare, val); -} - -// atomicAnd() -__device__ int atomicAnd(int* address, int val) { return hc::atomic_fetch_and(address, val); } -__device__ unsigned int atomicAnd(unsigned int* address, unsigned int val) { - return hc::atomic_fetch_and(address, val); -} -__device__ unsigned long long int atomicAnd(unsigned long long int* address, - unsigned long long int val) { - return (long long int)hc::atomic_fetch_and((uint64_t*)address, (uint64_t)val); -} - -// atomicOr() -__device__ int atomicOr(int* address, int val) { return hc::atomic_fetch_or(address, val); } -__device__ unsigned int atomicOr(unsigned int* address, unsigned int val) { - return hc::atomic_fetch_or(address, val); -} -__device__ unsigned long long int atomicOr(unsigned long long int* address, - unsigned long long int val) { - return (long long int)hc::atomic_fetch_or((uint64_t*)address, (uint64_t)val); -} - -// atomicXor() -__device__ int atomicXor(int* address, int val) { return hc::atomic_fetch_xor(address, val); } -__device__ unsigned int atomicXor(unsigned int* address, unsigned int val) { - return hc::atomic_fetch_xor(address, val); -} -__device__ unsigned long long int atomicXor(unsigned long long int* address, - unsigned long long int val) { - return (long long int)hc::atomic_fetch_xor((uint64_t*)address, (uint64_t)val); -} - -// atomicInc -__device__ unsigned int atomicInc(unsigned int* address, unsigned int val) { - return hc::__atomic_wrapinc(address, val); -} - -// atomicDec -__device__ unsigned int atomicDec(unsigned int* address, unsigned int val) { - return hc::__atomic_wrapdec(address, val); -} - - // warp vote function __all __any __ballot __device__ int __all(int input) { return hc::__all(input); } diff --git a/tests/src/deviceLib/hipSimpleAtomicsTest.cpp b/tests/src/deviceLib/hipSimpleAtomicsTest.cpp index ce1d0372ff..129d7c1aa8 100644 --- a/tests/src/deviceLib/hipSimpleAtomicsTest.cpp +++ b/tests/src/deviceLib/hipSimpleAtomicsTest.cpp @@ -23,134 +23,37 @@ THE SOFTWARE. * HIT_END */ -// includes, system -#include -#include -#include -#include - - // Includes HIP Runtime #include "hip/hip_runtime.h" #include +// includes, system +#include +#include +#include +#include +#include +#include + #define EXIT_WAIVED 2 const char* sampleName = "hipSimpleAtomicsTest"; +using namespace std; + //////////////////////////////////////////////////////////////////////////////// // Auto-Verification Code bool testResult = true; //////////////////////////////////////////////////////////////////////////////// -// Declaration, forward -void runTest(int argc, char** argv); +bool computeGoldBitwise(...) { + return true; +} -#define min(a, b) (a) < (b) ? (a) : (b) -#define max(a, b) (a) > (b) ? (a) : (b) - -int computeGold(int* gpuData, const int len) { - int val = 0; - - for (int i = 0; i < len; ++i) { - val += 10; - } - - if (val != gpuData[0]) { - printf("atomicAdd failed\n"); - return false; - } - - val = 0; - - for (int i = 0; i < len; ++i) { - val -= 10; - } - - if (val != gpuData[1]) { - printf("atomicSub failed\n"); - return false; - } - - bool found = false; - - for (int i = 0; i < len; ++i) { - // third element should be a member of [0, len) - if (i == gpuData[2]) { - found = true; - break; - } - } - - if (!found) { - printf("atomicExch failed\n"); - return false; - } - - val = -(1 << 8); - - for (int i = 0; i < len; ++i) { - // fourth element should be len-1 - val = max(val, i); - } - - if (val != gpuData[3]) { - printf("atomicMax failed\n"); - return false; - } - - val = 1 << 8; - - for (int i = 0; i < len; ++i) { - val = min(val, i); - } - - if (val != gpuData[4]) { - printf("atomicMin failed\n"); - return false; - } - - int limit = 17; - val = 0; - - for (int i = 0; i < len; ++i) { - val = (val >= limit) ? 0 : val + 1; - } - - if (val != gpuData[5]) { - printf("atomicInc failed\n"); - return false; - } - - limit = 137; - val = 0; - - for (int i = 0; i < len; ++i) { - val = ((val == 0) || (val > limit)) ? limit : val - 1; - } - - if (val != gpuData[6]) { - printf("atomicDec failed\n"); - return false; - } - - found = false; - - for (int i = 0; i < len; ++i) { - // eighth element should be a member of [0, len) - if (i == gpuData[7]) { - found = true; - break; - } - } - - if (!found) { - printf("atomicCAS failed\n"); - return false; - } - - val = 0xff; +template{}>::type* = nullptr> +bool computeGoldBitwise(T* gpuData, int len) { + T val = 0xff; for (int i = 0; i < len; ++i) { // 9th element should be 1 @@ -189,22 +92,142 @@ int computeGold(int* gpuData, const int len) { return true; } -__global__ void testKernel(hipLaunchParm lp, int* g_odata) { +template +bool computeGold(T* gpuData, int len) { + T val = 0; + + for (int i = 0; i < len; ++i) { + val += 10; + } + + if (val != gpuData[0]) { + printf("atomicAdd failed\n"); + return false; + } + + val = 0; + + for (int i = 0; i < len; ++i) { + val -= 10; + } + + if (val != gpuData[1]) { + printf("atomicSub failed\n"); + return false; + } + + bool found = false; + + for (T i = 0; i < len; ++i) { + // third element should be a member of [0, len) + if (i == gpuData[2]) { + found = true; + break; + } + } + + if (!found) { + printf("atomicExch failed\n"); + return false; + } + + val = -(1 << 8); + + for (T i = 0; i < len; ++i) { + // fourth element should be len-1 + val = max(val, i); + } + + if (val != gpuData[3]) { + printf("atomicMax failed\n"); + return false; + } + + val = 1 << 8; + + for (T i = 0; i < len; ++i) { + val = min(val, i); + } + + if (val != gpuData[4]) { + printf("atomicMin failed\n"); + return false; + } + + int limit = 17; + val = 0; + + for (int i = 0; i < len; ++i) { + val = (val >= limit) ? 0 : val + 1; + } + + if (val != gpuData[5]) { + printf("atomicInc failed\n"); + return false; + } + + limit = 137; + val = 0; + + for (int i = 0; i < len; ++i) { + val = ((val == 0) || (val > limit)) ? limit : val - 1; + } + + if (val != gpuData[6]) { + printf("atomicDec failed\n"); + return false; + } + + found = false; + + for (T i = 0; i < len; ++i) { + // eighth element should be a member of [0, len) + if (i == gpuData[7]) { + found = true; + break; + } + } + if (!found) { + printf("atomicCAS failed\n"); + return false; + } + + return computeGoldBitwise(gpuData, len); +} + +__device__ +void testKernelExch(...) {} + +template{}>::type* = nullptr> +__device__ +void testKernelExch(T* g_odata) { // access thread id - const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x; - - // Test various atomic instructions - - // Arithmetic atomic instructions - - // Atomic addition - atomicAdd(&g_odata[0], 10); - - // Atomic subtraction (final should be 0) - atomicSub(&g_odata[1], 10); + const T tid = blockDim.x * blockIdx.x + threadIdx.x; // Atomic exchange atomicExch(&g_odata[2], tid); +} + +__device__ +void testKernelSub(...) {} + +template< + typename T, + typename enable_if< + is_same{} || is_same{}>::type* = nullptr> +void testKernelSub(T* g_odata) { + // Atomic subtraction (final should be 0) + atomicSub(&g_odata[1], 10); +} + +__device__ +void testKernelIntegral(...) {} + +template{}>::type* = nullptr> +__device__ +void testKernelIntegral(T* g_odata) { + // access thread id + const T tid = blockDim.x * blockIdx.x + threadIdx.x; // Atomic maximum atomicMax(&g_odata[3], tid); @@ -231,20 +254,21 @@ __global__ void testKernel(hipLaunchParm lp, int* g_odata) { // Atomic XOR atomicXor(&g_odata[10], tid); + + testKernelSub(g_odata); } +template +__global__ void testKernel(T* g_odata) { + // Atomic addition + atomicAdd(&g_odata[0], 10); -int main(int argc, char** argv) { - printf("%s starting...\n", sampleName); - - runTest(argc, argv); - - hipDeviceReset(); - printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!"); - exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE); + testKernelIntegral(g_odata); + testKernelExch(g_odata); } -void runTest(int argc, char** argv) { +template +void runTest() { hipDeviceProp_t deviceProp; deviceProp.major = 0; deviceProp.minor = 0; @@ -262,10 +286,10 @@ void runTest(int argc, char** argv) { unsigned int numThreads = 256; unsigned int numBlocks = 64; unsigned int numData = 11; - unsigned int memSize = sizeof(int) * numData; + unsigned int memSize = sizeof(T) * numData; // allocate mem for the result on host side - int* hOData = (int*)malloc(memSize); + T* hOData = (T*)malloc(memSize); // initialize the memory for (unsigned int i = 0; i < numData; i++) hOData[i] = 0; @@ -274,13 +298,14 @@ void runTest(int argc, char** argv) { hOData[8] = hOData[10] = 0xff; // allocate device memory for result - int* dOData; + T* dOData; hipMalloc((void**)&dOData, memSize); // copy host memory to device to initialize to zero hipMemcpy(dOData, hOData, memSize, hipMemcpyHostToDevice); // execute the kernel - hipLaunchKernel(testKernel, dim3(numBlocks), dim3(numThreads), 0, 0, dOData); + hipLaunchKernelGGL( + testKernel, dim3(numBlocks), dim3(numThreads), 0, 0, dOData); // Copy result from device to host hipMemcpy(hOData, dOData, memSize, hipMemcpyDeviceToHost); @@ -294,3 +319,18 @@ void runTest(int argc, char** argv) { passed(); } + + +int main(int argc, char** argv) { + printf("%s starting...\n", sampleName); + + runTest(); + runTest(); + runTest(); + runTest(); + runTest(); + + hipDeviceReset(); + printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!"); + exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE); +} \ No newline at end of file From 59adb5e52a015a09332f698e5eb82b98789fdb0e Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Sat, 2 Jun 2018 17:46:37 +0100 Subject: [PATCH 2/3] Add missing __device__ for forward declares. --- include/hip/hcc_detail/hip_atomic.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/include/hip/hcc_detail/hip_atomic.h b/include/hip/hcc_detail/hip_atomic.h index 4af1794ba0..1304cf74a9 100644 --- a/include/hip/hcc_detail/hip_atomic.h +++ b/include/hip/hcc_detail/hip_atomic.h @@ -176,7 +176,9 @@ __device__ inline unsigned int atomicInc(unsigned int* address, unsigned int val) { - extern unsigned int __builtin_amdgcn_atomic_inc( + __device__ + extern + unsigned int __builtin_amdgcn_atomic_inc( unsigned int*, unsigned int, unsigned int, @@ -191,7 +193,9 @@ __device__ inline unsigned int atomicDec(unsigned int* address, unsigned int val) { - extern unsigned int __builtin_amdgcn_atomic_dec( + __device__ + extern + unsigned int __builtin_amdgcn_atomic_dec( unsigned int*, unsigned int, unsigned int, @@ -262,4 +266,4 @@ unsigned long long atomicXor( return __atomic_fetch_xor(address, val, __ATOMIC_RELAXED); } -// TODO: add scoped atomics i.e. atomic{*}_system && atomic{*}_block. \ No newline at end of file +// TODO: add scoped atomics i.e. atomic{*}_system && atomic{*}_block. From 23f5feaf1315917019c9ae9fdf2911b882c73560 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Sun, 3 Jun 2018 03:03:55 +0100 Subject: [PATCH 3/3] Fix hideous typos. --- include/hip/hcc_detail/hip_atomic.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/hip/hcc_detail/hip_atomic.h b/include/hip/hcc_detail/hip_atomic.h index 1304cf74a9..ce4b7c9008 100644 --- a/include/hip/hcc_detail/hip_atomic.h +++ b/include/hip/hcc_detail/hip_atomic.h @@ -57,7 +57,7 @@ __device__ inline float atomicAdd(float* address, float val) { - unsigned int* uaddr{reinterpret_cast(uaddr)}; + unsigned int* uaddr{reinterpret_cast(address)}; unsigned int old{__atomic_load_n(uaddr, __ATOMIC_RELAXED)}; unsigned int r; @@ -72,7 +72,7 @@ __device__ inline double atomicAdd(double* address, double val) { - unsigned long long* uaddr{reinterpret_cast(uaddr)}; + unsigned long long* uaddr{reinterpret_cast(address)}; unsigned long long old{__atomic_load_n(uaddr, __ATOMIC_RELAXED)}; unsigned long long r;