Switch the atomic implementation to use Clang builtins.
[ROCm/hip commit: 089ab3b947]
このコミットが含まれているのは:
@@ -23,7 +23,8 @@ THE SOFTWARE.
|
||||
#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H
|
||||
#define HIP_INCLUDE_HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "host_defines.h"
|
||||
|
||||
#include <hip/hip_vector_types.h>
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,265 @@
|
||||
#pragma once
|
||||
|
||||
#include "device_functions.h"
|
||||
|
||||
__device__
|
||||
inline
|
||||
int atomicCAS(int* address, int compare, int val)
|
||||
{
|
||||
__atomic_compare_exchange_n(
|
||||
address, &compare, val, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
|
||||
|
||||
return compare;
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
unsigned int atomicCAS(
|
||||
unsigned int* address, unsigned int compare, unsigned int val)
|
||||
{
|
||||
__atomic_compare_exchange_n(
|
||||
address, &compare, val, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
|
||||
|
||||
return compare;
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
unsigned long long atomicCAS(
|
||||
unsigned long long* address,
|
||||
unsigned long long compare,
|
||||
unsigned long long val)
|
||||
{
|
||||
__atomic_compare_exchange_n(
|
||||
address, &compare, val, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
|
||||
|
||||
return compare;
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
int atomicAdd(int* address, int val)
|
||||
{
|
||||
return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
unsigned int atomicAdd(unsigned int* address, unsigned int val)
|
||||
{
|
||||
return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
unsigned long long atomicAdd(
|
||||
unsigned long long* address, unsigned long long val)
|
||||
{
|
||||
return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
float atomicAdd(float* address, float val)
|
||||
{
|
||||
unsigned int* uaddr{reinterpret_cast<unsigned int*>(uaddr)};
|
||||
unsigned int old{__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
|
||||
unsigned int r;
|
||||
|
||||
do {
|
||||
r = old;
|
||||
old = atomicCAS(uaddr, r, __float_as_uint(val + __uint_as_float(r)));
|
||||
} while (r != old);
|
||||
|
||||
return __uint_as_float(r);
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
double atomicAdd(double* address, double val)
|
||||
{
|
||||
unsigned long long* uaddr{reinterpret_cast<unsigned long long*>(uaddr)};
|
||||
unsigned long long old{__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
|
||||
unsigned long long r;
|
||||
|
||||
do {
|
||||
r = old;
|
||||
old = atomicCAS(
|
||||
uaddr, r, __double_as_longlong(val + __longlong_as_double(r)));
|
||||
} while (r != old);
|
||||
|
||||
return __longlong_as_double(r);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
int atomicSub(int* address, int val)
|
||||
{
|
||||
return __atomic_fetch_sub(address, val, __ATOMIC_RELAXED);
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
unsigned int atomicSub(unsigned int* address, unsigned int val)
|
||||
{
|
||||
return __atomic_fetch_sub(address, val, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
int atomicExch(int* address, int val)
|
||||
{
|
||||
return __atomic_exchange_n(address, val, __ATOMIC_RELAXED);
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
unsigned int atomicExch(unsigned int* address, unsigned int val)
|
||||
{
|
||||
return __atomic_exchange_n(address, val, __ATOMIC_RELAXED);
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
unsigned int atomicExch(unsigned long long* address, unsigned long long val)
|
||||
{
|
||||
return __atomic_exchange_n(address, val, __ATOMIC_RELAXED);
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
float atomicExch(float* address, float val)
|
||||
{
|
||||
return __uint_as_float(__atomic_exchange_n(
|
||||
reinterpret_cast<unsigned int*>(address),
|
||||
__float_as_uint(val),
|
||||
__ATOMIC_RELAXED));
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
int atomicMin(int* address, int val)
|
||||
{
|
||||
return __sync_fetch_and_min(address, val);
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
unsigned int atomicMin(unsigned int* address, unsigned int val)
|
||||
{
|
||||
return __sync_fetch_and_umin(address, val);
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
unsigned long long atomicMin(
|
||||
unsigned long long* address, unsigned long long val)
|
||||
{
|
||||
unsigned long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
|
||||
while (val < tmp) { tmp = atomicCAS(address, tmp, val); }
|
||||
|
||||
return tmp;
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
int atomicMax(int* address, int val)
|
||||
{
|
||||
return __sync_fetch_and_max(address, val);
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
unsigned int atomicMax(unsigned int* address, unsigned int val)
|
||||
{
|
||||
return __sync_fetch_and_umax(address, val);
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
unsigned long long atomicMax(
|
||||
unsigned long long* address, unsigned long long val)
|
||||
{
|
||||
unsigned long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
|
||||
while (tmp < val) { tmp = atomicCAS(address, tmp, val); }
|
||||
|
||||
return tmp;
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned int atomicInc(unsigned int* address, unsigned int val)
|
||||
{
|
||||
extern unsigned int __builtin_amdgcn_atomic_inc(
|
||||
unsigned int*,
|
||||
unsigned int,
|
||||
unsigned int,
|
||||
unsigned int,
|
||||
bool) __asm("llvm.amdgcn.atomic.inc.i32.p0i32");
|
||||
|
||||
return __builtin_amdgcn_atomic_inc(
|
||||
address, val, __ATOMIC_RELAXED, 1 /* Device scope */, false);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned int atomicDec(unsigned int* address, unsigned int val)
|
||||
{
|
||||
extern unsigned int __builtin_amdgcn_atomic_dec(
|
||||
unsigned int*,
|
||||
unsigned int,
|
||||
unsigned int,
|
||||
unsigned int,
|
||||
bool) __asm("llvm.amdgcn.atomic.dec.i32.p0i32");
|
||||
|
||||
return __builtin_amdgcn_atomic_dec(
|
||||
address, val, __ATOMIC_RELAXED, 1 /* Device scope */, false);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
int atomicAnd(int* address, int val)
|
||||
{
|
||||
return __atomic_fetch_and(address, val, __ATOMIC_RELAXED);
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
unsigned int atomicAnd(unsigned int* address, unsigned int val)
|
||||
{
|
||||
return __atomic_fetch_and(address, val, __ATOMIC_RELAXED);
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
unsigned long long atomicAnd(
|
||||
unsigned long long* address, unsigned long long val)
|
||||
{
|
||||
return __atomic_fetch_and(address, val, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
int atomicOr(int* address, int val)
|
||||
{
|
||||
return __atomic_fetch_or(address, val, __ATOMIC_RELAXED);
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
unsigned int atomicOr(unsigned int* address, unsigned int val)
|
||||
{
|
||||
return __atomic_fetch_or(address, val, __ATOMIC_RELAXED);
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
unsigned long long atomicOr(
|
||||
unsigned long long* address, unsigned long long val)
|
||||
{
|
||||
return __atomic_fetch_or(address, val, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
int atomicXor(int* address, int val)
|
||||
{
|
||||
return __atomic_fetch_xor(address, val, __ATOMIC_RELAXED);
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
unsigned int atomicXor(unsigned int* address, unsigned int val)
|
||||
{
|
||||
return __atomic_fetch_xor(address, val, __ATOMIC_RELAXED);
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
unsigned long long atomicXor(
|
||||
unsigned long long* address, unsigned long long val)
|
||||
{
|
||||
return __atomic_fetch_xor(address, val, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
// TODO: add scoped atomics i.e. atomic{*}_system && atomic{*}_block.
|
||||
@@ -88,6 +88,7 @@ extern int HIP_TRACE_API;
|
||||
#ifdef __cplusplus
|
||||
#include <hip/hcc_detail/hip_ldg.h>
|
||||
#endif
|
||||
#include <hip/hcc_detail/hip_atomic.h>
|
||||
#include <hip/hcc_detail/host_defines.h>
|
||||
#include <hip/hcc_detail/math_functions.h>
|
||||
#include <hip/hcc_detail/device_functions.h>
|
||||
@@ -175,82 +176,6 @@ __device__ clock_t clock();
|
||||
// abort
|
||||
__device__ void abort();
|
||||
|
||||
// atomicAdd()
|
||||
__device__ int atomicAdd(int* address, int val);
|
||||
__device__ unsigned int atomicAdd(unsigned int* address, unsigned int val);
|
||||
|
||||
__device__ unsigned long long int atomicAdd(unsigned long long int* address,
|
||||
unsigned long long int val);
|
||||
|
||||
__device__ float atomicAdd(float* address, float val);
|
||||
|
||||
|
||||
// atomicSub()
|
||||
__device__ int atomicSub(int* address, int val);
|
||||
|
||||
__device__ unsigned int atomicSub(unsigned int* address, unsigned int val);
|
||||
|
||||
|
||||
// atomicExch()
|
||||
__device__ int atomicExch(int* address, int val);
|
||||
|
||||
__device__ unsigned int atomicExch(unsigned int* address, unsigned int val);
|
||||
|
||||
__device__ unsigned long long int atomicExch(unsigned long long int* address,
|
||||
unsigned long long int val);
|
||||
|
||||
__device__ float atomicExch(float* address, float val);
|
||||
|
||||
|
||||
// atomicMin()
|
||||
__device__ int atomicMin(int* address, int val);
|
||||
__device__ unsigned int atomicMin(unsigned int* address, unsigned int val);
|
||||
__device__ unsigned long long int atomicMin(unsigned long long int* address,
|
||||
unsigned long long int val);
|
||||
|
||||
|
||||
// atomicMax()
|
||||
__device__ int atomicMax(int* address, int val);
|
||||
__device__ unsigned int atomicMax(unsigned int* address, unsigned int val);
|
||||
__device__ unsigned long long int atomicMax(unsigned long long int* address,
|
||||
unsigned long long int val);
|
||||
|
||||
|
||||
// atomicCAS()
|
||||
__device__ int atomicCAS(int* address, int compare, int val);
|
||||
__device__ unsigned int atomicCAS(unsigned int* address, unsigned int compare, unsigned int val);
|
||||
__device__ unsigned long long int atomicCAS(unsigned long long int* address,
|
||||
unsigned long long int compare,
|
||||
unsigned long long int val);
|
||||
|
||||
|
||||
// atomicAnd()
|
||||
__device__ int atomicAnd(int* address, int val);
|
||||
__device__ unsigned int atomicAnd(unsigned int* address, unsigned int val);
|
||||
__device__ unsigned long long int atomicAnd(unsigned long long int* address,
|
||||
unsigned long long int val);
|
||||
|
||||
|
||||
// atomicOr()
|
||||
__device__ int atomicOr(int* address, int val);
|
||||
__device__ unsigned int atomicOr(unsigned int* address, unsigned int val);
|
||||
__device__ unsigned long long int atomicOr(unsigned long long int* address,
|
||||
unsigned long long int val);
|
||||
|
||||
|
||||
// atomicXor()
|
||||
__device__ int atomicXor(int* address, int val);
|
||||
__device__ unsigned int atomicXor(unsigned int* address, unsigned int val);
|
||||
__device__ unsigned long long int atomicXor(unsigned long long int* address,
|
||||
unsigned long long int val);
|
||||
|
||||
// atomicInc()
|
||||
__device__ unsigned int atomicInc(unsigned int* address, unsigned int val);
|
||||
|
||||
|
||||
// atomicDec()
|
||||
__device__ unsigned int atomicDec(unsigned int* address, unsigned int val);
|
||||
|
||||
// warp vote function __all __any __ballot
|
||||
__device__ int __all(int input);
|
||||
__device__ int __any(int input);
|
||||
|
||||
@@ -64,4 +64,4 @@ THE SOFTWARE.
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <hip/hip_vector_types.h>
|
||||
|
||||
#endif
|
||||
#endif
|
||||
@@ -761,129 +761,6 @@ __device__ clock_t clock() { return (clock_t)hc::__cycle_u64(); };
|
||||
// abort
|
||||
__device__ void abort() { return hc::abort(); }
|
||||
|
||||
// atomicAdd()
|
||||
__device__ int atomicAdd(int* address, int val) { return hc::atomic_fetch_add(address, val); }
|
||||
__device__ unsigned int atomicAdd(unsigned int* address, unsigned int val) {
|
||||
return hc::atomic_fetch_add(address, val);
|
||||
}
|
||||
__device__ unsigned long long int atomicAdd(unsigned long long int* address,
|
||||
unsigned long long int val) {
|
||||
return (long long int)hc::atomic_fetch_add((uint64_t*)address, (uint64_t)val);
|
||||
}
|
||||
__device__ float atomicAdd(float* address, float val) { return hc::atomic_fetch_add(address, val); }
|
||||
|
||||
// atomicSub()
|
||||
__device__ int atomicSub(int* address, int val) { return hc::atomic_fetch_sub(address, val); }
|
||||
__device__ unsigned int atomicSub(unsigned int* address, unsigned int val) {
|
||||
return hc::atomic_fetch_sub(address, val);
|
||||
}
|
||||
|
||||
// atomicExch()
|
||||
__device__ int atomicExch(int* address, int val) { return hc::atomic_exchange(address, val); }
|
||||
__device__ unsigned int atomicExch(unsigned int* address, unsigned int val) {
|
||||
return hc::atomic_exchange(address, val);
|
||||
}
|
||||
__device__ unsigned long long int atomicExch(unsigned long long int* address,
|
||||
unsigned long long int val) {
|
||||
return (long long int)hc::atomic_exchange((uint64_t*)address, (uint64_t)val);
|
||||
}
|
||||
__device__ float atomicExch(float* address, float val) { return hc::atomic_exchange(address, val); }
|
||||
|
||||
// atomicMin()
|
||||
__device__ int atomicMin(int* address, int val) { return hc::atomic_fetch_min(address, val); }
|
||||
__device__ unsigned int atomicMin(unsigned int* address, unsigned int val) {
|
||||
return hc::atomic_fetch_min(address, val);
|
||||
}
|
||||
__device__ unsigned long long int atomicMin(unsigned long long int* address,
|
||||
unsigned long long int val) {
|
||||
return (long long int)hc::atomic_fetch_min((uint64_t*)address, (uint64_t)val);
|
||||
}
|
||||
|
||||
// atomicMax()
|
||||
__device__ int atomicMax(int* address, int val) { return hc::atomic_fetch_max(address, val); }
|
||||
__device__ unsigned int atomicMax(unsigned int* address, unsigned int val) {
|
||||
return hc::atomic_fetch_max(address, val);
|
||||
}
|
||||
__device__ unsigned long long int atomicMax(unsigned long long int* address,
|
||||
unsigned long long int val) {
|
||||
return (long long int)hc::atomic_fetch_max((uint64_t*)address, (uint64_t)val);
|
||||
}
|
||||
|
||||
// atomicCAS()
|
||||
template <typename T>
|
||||
__device__ T atomicCAS_impl(T* address, T compare, T val) {
|
||||
// the implementation assumes the atomic is lock-free and
|
||||
// has the same size as the non-atmoic equivalent type
|
||||
static_assert(sizeof(T) == sizeof(std::atomic<T>),
|
||||
"size mismatch between atomic and non-atomic types");
|
||||
|
||||
union {
|
||||
T* address;
|
||||
std::atomic<T>* atomic_address;
|
||||
} u;
|
||||
u.address = address;
|
||||
|
||||
T expected = compare;
|
||||
|
||||
// hcc should generate a system scope atomic CAS
|
||||
std::atomic_compare_exchange_weak_explicit(
|
||||
u.atomic_address, &expected, val, std::memory_order_acq_rel, std::memory_order_relaxed);
|
||||
return expected;
|
||||
}
|
||||
|
||||
__device__ int atomicCAS(int* address, int compare, int val) {
|
||||
return atomicCAS_impl(address, compare, val);
|
||||
}
|
||||
__device__ unsigned int atomicCAS(unsigned int* address, unsigned int compare, unsigned int val) {
|
||||
return atomicCAS_impl(address, compare, val);
|
||||
}
|
||||
__device__ unsigned long long int atomicCAS(unsigned long long int* address,
|
||||
unsigned long long int compare,
|
||||
unsigned long long int val) {
|
||||
return atomicCAS_impl(address, compare, val);
|
||||
}
|
||||
|
||||
// atomicAnd()
|
||||
__device__ int atomicAnd(int* address, int val) { return hc::atomic_fetch_and(address, val); }
|
||||
__device__ unsigned int atomicAnd(unsigned int* address, unsigned int val) {
|
||||
return hc::atomic_fetch_and(address, val);
|
||||
}
|
||||
__device__ unsigned long long int atomicAnd(unsigned long long int* address,
|
||||
unsigned long long int val) {
|
||||
return (long long int)hc::atomic_fetch_and((uint64_t*)address, (uint64_t)val);
|
||||
}
|
||||
|
||||
// atomicOr()
|
||||
__device__ int atomicOr(int* address, int val) { return hc::atomic_fetch_or(address, val); }
|
||||
__device__ unsigned int atomicOr(unsigned int* address, unsigned int val) {
|
||||
return hc::atomic_fetch_or(address, val);
|
||||
}
|
||||
__device__ unsigned long long int atomicOr(unsigned long long int* address,
|
||||
unsigned long long int val) {
|
||||
return (long long int)hc::atomic_fetch_or((uint64_t*)address, (uint64_t)val);
|
||||
}
|
||||
|
||||
// atomicXor()
|
||||
__device__ int atomicXor(int* address, int val) { return hc::atomic_fetch_xor(address, val); }
|
||||
__device__ unsigned int atomicXor(unsigned int* address, unsigned int val) {
|
||||
return hc::atomic_fetch_xor(address, val);
|
||||
}
|
||||
__device__ unsigned long long int atomicXor(unsigned long long int* address,
|
||||
unsigned long long int val) {
|
||||
return (long long int)hc::atomic_fetch_xor((uint64_t*)address, (uint64_t)val);
|
||||
}
|
||||
|
||||
// atomicInc
|
||||
__device__ unsigned int atomicInc(unsigned int* address, unsigned int val) {
|
||||
return hc::__atomic_wrapinc(address, val);
|
||||
}
|
||||
|
||||
// atomicDec
|
||||
__device__ unsigned int atomicDec(unsigned int* address, unsigned int val) {
|
||||
return hc::__atomic_wrapdec(address, val);
|
||||
}
|
||||
|
||||
|
||||
// warp vote function __all __any __ballot
|
||||
__device__ int __all(int input) { return hc::__all(input); }
|
||||
|
||||
|
||||
@@ -23,134 +23,37 @@ THE SOFTWARE.
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
// includes, system
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
|
||||
|
||||
// Includes HIP Runtime
|
||||
#include "hip/hip_runtime.h"
|
||||
#include <test_common.h>
|
||||
|
||||
// includes, system
|
||||
#include <algorithm>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <type_traits>
|
||||
|
||||
#define EXIT_WAIVED 2
|
||||
|
||||
const char* sampleName = "hipSimpleAtomicsTest";
|
||||
|
||||
using namespace std;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Auto-Verification Code
|
||||
bool testResult = true;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Declaration, forward
|
||||
void runTest(int argc, char** argv);
|
||||
|
||||
bool computeGoldBitwise(...) {
|
||||
return true;
|
||||
}
|
||||
|
||||
#define min(a, b) (a) < (b) ? (a) : (b)
|
||||
#define max(a, b) (a) > (b) ? (a) : (b)
|
||||
|
||||
int computeGold(int* gpuData, const int len) {
|
||||
int val = 0;
|
||||
|
||||
for (int i = 0; i < len; ++i) {
|
||||
val += 10;
|
||||
}
|
||||
|
||||
if (val != gpuData[0]) {
|
||||
printf("atomicAdd failed\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
val = 0;
|
||||
|
||||
for (int i = 0; i < len; ++i) {
|
||||
val -= 10;
|
||||
}
|
||||
|
||||
if (val != gpuData[1]) {
|
||||
printf("atomicSub failed\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
bool found = false;
|
||||
|
||||
for (int i = 0; i < len; ++i) {
|
||||
// third element should be a member of [0, len)
|
||||
if (i == gpuData[2]) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!found) {
|
||||
printf("atomicExch failed\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
val = -(1 << 8);
|
||||
|
||||
for (int i = 0; i < len; ++i) {
|
||||
// fourth element should be len-1
|
||||
val = max(val, i);
|
||||
}
|
||||
|
||||
if (val != gpuData[3]) {
|
||||
printf("atomicMax failed\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
val = 1 << 8;
|
||||
|
||||
for (int i = 0; i < len; ++i) {
|
||||
val = min(val, i);
|
||||
}
|
||||
|
||||
if (val != gpuData[4]) {
|
||||
printf("atomicMin failed\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
int limit = 17;
|
||||
val = 0;
|
||||
|
||||
for (int i = 0; i < len; ++i) {
|
||||
val = (val >= limit) ? 0 : val + 1;
|
||||
}
|
||||
|
||||
if (val != gpuData[5]) {
|
||||
printf("atomicInc failed\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
limit = 137;
|
||||
val = 0;
|
||||
|
||||
for (int i = 0; i < len; ++i) {
|
||||
val = ((val == 0) || (val > limit)) ? limit : val - 1;
|
||||
}
|
||||
|
||||
if (val != gpuData[6]) {
|
||||
printf("atomicDec failed\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
found = false;
|
||||
|
||||
for (int i = 0; i < len; ++i) {
|
||||
// eighth element should be a member of [0, len)
|
||||
if (i == gpuData[7]) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!found) {
|
||||
printf("atomicCAS failed\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
val = 0xff;
|
||||
template<typename T, typename enable_if<is_integral<T>{}>::type* = nullptr>
|
||||
bool computeGoldBitwise(T* gpuData, int len) {
|
||||
T val = 0xff;
|
||||
|
||||
for (int i = 0; i < len; ++i) {
|
||||
// 9th element should be 1
|
||||
@@ -189,22 +92,142 @@ int computeGold(int* gpuData, const int len) {
|
||||
return true;
|
||||
}
|
||||
|
||||
__global__ void testKernel(hipLaunchParm lp, int* g_odata) {
|
||||
template<typename T>
|
||||
bool computeGold(T* gpuData, int len) {
|
||||
T val = 0;
|
||||
|
||||
for (int i = 0; i < len; ++i) {
|
||||
val += 10;
|
||||
}
|
||||
|
||||
if (val != gpuData[0]) {
|
||||
printf("atomicAdd failed\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
val = 0;
|
||||
|
||||
for (int i = 0; i < len; ++i) {
|
||||
val -= 10;
|
||||
}
|
||||
|
||||
if (val != gpuData[1]) {
|
||||
printf("atomicSub failed\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
bool found = false;
|
||||
|
||||
for (T i = 0; i < len; ++i) {
|
||||
// third element should be a member of [0, len)
|
||||
if (i == gpuData[2]) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!found) {
|
||||
printf("atomicExch failed\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
val = -(1 << 8);
|
||||
|
||||
for (T i = 0; i < len; ++i) {
|
||||
// fourth element should be len-1
|
||||
val = max(val, i);
|
||||
}
|
||||
|
||||
if (val != gpuData[3]) {
|
||||
printf("atomicMax failed\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
val = 1 << 8;
|
||||
|
||||
for (T i = 0; i < len; ++i) {
|
||||
val = min(val, i);
|
||||
}
|
||||
|
||||
if (val != gpuData[4]) {
|
||||
printf("atomicMin failed\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
int limit = 17;
|
||||
val = 0;
|
||||
|
||||
for (int i = 0; i < len; ++i) {
|
||||
val = (val >= limit) ? 0 : val + 1;
|
||||
}
|
||||
|
||||
if (val != gpuData[5]) {
|
||||
printf("atomicInc failed\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
limit = 137;
|
||||
val = 0;
|
||||
|
||||
for (int i = 0; i < len; ++i) {
|
||||
val = ((val == 0) || (val > limit)) ? limit : val - 1;
|
||||
}
|
||||
|
||||
if (val != gpuData[6]) {
|
||||
printf("atomicDec failed\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
found = false;
|
||||
|
||||
for (T i = 0; i < len; ++i) {
|
||||
// eighth element should be a member of [0, len)
|
||||
if (i == gpuData[7]) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
printf("atomicCAS failed\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
return computeGoldBitwise(gpuData, len);
|
||||
}
|
||||
|
||||
__device__
|
||||
void testKernelExch(...) {}
|
||||
|
||||
template<typename T, typename enable_if<!is_same<T, double>{}>::type* = nullptr>
|
||||
__device__
|
||||
void testKernelExch(T* g_odata) {
|
||||
// access thread id
|
||||
const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
|
||||
// Test various atomic instructions
|
||||
|
||||
// Arithmetic atomic instructions
|
||||
|
||||
// Atomic addition
|
||||
atomicAdd(&g_odata[0], 10);
|
||||
|
||||
// Atomic subtraction (final should be 0)
|
||||
atomicSub(&g_odata[1], 10);
|
||||
const T tid = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
|
||||
// Atomic exchange
|
||||
atomicExch(&g_odata[2], tid);
|
||||
}
|
||||
|
||||
__device__
|
||||
void testKernelSub(...) {}
|
||||
|
||||
template<
|
||||
typename T,
|
||||
typename enable_if<
|
||||
is_same<T, int>{} || is_same<T, unsigned int>{}>::type* = nullptr>
|
||||
void testKernelSub(T* g_odata) {
|
||||
// Atomic subtraction (final should be 0)
|
||||
atomicSub(&g_odata[1], 10);
|
||||
}
|
||||
|
||||
__device__
|
||||
void testKernelIntegral(...) {}
|
||||
|
||||
template<typename T, typename enable_if<is_integral<T>{}>::type* = nullptr>
|
||||
__device__
|
||||
void testKernelIntegral(T* g_odata) {
|
||||
// access thread id
|
||||
const T tid = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
|
||||
// Atomic maximum
|
||||
atomicMax(&g_odata[3], tid);
|
||||
@@ -231,20 +254,21 @@ __global__ void testKernel(hipLaunchParm lp, int* g_odata) {
|
||||
|
||||
// Atomic XOR
|
||||
atomicXor(&g_odata[10], tid);
|
||||
|
||||
testKernelSub(g_odata);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
__global__ void testKernel(T* g_odata) {
|
||||
// Atomic addition
|
||||
atomicAdd(&g_odata[0], 10);
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
printf("%s starting...\n", sampleName);
|
||||
|
||||
runTest(argc, argv);
|
||||
|
||||
hipDeviceReset();
|
||||
printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
|
||||
exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||
testKernelIntegral(g_odata);
|
||||
testKernelExch(g_odata);
|
||||
}
|
||||
|
||||
void runTest(int argc, char** argv) {
|
||||
template<typename T>
|
||||
void runTest() {
|
||||
hipDeviceProp_t deviceProp;
|
||||
deviceProp.major = 0;
|
||||
deviceProp.minor = 0;
|
||||
@@ -262,10 +286,10 @@ void runTest(int argc, char** argv) {
|
||||
unsigned int numThreads = 256;
|
||||
unsigned int numBlocks = 64;
|
||||
unsigned int numData = 11;
|
||||
unsigned int memSize = sizeof(int) * numData;
|
||||
unsigned int memSize = sizeof(T) * numData;
|
||||
|
||||
// allocate mem for the result on host side
|
||||
int* hOData = (int*)malloc(memSize);
|
||||
T* hOData = (T*)malloc(memSize);
|
||||
|
||||
// initialize the memory
|
||||
for (unsigned int i = 0; i < numData; i++) hOData[i] = 0;
|
||||
@@ -274,13 +298,14 @@ void runTest(int argc, char** argv) {
|
||||
hOData[8] = hOData[10] = 0xff;
|
||||
|
||||
// allocate device memory for result
|
||||
int* dOData;
|
||||
T* dOData;
|
||||
hipMalloc((void**)&dOData, memSize);
|
||||
// copy host memory to device to initialize to zero
|
||||
hipMemcpy(dOData, hOData, memSize, hipMemcpyHostToDevice);
|
||||
|
||||
// execute the kernel
|
||||
hipLaunchKernel(testKernel, dim3(numBlocks), dim3(numThreads), 0, 0, dOData);
|
||||
hipLaunchKernelGGL(
|
||||
testKernel, dim3(numBlocks), dim3(numThreads), 0, 0, dOData);
|
||||
|
||||
// Copy result from device to host
|
||||
hipMemcpy(hOData, dOData, memSize, hipMemcpyDeviceToHost);
|
||||
@@ -294,3 +319,18 @@ void runTest(int argc, char** argv) {
|
||||
|
||||
passed();
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
printf("%s starting...\n", sampleName);
|
||||
|
||||
runTest<int>();
|
||||
runTest<unsigned int>();
|
||||
runTest<unsigned long long>();
|
||||
runTest<float>();
|
||||
runTest<double>();
|
||||
|
||||
hipDeviceReset();
|
||||
printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
|
||||
exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||
}
|
||||
新しいイシューから参照
ユーザーをブロックする