diff --git a/include/hip/hcc_detail/hip_runtime.h b/include/hip/hcc_detail/hip_runtime.h index d3211ed3f5..924e774af0 100644 --- a/include/hip/hcc_detail/hip_runtime.h +++ b/include/hip/hcc_detail/hip_runtime.h @@ -381,6 +381,27 @@ __device__ void __threadfence_system(void) ; * @} */ +template::type f> +class Coordinates { + using R = decltype(f(0)); + + struct X { __device__ operator R() const { return f(0); } }; + struct Y { __device__ operator R() const { return f(1); } }; + struct Z { __device__ operator R() const { return f(2); } }; +public: + static constexpr X x{}; + static constexpr Y y{}; + static constexpr Z z{}; +}; + +static constexpr Coordinates blockDim; +static constexpr Coordinates blockIdx; +static constexpr Coordinates gridDim; +static constexpr Coordinates threadIdx; #define hipThreadIdx_x (hc_get_workitem_id(0)) #define hipThreadIdx_y (hc_get_workitem_id(1)) diff --git a/src/device_util.cpp b/src/device_util.cpp index 6afc797ec6..367a4c1a4f 100644 --- a/src/device_util.cpp +++ b/src/device_util.cpp @@ -45,8 +45,8 @@ __device__ void *__hip_hc_malloc(size_t size) { return (void*)nullptr; } - uint32_t totalThreads = hipBlockDim_x * hipGridDim_x * hipBlockDim_y * hipGridDim_y * hipBlockDim_z * hipGridDim_z; - uint32_t currentWorkItem = hipThreadIdx_x + hipBlockDim_x * hipBlockIdx_x; + uint32_t totalThreads = blockDim.x * gridDim.x * blockDim.y * gridDim.y * blockDim.z * gridDim.z; + uint32_t currentWorkItem = threadIdx.x + blockDim.x * blockIdx.x; uint32_t numHeapsPerWorkItem = NUM_PAGES / totalThreads; uint32_t heapSizePerWorkItem = SIZE_OF_HEAP / totalThreads; @@ -932,7 +932,7 @@ __device__ unsigned long long int atomicMax(unsigned long long int* address, template __device__ T atomicCAS_impl(T* address, T compare, T val) { - // the implementation assumes the atomic is lock-free and + // the implementation assumes the atomic is lock-free and // has the same size as the non-atmoic equivalent type static_assert(sizeof(T) == sizeof(std::atomic) , "size mismatch between atomic and non-atomic types"); @@ -945,7 +945,7 @@ __device__ T atomicCAS_impl(T* address, T compare, T val) T expected = compare; - // hcc should generate a system scope atomic CAS + // hcc should generate a system scope atomic CAS std::atomic_compare_exchange_weak_explicit(u.atomic_address , &expected, val , std::memory_order_acq_rel @@ -1110,8 +1110,8 @@ __device__ void* __get_dynamicgroupbaseptr() { return hc::get_dynamic_group_segment_base_pointer(); } -__host__ void* __get_dynamicgroupbaseptr() { - return nullptr; +__host__ void* __get_dynamicgroupbaseptr() { + return nullptr; } // Precise Math Functions diff --git a/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp b/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp index dac56bf709..c8c2e644c3 100644 --- a/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp +++ b/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp @@ -34,7 +34,7 @@ THE SOFTWARE. #define NUM_STREAMS 2 __global__ void Iter(hipLaunchParm lp, int *Ad, int num){ - int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tx = threadIdx.x + blockIdx.x * blockDim.x; // Kernel loop designed to execute very slowly... ... ... so we can test timing-related behavior below if(tx == 0){ for(int i = 0; i 0 __global__ void __halfMath(hipLaunchParm lp, __half *A, __half *B, __half *C) { - int tx = hipThreadIdx_x; + int tx = threadIdx.x; __half a = A[tx]; __half b = B[tx]; __half c = C[tx]; @@ -44,7 +44,7 @@ __global__ void __halfMath(hipLaunchParm lp, __half *A, __half *B, __half *C) { } __global__ void __half2Math(hipLaunchParm lp, __half2 *A, __half2 *B, __half2 *C) { - int tx = hipThreadIdx_x; + int tx = threadIdx.x; __half2 a = A[tx]; __half2 b = B[tx]; __half2 c = C[tx]; diff --git a/tests/src/deviceLib/hipThreadFence.cpp b/tests/src/deviceLib/hipThreadFence.cpp index 1bd9fbe02c..2f73b68529 100644 --- a/tests/src/deviceLib/hipThreadFence.cpp +++ b/tests/src/deviceLib/hipThreadFence.cpp @@ -33,7 +33,7 @@ THE SOFTWARE. __global__ void vAdd(hipLaunchParm lp, float *In1, float *In2, float *In3, float *In4, float *Out) { - int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tid = threadIdx.x + blockIdx.x * blockDim.x; In4[tid] = In1[tid] + In2[tid]; __threadfence(); In3[tid] = In3[tid] + In4[tid]; diff --git a/tests/src/deviceLib/hip_anyall.cpp b/tests/src/deviceLib/hip_anyall.cpp index bba7915052..f0b314ce8d 100644 --- a/tests/src/deviceLib/hip_anyall.cpp +++ b/tests/src/deviceLib/hip_anyall.cpp @@ -37,9 +37,9 @@ __global__ void warpvote(hipLaunchParm lp, int* device_any, int* device_all , int Num_Warps_per_Block, int pshift) { - int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; - device_any[hipThreadIdx_x>>pshift] = __any(tid -77); - device_all[hipThreadIdx_x>>pshift] = __all(tid -77); + int tid = threadIdx.x + blockIdx.x * blockDim.x; + device_any[threadIdx.x>>pshift] = __any(tid -77); + device_all[threadIdx.x>>pshift] = __all(tid -77); } int main(int argc, char *argv[]) @@ -49,7 +49,7 @@ int main(int argc, char *argv[]) warpSize = devProp.warpSize; int w = warpSize; - pshift = 0; + pshift = 0; while (w >>= 1) ++pshift; printf ("warpSize=%d pshift=%d\n", warpSize, pshift); diff --git a/tests/src/deviceLib/hip_ballot.cpp b/tests/src/deviceLib/hip_ballot.cpp index 742c47a065..14b8f314a1 100644 --- a/tests/src/deviceLib/hip_ballot.cpp +++ b/tests/src/deviceLib/hip_ballot.cpp @@ -34,12 +34,12 @@ __global__ void gpu_ballot(hipLaunchParm lp, unsigned int* device_ballot, int Num_Warps_per_Block,int pshift) { - int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; - const unsigned int warp_num = hipThreadIdx_x >> pshift; + int tid = threadIdx.x + blockIdx.x * blockDim.x; + const unsigned int warp_num = threadIdx.x >> pshift; #ifdef __HIP_PLATFORM_HCC__ - atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popcll(__ballot(tid - 245))); + atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popcll(__ballot(tid - 245))); #else - atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popc(__ballot(tid - 245))); + atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popc(__ballot(tid - 245))); #endif } diff --git a/tests/src/deviceLib/hip_brev.cpp b/tests/src/deviceLib/hip_brev.cpp index 855a8bec47..c08c39dec9 100644 --- a/tests/src/deviceLib/hip_brev.cpp +++ b/tests/src/deviceLib/hip_brev.cpp @@ -72,8 +72,8 @@ HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned int* b,unsigned long long int* c, unsigned long long int* d, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { diff --git a/tests/src/deviceLib/hip_clz.cpp b/tests/src/deviceLib/hip_clz.cpp index bdb31f3e8d..53fd611184 100644 --- a/tests/src/deviceLib/hip_clz.cpp +++ b/tests/src/deviceLib/hip_clz.cpp @@ -83,8 +83,8 @@ HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned int* b,unsigned int* c, unsigned long long int* d, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { diff --git a/tests/src/deviceLib/hip_ffs.cpp b/tests/src/deviceLib/hip_ffs.cpp index c855ede060..49530bb298 100644 --- a/tests/src/deviceLib/hip_ffs.cpp +++ b/tests/src/deviceLib/hip_ffs.cpp @@ -73,8 +73,8 @@ HIP_kernel(hipLaunchParm lp, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { diff --git a/tests/src/deviceLib/hip_popc.cpp b/tests/src/deviceLib/hip_popc.cpp index e503e55b42..19dafb4d43 100644 --- a/tests/src/deviceLib/hip_popc.cpp +++ b/tests/src/deviceLib/hip_popc.cpp @@ -64,8 +64,8 @@ HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned int* b,unsigned int* c, unsigned long long int* d, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { diff --git a/tests/src/deviceLib/hip_test_ldg.cpp b/tests/src/deviceLib/hip_test_ldg.cpp index 171ff1afd0..63d50e881e 100644 --- a/tests/src/deviceLib/hip_test_ldg.cpp +++ b/tests/src/deviceLib/hip_test_ldg.cpp @@ -57,8 +57,8 @@ vectoradd_float(hipLaunchParm lp, T* a, const T* bm, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { diff --git a/tests/src/deviceLib/hip_test_make_type.cpp b/tests/src/deviceLib/hip_test_make_type.cpp index ef493ac923..6eba236e12 100644 --- a/tests/src/deviceLib/hip_test_make_type.cpp +++ b/tests/src/deviceLib/hip_test_make_type.cpp @@ -40,13 +40,13 @@ THE SOFTWARE. #define THREADS_PER_BLOCK_Z 1 -__global__ void +__global__ void vectoradd_char1(hipLaunchParm lp, - char1* a, const char1* bm, const char1* cm, int width, int height) + char1* a, const char1* bm, const char1* cm, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { @@ -54,40 +54,40 @@ vectoradd_char1(hipLaunchParm lp, } } -__global__ void +__global__ void vectoradd_char2(hipLaunchParm lp, - char2* a, const char2* bm, const char2* cm, int width, int height) + char2* a, const char2* bm, const char2* cm, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { a[i] = make_char2(bm[i].x, bm[i].y) + make_char2(cm[i].x, cm[i].y); } -} +} -__global__ void +__global__ void vectoradd_char3(hipLaunchParm lp, - char3* a, const char3* bm, const char3* cm, int width, int height) + char3* a, const char3* bm, const char3* cm, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { a[i] = make_char3(bm[i].x, bm[i].y, bm[i].z) + make_char3(cm[i].x, cm[i].y, cm[i].z); } } -__global__ void +__global__ void vectoradd_char4(hipLaunchParm lp, - char4* a, const char4* bm, const char4* cm, int width, int height) + char4* a, const char4* bm, const char4* cm, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { @@ -99,7 +99,7 @@ vectoradd_char4(hipLaunchParm lp, #if 0 __kernel__ void vectoradd_float(float* a, const float* b, const float* c, int width, int height) { - + int x = blockDimX * blockIdx.x + threadIdx.x; int y = blockDimY * blockIdy.y + threadIdx.y; @@ -128,21 +128,21 @@ bool dataTypesRun(){ hostA = (T*)malloc(NUM * sizeof(T)); hostB = (T*)malloc(NUM * sizeof(T)); hostC = (T*)malloc(NUM * sizeof(T)); - + // initialize the input data for (i = 0; i < NUM; i++) { hostB[i] = (T)i; hostC[i] = (T)i; } - + HIP_ASSERT(hipMalloc((void**)&deviceA, NUM * sizeof(T))); HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T))); HIP_ASSERT(hipMalloc((void**)&deviceC, NUM * sizeof(T))); - + HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice)); HIP_ASSERT(hipMemcpy(deviceC, hostC, NUM*sizeof(T), hipMemcpyHostToDevice)); - hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1), + hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1), dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, @@ -192,21 +192,21 @@ bool dataTypesRun(){ hostA = (T*)malloc(NUM * sizeof(T)); hostB = (T*)malloc(NUM * sizeof(T)); hostC = (T*)malloc(NUM * sizeof(T)); - + // initialize the input data for (i = 0; i < NUM; i++) { hostB[i] = (T)i; hostC[i] = (T)i; } - + HIP_ASSERT(hipMalloc((void**)&deviceA, NUM * sizeof(T))); HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T))); HIP_ASSERT(hipMalloc((void**)&deviceC, NUM * sizeof(T))); - + HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice)); HIP_ASSERT(hipMemcpy(deviceC, hostC, NUM*sizeof(T), hipMemcpyHostToDevice)); - hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1), + hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1), dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, @@ -256,21 +256,21 @@ bool dataTypesRun(){ hostA = (T*)malloc(NUM * sizeof(T)); hostB = (T*)malloc(NUM * sizeof(T)); hostC = (T*)malloc(NUM * sizeof(T)); - + // initialize the input data for (i = 0; i < NUM; i++) { hostB[i] = (T)i; hostC[i] = (T)i; } - + HIP_ASSERT(hipMalloc((void**)&deviceA, NUM * sizeof(T))); HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T))); HIP_ASSERT(hipMalloc((void**)&deviceC, NUM * sizeof(T))); - + HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice)); HIP_ASSERT(hipMemcpy(deviceC, hostC, NUM*sizeof(T), hipMemcpyHostToDevice)); - hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1), + hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1), dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, @@ -319,21 +319,21 @@ bool dataTypesRunChar4(){ hostA = (T*)malloc(NUM * sizeof(T)); hostB = (T*)malloc(NUM * sizeof(T)); hostC = (T*)malloc(NUM * sizeof(T)); - + // initialize the input data for (i = 0; i < NUM; i++) { hostB[i] = (T)i; hostC[i] = (T)i; } - + HIP_ASSERT(hipMalloc((void**)&deviceA, NUM * sizeof(T))); HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T))); HIP_ASSERT(hipMalloc((void**)&deviceC, NUM * sizeof(T))); - + HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice)); HIP_ASSERT(hipMemcpy(deviceC, hostC, NUM*sizeof(T), hipMemcpyHostToDevice)); - hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1), + hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1), dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, @@ -368,7 +368,7 @@ bool dataTypesRunChar4(){ } int main() { - + hipDeviceProp_t devProp; hipGetDeviceProperties(&devProp, 0); cout << " System minor " << devProp.minor << endl; diff --git a/tests/src/deviceLib/hip_trig.cpp b/tests/src/deviceLib/hip_trig.cpp index 5ec28101f3..6ee8dc58ad 100644 --- a/tests/src/deviceLib/hip_trig.cpp +++ b/tests/src/deviceLib/hip_trig.cpp @@ -36,7 +36,7 @@ THE SOFTWARE. #define SIZE LEN<<2 __global__ void kernel_trig(hipLaunchParm lp, float *In, float *sin_d, float *cos_d, float *tan_d, float *sin_pd, float *cos_pd){ - int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tid = threadIdx.x + blockIdx.x * blockDim.x; sin_d[tid] = __sinf(In[tid]); cos_d[tid] = __cosf(In[tid]); tan_d[tid] = __tanf(In[tid]); diff --git a/tests/src/experimental/xcompile/hHip.c b/tests/src/experimental/xcompile/hHip.c index 2ac4ebc73e..17e7e9ecf6 100644 --- a/tests/src/experimental/xcompile/hHip.c +++ b/tests/src/experimental/xcompile/hHip.c @@ -29,7 +29,7 @@ THE SOFTWARE. __global__ void Add(hipLaunchParm lp, float *Ad, float *Bd, float *Cd, size_t len) { - int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tx = threadIdx.x + blockIdx.x * blockDim.x; if(tx < len) { Cd[tx] = Ad[tx] + Bd[tx]; diff --git a/tests/src/experimental/xcompile/hipxxKer.cpp b/tests/src/experimental/xcompile/hipxxKer.cpp index d1bbed63cd..5dca6c1bca 100644 --- a/tests/src/experimental/xcompile/hipxxKer.cpp +++ b/tests/src/experimental/xcompile/hipxxKer.cpp @@ -30,7 +30,7 @@ THE SOFTWARE. __global__ void Kern(hipLaunchParm lp, float *A) { - int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tx = threadIdx.x + blockIdx.x * blockDim.x; A[tx] += 1.0f; } diff --git a/tests/src/experimental/xcompile/hxxHip.cpp b/tests/src/experimental/xcompile/hxxHip.cpp index 6a748d5c89..bca5d64afc 100644 --- a/tests/src/experimental/xcompile/hxxHip.cpp +++ b/tests/src/experimental/xcompile/hxxHip.cpp @@ -33,7 +33,7 @@ class memManager; template __global__ void Add(hipLaunchParm lp, T* Ad, T* Bd, T* Cd, size_t Len) { - int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tx = threadIdx.x + blockIdx.x * blockDim.x; if(tx < Len) { Cd[tx] = Ad[tx] + Bd[tx]; diff --git a/tests/src/hipC.c b/tests/src/hipC.c index 644df6c98f..efa03bb909 100644 --- a/tests/src/hipC.c +++ b/tests/src/hipC.c @@ -34,7 +34,7 @@ THE SOFTWARE. #define SIZE 1024*1024*sizeof(int) __global__ void Iter(hipLaunchParm lp, int *Ad){ - int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tx = threadIdx.x + blockIdx.x * blockDim.x; if(tx == 0){ for(int i=0;i(my_sdata); #endif - size_t gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - size_t tid = hipThreadIdx_x; + size_t gid = (blockIdx.x * blockDim.x + threadIdx.x); + size_t tid = threadIdx.x; // initialize dynamic shared memory if (tid < groupElements) { diff --git a/tests/src/kernel/hipDynamicShared2.cpp b/tests/src/kernel/hipDynamicShared2.cpp index 95e70a9956..4567ff6fc2 100644 --- a/tests/src/kernel/hipDynamicShared2.cpp +++ b/tests/src/kernel/hipDynamicShared2.cpp @@ -34,7 +34,7 @@ THE SOFTWARE. __global__ void vectorAdd(hipLaunchParm lp, float *Ad, float *Bd) { HIP_DYNAMIC_SHARED(float, sBd); - int tx = hipThreadIdx_x; + int tx = threadIdx.x; for(int i=0;i syncMsg = {"event", "stream", "device"}; void CheckHostPointer(int numElements, int *ptr, unsigned eventFlags, int syncMethod, std::string msg) { - std::cerr << "test: CheckHostPointer " << msg + std::cerr << "test: CheckHostPointer " << msg //<< " HIP_COHERENT_HOST_ALLOC=" << HIP_COHERENT_HOST_ALLOC //<< " HIP_EVENT_SYS_RELEASE=" << HIP_EVENT_SYS_RELEASE - << " eventFlags = " << std::hex << eventFlags - << ((eventFlags & hipEventReleaseToDevice) ? " hipEventReleaseToDevice" : "") - << ((eventFlags & hipEventReleaseToSystem) ? " hipEventReleaseToSystem" : "") - << " ptr=" << ptr + << " eventFlags = " << std::hex << eventFlags + << ((eventFlags & hipEventReleaseToDevice) ? " hipEventReleaseToDevice" : "") + << ((eventFlags & hipEventReleaseToSystem) ? " hipEventReleaseToSystem" : "") + << " ptr=" << ptr << " syncMethod=" << syncMsg[syncMethod] << "\n"; hipStream_t s; @@ -93,7 +93,7 @@ void CheckHostPointer(int numElements, int *ptr, unsigned eventFlags, int syncMe default: assert(0); }; - + for (int i=0; i __global__ void Inc(hipLaunchParm lp, float *Ad){ - int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tx = threadIdx.x + blockIdx.x * blockDim.x; Ad[tx] = Ad[tx] + float(1); } template -void doMemCopy(size_t numElements, int offset, T *A, T *Bh, T *Bd, bool internalRegister) +void doMemCopy(size_t numElements, int offset, T *A, T *Bh, T *Bd, bool internalRegister) { A = A + offset; numElements -= offset; @@ -56,7 +56,7 @@ void doMemCopy(size_t numElements, int offset, T *A, T *Bh, T *Bd, bool internal HIPCHECK(hipMemset(Bd, 13.0f, sizeBytes)); - // + // HIPCHECK(hipMemcpy(Bd, A, sizeBytes, hipMemcpyHostToDevice)); HIPCHECK(hipMemcpy(Bh, Bd, sizeBytes, hipMemcpyDeviceToHost)); @@ -81,7 +81,7 @@ int main(int argc, char *argv[]) const size_t size = N * sizeof(float); - if (p_tests & 0x1) { + if (p_tests & 0x1) { float *A, **Ad; int num_devices; HIPCHECK(hipGetDeviceCount(&num_devices)); @@ -115,7 +115,7 @@ int main(int argc, char *argv[]) } - if (p_tests & 0x6) { + if (p_tests & 0x6) { // Sensitize HIP bug if device does not match where the memory was registered. HIPCHECK(hipSetDevice(0)); @@ -129,7 +129,7 @@ int main(int argc, char *argv[]) HIPCHECK(hipMalloc(&Bd, size)); // TODO - set to 128 -#define OFFSETS_TO_TRY 128 +#define OFFSETS_TO_TRY 128 assert (N>OFFSETS_TO_TRY); if (p_tests & 0x2) { diff --git a/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp b/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp index 22bd30689a..5cd46c808a 100644 --- a/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp +++ b/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp @@ -59,7 +59,7 @@ struct HostTraits static const char *Name() { return "Pinned"; } ; static void *Alloc(size_t sizeBytes) { - void *p; + void *p; HIPCHECK(hipHostMalloc((void**)&p, sizeBytes, hipHostMallocDefault)); return p; }; @@ -67,11 +67,11 @@ struct HostTraits template -__global__ void +__global__ void addK (hipLaunchParm lp, T *A, T K, size_t numElements) { - size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - size_t stride = hipBlockDim_x * hipGridDim_x ; + size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x ; for (size_t i=offset; i -void test_pingpong(hipStream_t stream, size_t numElements, int numInflight, int numPongs, bool doHostSide) +void test_pingpong(hipStream_t stream, size_t numElements, int numInflight, int numPongs, bool doHostSide) { HIPASSERT(numElements % numInflight == 0); // Must be evenly divisible. size_t Nbytes = numElements*sizeof(T); @@ -95,7 +95,7 @@ void test_pingpong(hipStream_t stream, size_t numElements, int numInflight, int unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); printf ("-----------------------------------------------------------------------------------------------\n"); - printf ("testing: %s<%s> Nbytes=%zu (%6.1f MB) numPongs=%d numInflight=%d eachCopyElements=%zu eachCopyBytes=%zu\n", + printf ("testing: %s<%s> Nbytes=%zu (%6.1f MB) numPongs=%d numInflight=%d eachCopyElements=%zu eachCopyBytes=%zu\n", __func__, HostTraits::Name(), Nbytes, (double)(Nbytes)/1024.0/1024.0, numPongs, numInflight, eachCopyElements, eachCopyBytes); T *A_h = NULL; @@ -176,7 +176,7 @@ void test_manyInflightCopies(hipStream_t stream, int numElements, int numCopies, size_t eachCopyBytes = eachCopyElements * sizeof(T); printf ("-----------------------------------------------------------------------------------------------\n"); - printf ("testing: %s Nbytes=%zu (%6.1f MB) numCopies=%d eachCopyElements=%zu eachCopyBytes=%zu\n", + printf ("testing: %s Nbytes=%zu (%6.1f MB) numCopies=%d eachCopyElements=%zu eachCopyBytes=%zu\n", __func__, Nbytes, (double)(Nbytes)/1024.0/1024.0, numCopies, eachCopyElements, eachCopyBytes); T *A_d; @@ -194,7 +194,7 @@ void test_manyInflightCopies(hipStream_t stream, int numElements, int numCopies, //stream=0; // fixme TODO - for (int i=0; i=0; i--) { diff --git a/tests/src/runtimeApi/module/hipModule.cpp b/tests/src/runtimeApi/module/hipModule.cpp index 1b7b62cff2..f2c2137738 100644 --- a/tests/src/runtimeApi/module/hipModule.cpp +++ b/tests/src/runtimeApi/module/hipModule.cpp @@ -34,7 +34,7 @@ THE SOFTWARE. #define kernel_name "hello_world" __global__ void Cpy(hipLaunchParm lp, float *Ad, float* Bd){ - int tx = hipThreadIdx_x; + int tx = threadIdx.x; Bd[tx] = Ad[tx]; } diff --git a/tests/src/runtimeApi/module/vcpy_kernel.cpp b/tests/src/runtimeApi/module/vcpy_kernel.cpp index 0375eee342..7ee1ad333b 100644 --- a/tests/src/runtimeApi/module/vcpy_kernel.cpp +++ b/tests/src/runtimeApi/module/vcpy_kernel.cpp @@ -24,7 +24,7 @@ THE SOFTWARE. extern "C" __global__ void hello_world(hipLaunchParm lp, float *a, float *b) { - int tx = hipThreadIdx_x; + int tx = threadIdx.x; b[tx] = a[tx]; } diff --git a/tests/src/runtimeApi/multiThread/hipMultiThreadStreams2.cpp b/tests/src/runtimeApi/multiThread/hipMultiThreadStreams2.cpp index 43a3e9bdea..3727901645 100644 --- a/tests/src/runtimeApi/multiThread/hipMultiThreadStreams2.cpp +++ b/tests/src/runtimeApi/multiThread/hipMultiThreadStreams2.cpp @@ -35,7 +35,7 @@ THE SOFTWARE. template __global__ void Inc(hipLaunchParm lp, T *Array){ -int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; +int tx = threadIdx.x + blockIdx.x * blockDim.x; Array[tx] = Array[tx] + T(1); } @@ -116,7 +116,7 @@ int main(int argc, char **argv) } const size_t size = N * sizeof(float); - + for (int i=0; i< iterations; i++) { std::thread t1(run1, size, stream[0]); @@ -126,7 +126,7 @@ int main(int argc, char **argv) // std::cout<<"T1"< __global__ void Inc(hipLaunchParm lp, T *In){ -int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; +int tx = threadIdx.x + blockIdx.x * blockDim.x; In[tx] = In[tx] + 1; } @@ -94,12 +94,12 @@ void initArrays(T **Ad, T **Ah, } template -void initArrays(T **Ad, size_t N, - bool deviceMemory = false, +void initArrays(T **Ad, size_t N, + bool deviceMemory = false, bool usePinnedHost = false){ size_t NBytes = N * sizeof(T); if(deviceMemory){ - HIPCHECK( hipMalloc(Ad, NBytes)); + HIPCHECK( hipMalloc(Ad, NBytes)); }else{ if(usePinnedHost){ HIPCHECK(hipHostMalloc((void**)Ad, NBytes, hipHostMallocDefault)); diff --git a/tests/src/runtimeApi/synchronization/copy_coherency.cpp b/tests/src/runtimeApi/synchronization/copy_coherency.cpp index 2e514e1e3a..b2a66f61e2 100644 --- a/tests/src/runtimeApi/synchronization/copy_coherency.cpp +++ b/tests/src/runtimeApi/synchronization/copy_coherency.cpp @@ -27,14 +27,14 @@ THE SOFTWARE. // TODO - add code object support here. /* HIT_START * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 - * RUN: %t + * RUN: %t * HIT_END */ // Test cache management (fences) and synchronization between kernel and copy commands. -// Exhaustively tests 3 command types (copy, kernel, module kernel), -// many sync types (see SyncType), followed by another command, across a sweep +// Exhaustively tests 3 command types (copy, kernel, module kernel), +// many sync types (see SyncType), followed by another command, across a sweep // of data sizes designed to stress various levels of the memory hierarchy. #include "hip/hip_runtime.h" @@ -102,8 +102,8 @@ MemcpyFunction g_moduleMemcpy("memcpyInt.hsaco", "memcpyIntKernel"); __global__ void memsetIntKernel(int * ptr, const int val, size_t numElements) { - int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - int stride = hipBlockDim_x * hipGridDim_x ; + int gid = (blockIdx.x * blockDim.x + threadIdx.x); + int stride = blockDim.x * gridDim.x ; for (size_t i= gid; i< numElements; i+=stride){ ptr[i] = val; } @@ -112,15 +112,15 @@ memsetIntKernel(int * ptr, const int val, size_t numElements) __global__ void memcpyIntKernel(int *dst, const int * src, size_t numElements) { - int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - int stride = hipBlockDim_x * hipGridDim_x ; + int gid = (blockIdx.x * blockDim.x + threadIdx.x); + int stride = blockDim.x * gridDim.x ; for (size_t i= gid; i< numElements; i+=stride){ dst[i] = src[i]; } }; -// CHeck arrays in reverse order, to more easily detect cases where +// CHeck arrays in reverse order, to more easily detect cases where // the copy is "partially" done. void checkReverse(const int *ptr, int numElements, int expected) { int mismatchCnt = 0; @@ -157,7 +157,7 @@ const char * CmdTypeStr(CmdType c) ENUM_CASE_STR(MODULE_KERNEL); default: return "UNKNOWN"; }; -} +} enum SyncType { @@ -223,16 +223,16 @@ void resetInputs( int * Ad, int * Bd, int *Cd, int *Ch, size_t numElements, int // Intended to test proper synchronization and cache flushing between CMDA and CMDB. // CMD are of type CmdType. All command copy memory, using either hipMemcpyAsync or kernel implementations. -// CmdA copies from Ad to Bd, +// CmdA copies from Ad to Bd, // Some form of synchronization is applied. // Then cmdB copies from Bd to Cd. // // Cd is then copied to host Ch using a memory copy. // // Correct result at the end is that Ch contains the contents originally in Ad (integer 0x42) -void runTestImpl(CmdType cmdAType, SyncType syncType, CmdType cmdBType, +void runTestImpl(CmdType cmdAType, SyncType syncType, CmdType cmdBType, hipStream_t stream1, hipStream_t stream2, int numElements, - int * Ad, int * Bd, int *Cd, int *Ch, + int * Ad, int * Bd, int *Cd, int *Ch, int expected) { hipEvent_t e; @@ -241,14 +241,14 @@ void runTestImpl(CmdType cmdAType, SyncType syncType, CmdType cmdBType, resetInputs(Ad, Bd, Cd, Ch, numElements, expected); const size_t sizeElements = numElements * sizeof(int); - fprintf (stderr, "test: runTest with %zu bytes (%6.2f MB) cmdA=%s; sync=%s; cmdB=%s\n", + fprintf (stderr, "test: runTest with %zu bytes (%6.2f MB) cmdA=%s; sync=%s; cmdB=%s\n", sizeElements, (double) (sizeElements/1024.0), CmdTypeStr(cmdAType), SyncTypeStr(syncType), CmdTypeStr(cmdBType)); if (SKIP_MODULE_KERNEL && ((cmdAType == MODULE_KERNEL) || (cmdBType == MODULE_KERNEL))) { fprintf (stderr, "warn: skipping since test infra does not yet support modules\n"); return; } - + // Step A: runCmd(cmdAType, Bd, Ad, stream1, numElements); @@ -334,7 +334,7 @@ void testWrapper(size_t numElements) fprintf (stderr, "test: init complete, start running tests\n"); - runTestImpl(COPY, EVENT_SYNC, KERNEL, stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected); + runTestImpl(COPY, EVENT_SYNC, KERNEL, stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected); for (int cmdA=0; cmdA - + extern "C" __global__ void memcpyIntKernel(hipLaunchParm lp, int *dst, const int * src, size_t numElements) { - int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - int stride = hipBlockDim_x * hipGridDim_x ; + int gid = (blockIdx.x * blockDim.x + threadIdx.x); + int stride = blockDim.x * gridDim.x ; for (size_t i= gid; i< numElements; i+=stride){ dst[i] = src[i]; } diff --git a/tests/src/stress/hipStressAsync.cpp b/tests/src/stress/hipStressAsync.cpp index e06e16809c..a142b41730 100644 --- a/tests/src/stress/hipStressAsync.cpp +++ b/tests/src/stress/hipStressAsync.cpp @@ -30,7 +30,7 @@ THE SOFTWARE. #define ITER 1<<10 __global__ void Iter(hipLaunchParm lp, int *Ad, int num){ - int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tx = threadIdx.x + blockIdx.x * blockDim.x; if(tx == 0){ for(int i = 0; i=0; i-=stride) { C_d[i] = A_d[i] + B_d[i]; @@ -169,8 +169,8 @@ addCount( const T *A_d, size_t NELEM, int count) { - size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - size_t stride = hipBlockDim_x * hipGridDim_x ; + size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x ; // Deliberately do this in an inefficient way to increase kernel runtime for (int i=0; i=0; i-=stride) { C_d[i] = val; @@ -219,12 +219,12 @@ void setDefaultData(size_t numElements, T *A_h, T* B_h, T *C_h) { // Initialize the host data: for (size_t i=0; i void initArrays(T **A_d, T **B_d, T **C_d, - T **A_h, T **B_h, T **C_h, - size_t N, bool usePinnedHost=false) + T **A_h, T **B_h, T **C_h, + size_t N, bool usePinnedHost=false) { size_t Nbytes = N*sizeof(T); @@ -317,7 +317,7 @@ void freeArraysForHost(T *A_h, T *B_h, T *C_h, bool usePinnedHost) template void freeArrays(T *A_d, T *B_d, T *C_d, - T *A_h, T *B_h, T *C_h, bool usePinnedHost) + T *A_h, T *B_h, T *C_h, bool usePinnedHost) { if (A_d) { HIPCHECK ( hipFree(A_d) ); @@ -453,9 +453,9 @@ struct Pinned { static const bool isPinned = true; static const char *str() { return "Pinned"; }; - static void *Alloc(size_t sizeBytes) + static void *Alloc(size_t sizeBytes) { - void *p; + void *p; HIPCHECK(hipHostMalloc((void**)&p, sizeBytes)); return p; }; @@ -463,12 +463,12 @@ struct Pinned { //--- -struct Unpinned +struct Unpinned { static const bool isPinned = false; static const char *str() { return "Unpinned"; }; - static void *Alloc(size_t sizeBytes) + static void *Alloc(size_t sizeBytes) { void *p = malloc (sizeBytes); HIPASSERT(p); @@ -496,7 +496,7 @@ template<> struct MemTraits { - static void Copy(void *dest, const void *src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream) + static void Copy(void *dest, const void *src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream) { HIPCHECK(hipMemcpy(dest, src, sizeBytes, kind)); } @@ -507,7 +507,7 @@ template<> struct MemTraits { - static void Copy(void *dest, const void *src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream) + static void Copy(void *dest, const void *src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream) { HIPCHECK(hipMemcpyAsync(dest, src, sizeBytes, kind, stream)); } diff --git a/tests/src/texture/hipTextureObj2D.cpp b/tests/src/texture/hipTextureObj2D.cpp index 443d708418..9ddafd6b1c 100644 --- a/tests/src/texture/hipTextureObj2D.cpp +++ b/tests/src/texture/hipTextureObj2D.cpp @@ -17,8 +17,8 @@ __global__ void tex2DKernel(float* outputData, int width, int height) { - int x = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x; - int y = hipBlockIdx_y*hipBlockDim_y + hipThreadIdx_y; + int x = blockIdx.x*blockDim.x + threadIdx.x; + int y = blockIdx.y*blockDim.y + threadIdx.y; outputData[y*width + x] = tex2D(textureObject, x, y); } diff --git a/tests/src/texture/hipTextureRef2D.cpp b/tests/src/texture/hipTextureRef2D.cpp index eb27b23230..4430ca722d 100644 --- a/tests/src/texture/hipTextureRef2D.cpp +++ b/tests/src/texture/hipTextureRef2D.cpp @@ -18,8 +18,8 @@ __global__ void tex2DKernel(float* outputData, int width, int height) { - int x = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x; - int y = hipBlockIdx_y*hipBlockDim_y + hipThreadIdx_y; + int x = blockIdx.x*blockDim.x + threadIdx.x; + int y = blockIdx.y*blockDim.y + threadIdx.y; #ifdef __HIP_PLATFORM_HCC__ outputData[y*width + x] = tex2D(tex, textureObject, x, y); #else