From 8264d5d6bddecb3a6a0ab1285bf405a7a7ee003e Mon Sep 17 00:00:00 2001 From: 7SK Date: Tue, 21 Jun 2016 16:20:30 +0530 Subject: [PATCH] NVCC_COMPAT add support for both cuda compatible implementation and hcc(faster) implementation with test Change-Id: I79a22344f458391d7dffac5f147619a542e97e4e --- include/hcc_detail/hip_runtime.h | 5 +- src/device_util.cpp | 82 +++++++++++++----- tests/src/deviceLib/hip_anyall.cpp | 27 +++--- tests/src/deviceLib/hip_clz.cpp | 134 ++++++++--------------------- tests/src/deviceLib/hip_ffs.cpp | 16 +++- 5 files changed, 127 insertions(+), 137 deletions(-) diff --git a/include/hcc_detail/hip_runtime.h b/include/hcc_detail/hip_runtime.h index 21da60631a..22095b342d 100644 --- a/include/hcc_detail/hip_runtime.h +++ b/include/hcc_detail/hip_runtime.h @@ -37,9 +37,8 @@ THE SOFTWARE. #include #include - - - +// Define NVCC_COMPAT for CUDA compatibility +#define NVCC_COMPAT #define CUDA_SUCCESS hipSuccess #include diff --git a/src/device_util.cpp b/src/device_util.cpp index a307a1e377..ca9d858981 100644 --- a/src/device_util.cpp +++ b/src/device_util.cpp @@ -836,7 +836,7 @@ __device__ float erfcf(float x) } __device__ float erfcinvf(float y) { - return __hip_erfinvf(1 - y); + return __hip_erfinvf(1 - y); } __device__ float erfcxf(float x) { @@ -1697,75 +1697,111 @@ __device__ unsigned int test__popc(unsigned int input); __device__ unsigned int __popcll( unsigned long long int input) { - return hc::__popcount_u32_b64(input); + return hc::__popcount_u32_b64(input); } __device__ unsigned int __clz(unsigned int input) { - return hc::__firstbit_u32_u32( input); +#ifdef NVCC_COMPAT + return input == 0 ? 32 : hc::__firstbit_u32_u32( input); +#else + return hc::__firstbit_u32_u32( input); +#endif } __device__ unsigned int __clzll(unsigned long long int input) { - return hc::__firstbit_u32_u64( input); +#ifdef NVCC_COMPAT + return input == 0 ? 64 : hc::__firstbit_u32_u64( input); +#else + return hc::__firstbit_u32_u64( input); +#endif } -__device__ unsigned int __clz(int input) +__device__ unsigned int __clz( int input) { - return hc::__firstbit_u32_s32( input); +#ifdef NVCC_COMPAT + return input == 0 ? 32 : hc::__firstbit_u32_s32( input); +#else + return hc::__firstbit_u32_s32( input); +#endif } -__device__ unsigned int __clzll(long long int input) +__device__ unsigned int __clzll( long long int input) { - return hc::__firstbit_u32_s64( input); +#ifdef NVCC_COMPAT + return input == 0 ? 64 : hc::__firstbit_u32_s64( input); +#else + return hc::__firstbit_u32_s64( input); +#endif } __device__ unsigned int __ffs(unsigned int input) { - return hc::__lastbit_u32_u32( input)+1; +#ifdef NVCC_COMPAT + return hc::__lastbit_u32_u32( input)+1; +#else + return hc::__lastbit_u32_u32( input); +#endif } __device__ unsigned int __ffsll(unsigned long long int input) { - return hc::__lastbit_u32_u64( input)+1; +#ifdef NVCC_COMPAT + return hc::__lastbit_u32_u64( input)+1; +#else + return hc::__lastbit_u32_u64( input); +#endif } -__device__ unsigned int __ffs(int input) +__device__ unsigned int __ffs( int input) { - return hc::__lastbit_u32_s32( input)+1; +#ifdef NVCC_COMPAT + return hc::__lastbit_u32_s32( input)+1; +#else + return hc::__lastbit_u32_s32( input); +#endif } -__device__ unsigned int __ffsll(long long int input) +__device__ unsigned int __ffsll( long long int input) { - return hc::__lastbit_u32_s64( input)+1; +#ifdef NVCC_COMPAT + return hc::__lastbit_u32_s64( input)+1; +#else + return hc::__lastbit_u32_s64( input); +#endif } __device__ unsigned int __brev( unsigned int input) { - return hc::__bitrev_b32( input); + return hc::__bitrev_b32( input); } __device__ unsigned long long int __brevll( unsigned long long int input) { - return hc::__bitrev_b64( input); + return hc::__bitrev_b64( input); } // warp vote function __all __any __ballot __device__ int __all( int input) { - return hc::__all( input); + return hc::__all( input); } __device__ int __any( int input) { - if( hc::__any( input)!=0) return 1; - else return 0; +#ifdef NVCC_COMPAT + if( hc::__any( input)!=0) return 1; + else return 0; +#else + return hc::__any( input); +#endif } __device__ unsigned long long int __ballot( int input) { - return hc::__ballot( input); + return hc::__ballot( input); } // warp shuffle functions @@ -1809,11 +1845,11 @@ __device__ float __shfl_xor(float input, int lane_mask, int width) return hc::__shfl_xor(input,lane_mask,width); } -__host__ __device__ int min(int arg1, int arg2) -{ +__host__ __device__ int min(int arg1, int arg2) +{ return (int)(hc::precise_math::fmin((float)arg1, (float)arg2)); } -__host__ __device__ int max(int arg1, int arg2) +__host__ __device__ int max(int arg1, int arg2) { return (int)(hc::precise_math::fmax((float)arg1, (float)arg2)); } diff --git a/tests/src/deviceLib/hip_anyall.cpp b/tests/src/deviceLib/hip_anyall.cpp index 21e24d6443..2804e6211b 100644 --- a/tests/src/deviceLib/hip_anyall.cpp +++ b/tests/src/deviceLib/hip_anyall.cpp @@ -27,7 +27,7 @@ THE SOFTWARE. #include #define HIP_ASSERT(x) (assert((x)==hipSuccess)) -__global__ void +__global__ void warpvote(hipLaunchParm lp, int* device_any, int* device_all , int Num_Warps_per_Block, int pshift) { @@ -36,13 +36,11 @@ __global__ void device_all[hipThreadIdx_x>>pshift] = __all(tid -77); } - - int main(int argc, char *argv[]) { int warpSize, pshift; hipDeviceProp_t devProp; hipGetDeviceProperties(&devProp, 0); - if(strncmp(devProp.name,"Fiji",1)==0) + if(strncmp(devProp.name,"Fiji",1)==0) { warpSize =64; pshift =6; } @@ -53,14 +51,14 @@ int main(int argc, char *argv[]) int Num_Blocks_per_Grid = 1; int Num_Warps_per_Block = Num_Threads_per_Block/warpSize; int Num_Warps_per_Grid = (Num_Threads_per_Block*Num_Blocks_per_Grid)/warpSize; - + int * host_any = ( int*)malloc(Num_Warps_per_Grid*sizeof(int)); int * host_all = ( int*)malloc(Num_Warps_per_Grid*sizeof(int)); - int *device_any; + int *device_any; int *device_all; HIP_ASSERT(hipMalloc((void**)&device_any,Num_Warps_per_Grid*sizeof( int))); HIP_ASSERT(hipMalloc((void**)&device_all,Num_Warps_per_Grid*sizeof(int))); -for (int i=0; i #include "hip_runtime.h" - #define HIP_ASSERT(x) (assert((x)==hipSuccess)) - - -#define WIDTH 32 -#define HEIGHT 32 - +#define WIDTH 8 +#define HEIGHT 8 #define NUM (WIDTH*HEIGHT) #define THREADS_PER_BLOCK_X 8 @@ -43,41 +39,41 @@ THE SOFTWARE. unsigned int firstbit_u32(unsigned int a) { if (a == 0) +{ +#if defined (__HIP_PLATFORM_HCC__) && !defined ( NVCC_COMPAT ) + return -1; +#else + return 32; +#endif +} unsigned int pos = 0; while ((int )a > 0) { a <<= 1; pos++; } return pos; } -unsigned int firstbit_s32(int a) -{ - unsigned int u = a >= 0? a: ~a; // complement negative numbers - return firstbit_u32(u); -} unsigned int firstbit_u64(unsigned long long int a) { if (a == 0) +{ +#if defined (__HIP_PLATFORM_HCC__) && !defined ( NVCC_COMPAT ) return -1; +#else + return 64; +#endif +} unsigned int pos = 0; while ((long long int)a > 0) { a <<= 1; pos++; } return pos; } -unsigned int firstbit_s64(long long int a) -{ - unsigned long long int u = a >= 0? a: ~a; // complement negative numbers - return firstbit_u64(u); -} - - __global__ void HIP_kernel(hipLaunchParm lp, - unsigned int* a, unsigned int* b,unsigned int* c, unsigned long long int* d, - unsigned int* e, int* f,unsigned int* g, long long int* h, int width, int height) + unsigned int* a, unsigned int* b,unsigned int* c, unsigned long long int* d, int width, int height) { int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; @@ -87,14 +83,9 @@ HIP_kernel(hipLaunchParm lp, if ( i < (width * height)) { a[i] = __clz(b[i]); c[i] = __clzll(d[i]); - e[i] = __clz(f[i]); - g[i] = __clzll(h[i]); } - } - - using namespace std; int main() { @@ -103,19 +94,11 @@ int main() { unsigned int* hostB; unsigned int* hostC; unsigned long long int* hostD; - unsigned int* hostE; - int* hostF; - unsigned int* hostG; - long long int* hostH; unsigned int* deviceA; unsigned int* deviceB; unsigned int* deviceC; unsigned long long int* deviceD; - unsigned int* deviceE; - int* deviceF; - unsigned int* deviceG; - long long int* deviceH; hipDeviceProp_t devProp; hipGetDeviceProperties(&devProp, 0); @@ -125,57 +108,56 @@ int main() { cout << "hip Device prop succeeded " << endl ; - - int i; + unsigned int i; int errors; hostA = (unsigned int*)malloc(NUM * sizeof(unsigned int)); hostB = (unsigned int*)malloc(NUM * sizeof(unsigned int)); hostC = (unsigned int*)malloc(NUM * sizeof(unsigned int)); hostD = (unsigned long long int*)malloc(NUM * sizeof(unsigned long long int)); - hostE = (unsigned int*)malloc(NUM * sizeof(unsigned int)); - hostF = (int*)malloc(NUM * sizeof(int)); - hostG = (unsigned int*)malloc(NUM * sizeof(unsigned int)); - hostH = (long long int*)malloc(NUM * sizeof(long long int)); // initialize the input data for (i = 0; i < NUM; i++) { - hostB[i] = i; - hostD[i] = 1099511627776+i; - hostF[i] = -2100+i; - hostH[i] = 1099511627776+i; + hostB[i] = 419430*i; + hostD[i] = i; } HIP_ASSERT(hipMalloc((void**)&deviceA, NUM * sizeof(unsigned int))); HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(unsigned int))); HIP_ASSERT(hipMalloc((void**)&deviceC, NUM * sizeof(unsigned int))); HIP_ASSERT(hipMalloc((void**)&deviceD, NUM * sizeof(unsigned long long int))); - HIP_ASSERT(hipMalloc((void**)&deviceE, NUM * sizeof(unsigned int))); - HIP_ASSERT(hipMalloc((void**)&deviceF, NUM * sizeof(int))); - HIP_ASSERT(hipMalloc((void**)&deviceG, NUM * sizeof(unsigned int))); - HIP_ASSERT(hipMalloc((void**)&deviceH, NUM * sizeof(long long int))); HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(unsigned int), hipMemcpyHostToDevice)); HIP_ASSERT(hipMemcpy(deviceD, hostD, NUM*sizeof(unsigned long long int), hipMemcpyHostToDevice)); - HIP_ASSERT(hipMemcpy(deviceF, hostF, NUM*sizeof(int), hipMemcpyHostToDevice)); - HIP_ASSERT(hipMemcpy(deviceH, hostD, NUM*sizeof(long long int), hipMemcpyHostToDevice)); hipLaunchKernel(HIP_kernel, dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, - deviceA ,deviceB, deviceC,deviceD ,deviceE ,deviceF, deviceG,deviceH, WIDTH ,HEIGHT); + deviceA ,deviceB, deviceC ,deviceD , WIDTH ,HEIGHT); HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(unsigned int), hipMemcpyDeviceToHost)); HIP_ASSERT(hipMemcpy(hostC, deviceC, NUM*sizeof(unsigned int), hipMemcpyDeviceToHost)); - HIP_ASSERT(hipMemcpy(hostE, deviceE, NUM*sizeof(unsigned int), hipMemcpyDeviceToHost)); - HIP_ASSERT(hipMemcpy(hostG, deviceG, NUM*sizeof(unsigned int), hipMemcpyDeviceToHost)); // verify the results errors = 0; for (i = 0; i < NUM; i++) { - if (hostA[i] != firstbit_u32(hostB[i])) { + printf("gpu_clz =%d, cpu_clz =%d \n",hostA[i],firstbit_u32(hostB[i])); + if (hostA[i] != firstbit_u32(hostB[i])) { + errors++; + } + } + if (errors!=0) { + cout << "FAILED clz" << endl; + return -1; + } else { + cout << "__clz() checked!" << endl; + } + errors = 0; + for (i = 0; i < NUM; i++) { + printf("gpu_clzll =%d, cpu_clzll =%d \n",hostC[i],firstbit_u64(hostD[i])); + if (hostC[i] != firstbit_u64(hostD[i])) { errors++; } } @@ -183,43 +165,7 @@ int main() { cout << "FAILED clz" << endl; return -1; } else { - cout << "__clz_u() for unsigned checked!" << endl; - } - errors = 0; - for (i = 0; i < NUM; i++) { - if (hostC[i] != firstbit_u64(hostD[i])) { - errors++; - } - } - if (errors!=0) { - cout << "FAILED clz" << endl; - return -1; - } else { - cout << "__clzll_u() for unsigned checked!" << endl; - } - errors = 0; - for (i = 0; i < NUM; i++) { - if (hostE[i] != firstbit_s32(hostF[i])) { - errors++; - } - } - if (errors!=0) { - cout << "FAILED clz\n" << endl; - return -1; - } else { - cout << "__clz_s() checked!" << endl; - } - errors = 0; - for (i = 0; i < NUM; i++) { - if (hostG[i] != firstbit_s64(hostH[i])) { - errors++; - } - } - if (errors!=0) { - cout << "FAILED clz" << endl; - return -1; - } else { - cout << "__clzll_s() checked!" << endl; + cout << "__clzll() checked!" << endl; } cout << "clz test PASSED!" << endl; @@ -228,19 +174,11 @@ int main() { HIP_ASSERT(hipFree(deviceB)); HIP_ASSERT(hipFree(deviceC)); HIP_ASSERT(hipFree(deviceD)); - HIP_ASSERT(hipFree(deviceE)); - HIP_ASSERT(hipFree(deviceF)); - HIP_ASSERT(hipFree(deviceG)); - HIP_ASSERT(hipFree(deviceH)); free(hostA); free(hostB); free(hostC); free(hostD); - free(hostE); - free(hostF); - free(hostG); - free(hostH); return errors; } diff --git a/tests/src/deviceLib/hip_ffs.cpp b/tests/src/deviceLib/hip_ffs.cpp index a84ab7b268..77d31a6776 100644 --- a/tests/src/deviceLib/hip_ffs.cpp +++ b/tests/src/deviceLib/hip_ffs.cpp @@ -31,8 +31,8 @@ THE SOFTWARE. #define HIP_ASSERT(x) (assert((x)==hipSuccess)) -#define WIDTH 32 -#define HEIGHT 32 +#define WIDTH 8 +#define HEIGHT 8 #define NUM (WIDTH*HEIGHT) @@ -44,12 +44,20 @@ template int lastbit( T a) { if (a == 0) +#if defined (__HIP_PLATFORM_HCC__) && !defined ( NVCC_COMPAT ) + return -1; +#else return 0; +#endif int pos = 1; while ((a&1) != 1) { a >>= 1; pos++; } - return pos; +#if defined (__HIP_PLATFORM_HCC__) && !defined ( NVCC_COMPAT ) + return pos-1; +#else + return pos; +#endif } @@ -130,6 +138,7 @@ int main() { // verify the results errors = 0; for (i = 0; i < NUM; i++) { + printf("gpu_ffs =%d, cpu_ffs =%d \n",hostA[i],lastbit(hostB[i])); if (hostA[i] != lastbit(hostB[i])) { errors++; } @@ -142,6 +151,7 @@ int main() { } errors = 0; for (i = 0; i < NUM; i++) { + printf("gpu_ffsll =%d, cpu_ffsll =%d \n",hostC[i],lastbit(hostD[i])); if (hostC[i] != lastbit(hostD[i])) { errors++; }