Revert "Revert adoption of CUDA indexing in general - this can only work with later versions of the compiler, just like module based dispatch, and thus must be guarded against usage in earlier (e.g. 1.6) versions."

This reverts commit d2fd1f5
2017-11-29 21:36:29 +00:00
parent b881cf713c
commit fbaf729f88
62 changed files with 152 additions and 152 deletions
@@ -40,8 +40,8 @@ __global__ void testExternSharedKernel(hipLaunchParm lp, const T* A_d, const T*
    T *sdata = reinterpret_cast<T *>(my_sdata);
 #endif

-    size_t gid = (blockIdx.x * blockDim.x + threadIdx.x);
-    size_t tid = threadIdx.x;
+    size_t gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t tid = hipThreadIdx_x;

    // initialize dynamic shared memory
    if (tid < groupElements) {
@@ -34,7 +34,7 @@ THE SOFTWARE.

 __global__ void vectorAdd(hipLaunchParm lp, float *Ad, float *Bd) {
  HIP_DYNAMIC_SHARED(float, sBd);
-  int tx = threadIdx.x;
+  int tx = hipThreadIdx_x;
  for(int i=0;i<LEN/64;i++) {
    sBd[tx + i * 64] = Ad[tx + i * 64] + 1.0f;
    Bd[tx + i * 64] = sBd[tx + i * 64];
@@ -48,8 +48,8 @@ vectorADD2( hipLaunchParm lp,
            T *C_d,
            size_t N)
 {
-    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
-    size_t stride = blockDim.x * gridDim.x ;
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;

    for (size_t i=offset; i<N; i+=stride) {
        double foo = __hiloint2double(A_d[i], B_d[i]);
@@ -62,7 +62,7 @@ __global__ void MyKernel (const hipLaunchParm lp, const float *a, const float *b
 {
    //KERNELBEGIN;

-    unsigned gid = threadIdx.x;
+    unsigned gid = hipThreadIdx_x;
    if (gid < N) {
        c[gid] = a[gid] + PlusOne(b[gid]);
    }
@@ -96,7 +96,7 @@ vectorADD(const hipLaunchParm lp,
    int zuzu = deviceVar + 1;


-    int b = threadIdx.x;
+    int b = hipThreadIdx_x;
    int c;
 #ifdef NOT_YET
    int a = __shfl_up(x, 1);
@@ -119,8 +119,8 @@ vectorADD(const hipLaunchParm lp,
    __syncthreads();


-    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
-    size_t stride = blockDim.x * gridDim.x ;
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;

    for (size_t i=offset; i<N; i+=stride) {
 		C_d[i] = A_d[i] + B_d[i];
@@ -38,7 +38,7 @@ __constant__ int Value[LEN];

 __global__ void Get(hipLaunchParm lp, int *Ad)
 {
-    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
    Ad[tid] = Value[tid];
 }

@@ -33,12 +33,12 @@ THE SOFTWARE.
 #define SIZE NUM * 8

 __global__ void Alloc(hipLaunchParm lp, uint64_t *Ptr) {
-    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
    Ptr[tid] = (uint64_t)malloc(128);
 }

 __global__ void Free(hipLaunchParm lp, uint64_t *Ptr) {
-    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
    free((void*)Ptr[tid]);
 }

@@ -35,52 +35,52 @@ THE SOFTWARE.
 #define LEN12 12 * 4

 __global__ void MemCpy8(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
  memcpy(Out + tid*8, In + tid*8, 8);
 }

 __global__ void MemCpy9(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
  memcpy(Out + tid*9, In + tid*9, 9);
 }

 __global__ void MemCpy10(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
  memcpy(Out + tid*10, In + tid*10, 10);
 }

 __global__ void MemCpy11(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
  memcpy(Out + tid*11, In + tid*11, 11);
 }

 __global__ void MemCpy12(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
  memcpy(Out + tid*12, In + tid*12, 12);
 }

 __global__ void MemSet8(hipLaunchParm lp, uint8_t *In) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
  memset(In + tid*8, 1, 8);
 }

 __global__ void MemSet9(hipLaunchParm lp, uint8_t *In) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
  memset(In + tid*9, 1, 9);
 }

 __global__ void MemSet10(hipLaunchParm lp, uint8_t *In) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
  memset(In + tid*10, 1, 10);
 }

 __global__ void MemSet11(hipLaunchParm lp, uint8_t *In) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
  memset(In + tid*11, 1, 11);
 }

 __global__ void MemSet12(hipLaunchParm lp, uint8_t *In) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
  memset(In + tid*12, 1, 12);
 }

@@ -37,7 +37,7 @@ __global__ void vadd_asm(hipLaunchParm lp,
                                float *out,
                                float *in)
 {
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;

 #ifdef __HIP_PLATFORM_NVCC__
        asm volatile("add.f32 %0,%1,%2;":"=f"(out[i]):"f"(in[i]),"f"(out[i]));
@@ -35,7 +35,7 @@ __global__ void vmac_asm(hipLaunchParm lp,
                                float *out,
                                float *in)
 {
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;

    asm volatile ("v_mac_f32_e32 %0, %2, %3" : "=v" (out[i]) : "0"(out[i]), "v" (a), "v" (in[i]));
 }
@@ -33,7 +33,7 @@ void
 __launch_bounds__(256, 2)
 myKern(hipLaunchParm lp, int *C, const int *A, int N, int xfactor)
 {
-    int tid = (blockIdx.x * blockDim.x + threadIdx.x);
+    int tid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);

    if (tid < N) {
        C[tid] = A[tid];