From fbaf729f88353841ca2eb4f9b24f593bbb3b14ce Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu@amd.com>
Date: Wed, 29 Nov 2017 21:36:29 +0000
Subject: [PATCH] Revert "Revert adoption of CUDA indexing in general - this
 can only work with later versions of the compiler, just like module based
 dispatch, and thus must be guarded against usage in earlier (e.g. 1.6)
 versions."

This reverts commit d2fd1f5
---
 samples/0_Intro/square/square.cu              |  4 +--
 src/device_util.cpp                           |  4 +--
 src/hip_memory.cpp                            |  4 +--
 .../device/hipFuncDeviceSynchronize.cpp       |  2 +-
 tests/src/deviceLib/hipComplex.cpp            |  2 +-
 tests/src/deviceLib/hipDeviceMemcpy.cpp       |  4 +--
 tests/src/deviceLib/hipFloatMath.cpp          |  2 +-
 tests/src/deviceLib/hipSimpleAtomicsTest.cpp  |  2 +-
 tests/src/deviceLib/hipTestDevice.cpp         | 32 +++++++++----------
 tests/src/deviceLib/hipTestDeviceDouble.cpp   | 28 ++++++++--------
 tests/src/deviceLib/hipTestDeviceSymbol.cpp   |  2 +-
 tests/src/deviceLib/hipTestHalf.cpp           |  4 +--
 tests/src/deviceLib/hipThreadFence.cpp        |  2 +-
 tests/src/deviceLib/hip_anyall.cpp            |  6 ++--
 tests/src/deviceLib/hip_ballot.cpp            |  8 ++---
 tests/src/deviceLib/hip_brev.cpp              |  4 +--
 tests/src/deviceLib/hip_clz.cpp               |  4 +--
 tests/src/deviceLib/hip_ffs.cpp               |  4 +--
 tests/src/deviceLib/hip_popc.cpp              |  4 +--
 tests/src/deviceLib/hip_test_ldg.cpp          |  4 +--
 tests/src/deviceLib/hip_test_make_type.cpp    | 20 ++++++------
 tests/src/deviceLib/hip_trig.cpp              |  2 +-
 tests/src/experimental/xcompile/hHip.c        |  2 +-
 tests/src/experimental/xcompile/hipxxKer.cpp  |  2 +-
 tests/src/experimental/xcompile/hxxHip.cpp    |  2 +-
 tests/src/hipC.c                              |  2 +-
 tests/src/hipC.cpp                            |  2 +-
 tests/src/hipCKernel.c                        |  2 +-
 tests/src/kernel/hipDynamicShared.cpp         |  4 +--
 tests/src/kernel/hipDynamicShared2.cpp        |  2 +-
 tests/src/kernel/hipGridLaunch.cpp            |  4 +--
 tests/src/kernel/hipLanguageExtensions.cpp    |  8 ++---
 tests/src/kernel/hipTestConstant.cpp          |  2 +-
 tests/src/kernel/hipTestMallocKernel.cpp      |  4 +--
 tests/src/kernel/hipTestMemKernel.cpp         | 20 ++++++------
 tests/src/kernel/inline_asm_vadd.cpp          |  2 +-
 tests/src/kernel/inline_asm_vmac.cpp          |  2 +-
 tests/src/kernel/launch_bounds.cpp            |  2 +-
 .../device/hipDeviceSynchronize.cpp           |  2 +-
 .../src/runtimeApi/memory/hipHostGetFlags.cpp |  2 +-
 tests/src/runtimeApi/memory/hipHostMalloc.cpp |  4 +--
 .../src/runtimeApi/memory/hipHostRegister.cpp |  2 +-
 .../src/runtimeApi/memory/hipMemcpyAsync.cpp  |  4 +--
 .../memory/hipMemoryAllocateCoherent.cpp      |  2 +-
 .../runtimeApi/memory/p2p_copy_coherency.cpp  |  8 ++---
 tests/src/runtimeApi/module/hipModule.cpp     |  2 +-
 tests/src/runtimeApi/module/vcpy_kernel.cpp   |  2 +-
 .../multiThread/hipMultiThreadStreams2.cpp    |  2 +-
 .../runtimeApi/stream/hipAPIStreamDisable.cpp |  4 +--
 .../runtimeApi/stream/hipAPIStreamEnable.cpp  |  4 +--
 tests/src/runtimeApi/stream/hipNullStream.cpp |  4 +--
 tests/src/runtimeApi/stream/hipStream.h       |  2 +-
 .../synchronization/copy_coherency.cpp        |  8 ++---
 .../synchronization/memcpyInt.device.cpp      |  4 +--
 tests/src/specialFunc.cu                      |  2 +-
 tests/src/stress/hipStressAsync.cpp           |  2 +-
 tests/src/stress/hipStressChain.cpp           |  2 +-
 tests/src/stress/hipStressKernel.cpp          |  2 +-
 tests/src/stress/hipStressSync.cpp            |  2 +-
 tests/src/test_common.h                       | 20 ++++++------
 tests/src/texture/hipTextureObj2D.cpp         |  4 +--
 tests/src/texture/hipTextureRef2D.cpp         |  4 +--
 62 files changed, 152 insertions(+), 152 deletions(-)

diff --git a/samples/0_Intro/square/square.cu b/samples/0_Intro/square/square.cu
index 82b31db14a..ccaa9ae0bc 100644
--- a/samples/0_Intro/square/square.cu
+++ b/samples/0_Intro/square/square.cu
@@ -40,8 +40,8 @@ template <typename T>
 __global__ void
 vector_square(T *C_d, const T *A_d, size_t N)
 {
-    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
-    size_t stride = blockDim.x * gridDim.x ;
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;
 
     for (size_t i=offset; i<N; i+=stride) {
         C_d[i] = A_d[i] * A_d[i];
diff --git a/src/device_util.cpp b/src/device_util.cpp
index b6aebdfce0..298c8b3725 100644
--- a/src/device_util.cpp
+++ b/src/device_util.cpp
@@ -45,8 +45,8 @@ __device__ void *__hip_hc_malloc(size_t size)
     {
         return (void*)nullptr;
     }
-    uint32_t totalThreads = blockDim.x * gridDim.x * blockDim.y * gridDim.y * blockDim.z * gridDim.z;
-    uint32_t currentWorkItem = threadIdx.x + blockDim.x * blockIdx.x;
+    uint32_t totalThreads = hipBlockDim_x * hipGridDim_x * hipBlockDim_y * hipGridDim_y * hipBlockDim_z * hipGridDim_z;
+    uint32_t currentWorkItem = hipThreadIdx_x + hipBlockDim_x * hipBlockIdx_x;
 
     uint32_t numHeapsPerWorkItem = NUM_PAGES / totalThreads;
     uint32_t heapSizePerWorkItem = SIZE_OF_HEAP / totalThreads;
diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp
index c88a1dabc1..59db00796d 100644
--- a/src/hip_memory.cpp
+++ b/src/hip_memory.cpp
@@ -1307,9 +1307,9 @@ namespace
     __global__
     void hip_fill_n(RandomAccessIterator f, N n, T value)
     {
-        const uint32_t grid_dim = gridDim.x * blockDim.x;
+        const uint32_t grid_dim = hipGridDim_x * hipBlockDim_x;
 
-        size_t idx = blockIdx.x * block_dim + threadIdx.x;
+        size_t idx = hipBlockIdx_x * block_dim + hipThreadIdx_x;
         while (idx < n) {
             __builtin_memcpy(
                 reinterpret_cast<void*>(&f[idx]),
diff --git a/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp b/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp
index c8c2e644c3..dac56bf709 100644
--- a/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp
+++ b/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp
@@ -34,7 +34,7 @@ THE SOFTWARE.
 #define NUM_STREAMS 2
 
 __global__ void Iter(hipLaunchParm lp, int *Ad, int num){
-    int tx = threadIdx.x + blockIdx.x * blockDim.x;
+    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     // Kernel loop designed to execute very slowly... ... ...   so we can test timing-related behavior below
     if(tx == 0){
         for(int i = 0; i<num;i++){
diff --git a/tests/src/deviceLib/hipComplex.cpp b/tests/src/deviceLib/hipComplex.cpp
index c01b430339..8a153b6bf0 100644
--- a/tests/src/deviceLib/hipComplex.cpp
+++ b/tests/src/deviceLib/hipComplex.cpp
@@ -27,7 +27,7 @@ THE SOFTWARE.
 #define SIZE 64<<2
 
 __global__  void getSqAbs(hipLaunchParm lp, float *A, float *B, float *C){
-    int tx = threadIdx.x + blockIdx.x * blockDim.x;
+    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     C[tx] = hipCsqabsf(make_hipFloatComplex(A[tx], B[tx]));
 }
 
diff --git a/tests/src/deviceLib/hipDeviceMemcpy.cpp b/tests/src/deviceLib/hipDeviceMemcpy.cpp
index e845ae8f2f..23ccdee03b 100644
--- a/tests/src/deviceLib/hipDeviceMemcpy.cpp
+++ b/tests/src/deviceLib/hipDeviceMemcpy.cpp
@@ -16,13 +16,13 @@
 
 __global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In)
 {
-    int tx = threadIdx.x;
+    int tx = hipThreadIdx_x;
     memcpy(Out + tx, In + tx, sizeof(uint32_t));
 }
 
 __global__ void set(hipLaunchParm lp, uint32_t *ptr, uint8_t val, size_t size)
 {
-    int tx = threadIdx.x;
+    int tx = hipThreadIdx_x;
     memset(ptr + tx, val, sizeof(uint32_t));
 }
 
diff --git a/tests/src/deviceLib/hipFloatMath.cpp b/tests/src/deviceLib/hipFloatMath.cpp
index 9c97e50aa6..7a96b5cd0d 100644
--- a/tests/src/deviceLib/hipFloatMath.cpp
+++ b/tests/src/deviceLib/hipFloatMath.cpp
@@ -35,7 +35,7 @@ THE SOFTWARE.
 
 
 __global__ void floatMath(hipLaunchParm lp, float *In, float *Out) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
   Out[tid] = __cosf(In[tid]);
   Out[tid] = __exp10f(Out[tid]);
   Out[tid] = __expf(Out[tid]);
diff --git a/tests/src/deviceLib/hipSimpleAtomicsTest.cpp b/tests/src/deviceLib/hipSimpleAtomicsTest.cpp
index 5485e4afc9..d12bd6fceb 100644
--- a/tests/src/deviceLib/hipSimpleAtomicsTest.cpp
+++ b/tests/src/deviceLib/hipSimpleAtomicsTest.cpp
@@ -217,7 +217,7 @@ int computeGold(int *gpuData, const int len)
 __global__ void testKernel(hipLaunchParm lp,int *g_odata)
 {
     // access thread id
-    const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    const unsigned int tid = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
 
     // Test various atomic instructions
 
diff --git a/tests/src/deviceLib/hipTestDevice.cpp b/tests/src/deviceLib/hipTestDevice.cpp
index 068866021b..fa85940839 100644
--- a/tests/src/deviceLib/hipTestDevice.cpp
+++ b/tests/src/deviceLib/hipTestDevice.cpp
@@ -32,82 +32,82 @@ THE SOFTWARE.
 #define SIZE N*sizeof(float)
 
 __global__ void test_sincosf(hipLaunchParm lp, float* a, float* b, float *c){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     sincosf(a[tid], b+tid, c+tid);
 }
 
 __global__ void test_sincospif(hipLaunchParm lp, float* a, float* b, float *c){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     sincospif(a[tid], b+tid, c+tid);
 }
 
 __global__ void test_fdividef(hipLaunchParm lp, float *a, float* b, float *c){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     c[tid] = fdividef(a[tid], b[tid]);
 }
 
 __global__ void test_llrintf(hipLaunchParm lp, float *a, long long int *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     b[tid] = llrintf(a[tid]);
 }
 
 __global__ void test_lrintf(hipLaunchParm lp, float *a, long int *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     b[tid] = lrintf(a[tid]);
 }
 
 __global__ void test_rintf(hipLaunchParm lp, float *a, float *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     b[tid] = rintf(a[tid]);
 }
 
 __global__ void test_llroundf(hipLaunchParm lp, float *a, long long int *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     b[tid] = llroundf(a[tid]);
 }
 
 __global__ void test_lroundf(hipLaunchParm lp, float *a, long int *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     b[tid] = lroundf(a[tid]);
 }
 
 __global__ void test_rhypotf(hipLaunchParm lp, float *a, float* b, float *c){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     c[tid] = rhypotf(a[tid], b[tid]);
 }
 
 __global__ void test_norm3df(hipLaunchParm lp, float *a, float* b, float *c, float *d){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     d[tid] = norm3df(a[tid], b[tid], c[tid]);
 }
 
 __global__ void test_norm4df(hipLaunchParm lp, float *a, float* b, float *c, float *d, float *e){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     e[tid] = norm4df(a[tid], b[tid], c[tid], d[tid]);
 }
 
 __global__ void test_normf(hipLaunchParm lp, float *a, float *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     b[tid] = normf(N, a);
 }
 
 __global__ void test_rnorm3df(hipLaunchParm lp, float *a, float* b, float *c, float *d){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     d[tid] = rnorm3df(a[tid], b[tid], c[tid]);
 }
 
 __global__ void test_rnorm4df(hipLaunchParm lp, float *a, float* b, float *c, float *d, float *e){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     e[tid] = rnorm4df(a[tid], b[tid], c[tid], d[tid]);
 }
 
 __global__ void test_rnormf(hipLaunchParm lp, float *a, float *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     b[tid] = rnormf(N, a);
 }
 
 __global__ void test_erfinvf(hipLaunchParm lp, float *a, float *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     b[tid] = erff(erfinvf(a[tid]));
 }
 
diff --git a/tests/src/deviceLib/hipTestDeviceDouble.cpp b/tests/src/deviceLib/hipTestDeviceDouble.cpp
index 0b5d18fe5a..3b919d0cab 100644
--- a/tests/src/deviceLib/hipTestDeviceDouble.cpp
+++ b/tests/src/deviceLib/hipTestDeviceDouble.cpp
@@ -32,72 +32,72 @@ THE SOFTWARE.
 #define SIZE N*sizeof(double)
 
 __global__ void test_sincos(hipLaunchParm lp, double* a, double* b, double *c){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     sincos(a[tid], b+tid, c+tid);
 }
 
 __global__ void test_sincospi(hipLaunchParm lp, double* a, double* b, double *c){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     sincospi(a[tid], b+tid, c+tid);
 }
 
 __global__ void test_llrint(hipLaunchParm lp, double *a, long long int *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     b[tid] = llrint(a[tid]);
 }
 
 __global__ void test_lrint(hipLaunchParm lp, double *a, long int *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     b[tid] = lrint(a[tid]);
 }
 
 __global__ void test_rint(hipLaunchParm lp, double *a, double *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     b[tid] = rint(a[tid]);
 }
 
 __global__ void test_llround(hipLaunchParm lp, double *a, long long int *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     b[tid] = llround(a[tid]);
 }
 
 __global__ void test_lround(hipLaunchParm lp, double *a, long int *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     b[tid] = lround(a[tid]);
 }
 
 __global__ void test_rhypot(hipLaunchParm lp, double *a, double* b, double *c){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     c[tid] = rhypot(a[tid], b[tid]);
 }
 
 __global__ void test_norm3d(hipLaunchParm lp, double *a, double* b, double *c, double *d){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     d[tid] = norm3d(a[tid], b[tid], c[tid]);
 }
 
 __global__ void test_norm4d(hipLaunchParm lp, double *a, double* b, double *c, double *d, double *e){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     e[tid] = norm4d(a[tid], b[tid], c[tid], d[tid]);
 }
 
 __global__ void test_rnorm3d(hipLaunchParm lp, double *a, double* b, double *c, double *d){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     d[tid] = rnorm3d(a[tid], b[tid], c[tid]);
 }
 
 __global__ void test_rnorm4d(hipLaunchParm lp, double *a, double* b, double *c, double *d, double *e){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     e[tid] = rnorm4d(a[tid], b[tid], c[tid], d[tid]);
 }
 
 __global__ void test_rnorm(hipLaunchParm lp, double *a, double *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     b[tid] = rnorm(N, a);
 }
 
 __global__ void test_erfinv(hipLaunchParm lp, double *a, double *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
     b[tid] = erf(erfinv(a[tid]));
 }
 
diff --git a/tests/src/deviceLib/hipTestDeviceSymbol.cpp b/tests/src/deviceLib/hipTestDeviceSymbol.cpp
index be064f643c..140f3d97dc 100644
--- a/tests/src/deviceLib/hipTestDeviceSymbol.cpp
+++ b/tests/src/deviceLib/hipTestDeviceSymbol.cpp
@@ -36,7 +36,7 @@ __device__ int globalOut[NUM];
 
 __global__ void Assign(hipLaunchParm lp, int* Out)
 {
-    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     Out[tid] = globalIn[tid];
     globalOut[tid] = globalIn[tid];
 }
diff --git a/tests/src/deviceLib/hipTestHalf.cpp b/tests/src/deviceLib/hipTestHalf.cpp
index dd91e82440..f56538f080 100644
--- a/tests/src/deviceLib/hipTestHalf.cpp
+++ b/tests/src/deviceLib/hipTestHalf.cpp
@@ -29,7 +29,7 @@ THE SOFTWARE.
 #if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__
 
 __global__ void __halfMath(hipLaunchParm lp, __half *A, __half *B, __half *C) {
-  int tx = threadIdx.x;
+  int tx = hipThreadIdx_x;
   __half a = A[tx];
   __half b = B[tx];
   __half c = C[tx];
@@ -45,7 +45,7 @@ __global__ void __halfMath(hipLaunchParm lp, __half *A, __half *B, __half *C) {
 }
 
 __global__ void __half2Math(hipLaunchParm lp, __half2 *A, __half2 *B, __half2 *C) {
-  int tx = threadIdx.x;
+  int tx = hipThreadIdx_x;
   __half2 a = A[tx];
   __half2 b = B[tx];
   __half2 c = C[tx];
diff --git a/tests/src/deviceLib/hipThreadFence.cpp b/tests/src/deviceLib/hipThreadFence.cpp
index 2f73b68529..1bd9fbe02c 100644
--- a/tests/src/deviceLib/hipThreadFence.cpp
+++ b/tests/src/deviceLib/hipThreadFence.cpp
@@ -33,7 +33,7 @@ THE SOFTWARE.
 
 __global__ void vAdd(hipLaunchParm lp, float *In1, float *In2, float *In3, float *In4, float *Out)
 {
-    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     In4[tid] = In1[tid] + In2[tid];
     __threadfence();
     In3[tid] = In3[tid] + In4[tid];
diff --git a/tests/src/deviceLib/hip_anyall.cpp b/tests/src/deviceLib/hip_anyall.cpp
index f0b314ce8d..c51c90f33f 100644
--- a/tests/src/deviceLib/hip_anyall.cpp
+++ b/tests/src/deviceLib/hip_anyall.cpp
@@ -37,9 +37,9 @@ __global__ void
 	warpvote(hipLaunchParm lp, int* device_any, int* device_all , int Num_Warps_per_Block, int pshift)
 {
 
-   int tid = threadIdx.x + blockIdx.x * blockDim.x;
-   device_any[threadIdx.x>>pshift] = __any(tid -77);
-   device_all[threadIdx.x>>pshift] = __all(tid -77);
+   int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+   device_any[hipThreadIdx_x>>pshift] = __any(tid -77);
+   device_all[hipThreadIdx_x>>pshift] = __all(tid -77);
 }
 
 int main(int argc, char *argv[])
diff --git a/tests/src/deviceLib/hip_ballot.cpp b/tests/src/deviceLib/hip_ballot.cpp
index 14b8f314a1..742c47a065 100644
--- a/tests/src/deviceLib/hip_ballot.cpp
+++ b/tests/src/deviceLib/hip_ballot.cpp
@@ -34,12 +34,12 @@ __global__ void
 	gpu_ballot(hipLaunchParm lp, unsigned int* device_ballot, int Num_Warps_per_Block,int pshift)
 {
 
-   int tid = threadIdx.x + blockIdx.x * blockDim.x;
-   const unsigned int warp_num = threadIdx.x >> pshift;
+   int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+   const unsigned int warp_num = hipThreadIdx_x >> pshift;
 #ifdef __HIP_PLATFORM_HCC__
-   atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popcll(__ballot(tid - 245)));
+   atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popcll(__ballot(tid - 245)));
 #else
-	atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popc(__ballot(tid - 245)));
+	atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popc(__ballot(tid - 245)));
 #endif
 
 }
diff --git a/tests/src/deviceLib/hip_brev.cpp b/tests/src/deviceLib/hip_brev.cpp
index c08c39dec9..855a8bec47 100644
--- a/tests/src/deviceLib/hip_brev.cpp
+++ b/tests/src/deviceLib/hip_brev.cpp
@@ -72,8 +72,8 @@ HIP_kernel(hipLaunchParm lp,
              unsigned int* a, unsigned int* b,unsigned long long int* c, unsigned long long int* d, int width, int height)
   {
 
-      int x = blockDim.x * blockIdx.x + threadIdx.x;
-      int y = blockDim.y * blockIdx.y + threadIdx.y;
+      int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+      int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
 
       int i = y * width + x;
       if ( i < (width * height)) {
diff --git a/tests/src/deviceLib/hip_clz.cpp b/tests/src/deviceLib/hip_clz.cpp
index 53fd611184..bdb31f3e8d 100644
--- a/tests/src/deviceLib/hip_clz.cpp
+++ b/tests/src/deviceLib/hip_clz.cpp
@@ -83,8 +83,8 @@ HIP_kernel(hipLaunchParm lp,
     unsigned int* a, unsigned int* b,unsigned int* c, unsigned long long int* d, int width, int height)
   {
 
-      int x = blockDim.x * blockIdx.x + threadIdx.x;
-      int y = blockDim.y * blockIdx.y + threadIdx.y;
+      int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+      int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
 
       int i = y * width + x;
       if ( i < (width * height)) {
diff --git a/tests/src/deviceLib/hip_ffs.cpp b/tests/src/deviceLib/hip_ffs.cpp
index 49530bb298..c855ede060 100644
--- a/tests/src/deviceLib/hip_ffs.cpp
+++ b/tests/src/deviceLib/hip_ffs.cpp
@@ -73,8 +73,8 @@ HIP_kernel(hipLaunchParm lp,
 			    int width, int height)
   {
 
-      int x = blockDim.x * blockIdx.x + threadIdx.x;
-      int y = blockDim.y * blockIdx.y + threadIdx.y;
+      int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+      int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
 
       int i = y * width + x;
       if ( i < (width * height)) {
diff --git a/tests/src/deviceLib/hip_popc.cpp b/tests/src/deviceLib/hip_popc.cpp
index 19dafb4d43..e503e55b42 100644
--- a/tests/src/deviceLib/hip_popc.cpp
+++ b/tests/src/deviceLib/hip_popc.cpp
@@ -64,8 +64,8 @@ HIP_kernel(hipLaunchParm lp,
              unsigned int* a, unsigned int* b,unsigned  int* c, unsigned long long int* d, int width, int height)
   {
 
-      int x = blockDim.x * blockIdx.x + threadIdx.x;
-      int y = blockDim.y * blockIdx.y + threadIdx.y;
+      int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+      int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
 
       int i = y * width + x;
       if ( i < (width * height)) {
diff --git a/tests/src/deviceLib/hip_test_ldg.cpp b/tests/src/deviceLib/hip_test_ldg.cpp
index 4db522cc10..5540c4917d 100644
--- a/tests/src/deviceLib/hip_test_ldg.cpp
+++ b/tests/src/deviceLib/hip_test_ldg.cpp
@@ -57,8 +57,8 @@ vectoradd_float(hipLaunchParm lp,
              T* a, const T*  bm,  int width, int height)
 
   {
-      int x = blockDim.x * blockIdx.x + threadIdx.x;
-      int y = blockDim.y * blockIdx.y + threadIdx.y;
+      int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+      int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
 
       int i = y * width + x;
       if ( i < (width * height)) {
diff --git a/tests/src/deviceLib/hip_test_make_type.cpp b/tests/src/deviceLib/hip_test_make_type.cpp
index 6eba236e12..ce689ceb89 100644
--- a/tests/src/deviceLib/hip_test_make_type.cpp
+++ b/tests/src/deviceLib/hip_test_make_type.cpp
@@ -45,8 +45,8 @@ vectoradd_char1(hipLaunchParm lp,
              char1* a, const char1*  bm, const char1* cm, int width, int height)
 
   {
-      int x = blockDim.x * blockIdx.x + threadIdx.x;
-      int y = blockDim.y * blockIdx.y + threadIdx.y;
+      int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+      int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
 
       int i = y * width + x;
       if ( i < (width * height)) {
@@ -59,8 +59,8 @@ vectoradd_char2(hipLaunchParm lp,
              char2* a, const char2*  bm, const char2* cm, int width, int height)
 
   {
-      int x = blockDim.x * blockIdx.x + threadIdx.x;
-      int y = blockDim.y * blockIdx.y + threadIdx.y;
+      int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+      int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
 
       int i = y * width + x;
       if ( i < (width * height)) {
@@ -73,8 +73,8 @@ vectoradd_char3(hipLaunchParm lp,
              char3* a, const char3*  bm, const char3* cm, int width, int height)
 
   {
-      int x = blockDim.x * blockIdx.x + threadIdx.x;
-      int y = blockDim.y * blockIdx.y + threadIdx.y;
+      int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+      int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
 
       int i = y * width + x;
       if ( i < (width * height)) {
@@ -86,8 +86,8 @@ vectoradd_char4(hipLaunchParm lp,
              char4* a, const char4*  bm, const char4* cm, int width, int height)
 
   {
-      int x = blockDim.x * blockIdx.x + threadIdx.x;
-      int y = blockDim.y * blockIdx.y + threadIdx.y;
+      int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+      int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
 
       int i = y * width + x;
       if ( i < (width * height)) {
@@ -100,8 +100,8 @@ vectoradd_char4(hipLaunchParm lp,
 __kernel__ void vectoradd_float(float* a, const float* b, const float* c, int width, int height) {
 
 
-  int x = blockDimX * blockIdx.x + threadIdx.x;
-  int y = blockDimY * blockIdy.y + threadIdx.y;
+  int x = blockDimX * hipBlockIdx_x + hipThreadIdx_x;
+  int y = blockDimY * blockIdy.y + hipThreadIdx_y;
 
   int i = y * width + x;
   if ( i < (width * height)) {
diff --git a/tests/src/deviceLib/hip_trig.cpp b/tests/src/deviceLib/hip_trig.cpp
index 6ee8dc58ad..5ec28101f3 100644
--- a/tests/src/deviceLib/hip_trig.cpp
+++ b/tests/src/deviceLib/hip_trig.cpp
@@ -36,7 +36,7 @@ THE SOFTWARE.
 #define SIZE LEN<<2
 
 __global__ void kernel_trig(hipLaunchParm lp, float *In, float *sin_d, float *cos_d, float *tan_d, float *sin_pd, float *cos_pd){
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
   sin_d[tid] = __sinf(In[tid]);
   cos_d[tid] = __cosf(In[tid]);
   tan_d[tid] = __tanf(In[tid]);
diff --git a/tests/src/experimental/xcompile/hHip.c b/tests/src/experimental/xcompile/hHip.c
index 17e7e9ecf6..2ac4ebc73e 100644
--- a/tests/src/experimental/xcompile/hHip.c
+++ b/tests/src/experimental/xcompile/hHip.c
@@ -29,7 +29,7 @@ THE SOFTWARE.
 
 __global__ void Add(hipLaunchParm lp, float *Ad, float *Bd, float *Cd, size_t len)
 {
-    int tx = threadIdx.x + blockIdx.x * blockDim.x;
+    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     if(tx < len)
     {
         Cd[tx] = Ad[tx] + Bd[tx];
diff --git a/tests/src/experimental/xcompile/hipxxKer.cpp b/tests/src/experimental/xcompile/hipxxKer.cpp
index 5dca6c1bca..d1bbed63cd 100644
--- a/tests/src/experimental/xcompile/hipxxKer.cpp
+++ b/tests/src/experimental/xcompile/hipxxKer.cpp
@@ -30,7 +30,7 @@ THE SOFTWARE.
 
 __global__ void Kern(hipLaunchParm lp, float *A)
 {
-	int tx = threadIdx.x + blockIdx.x * blockDim.x;
+	int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
 	A[tx] += 1.0f;
 }
 
diff --git a/tests/src/experimental/xcompile/hxxHip.cpp b/tests/src/experimental/xcompile/hxxHip.cpp
index bca5d64afc..6a748d5c89 100644
--- a/tests/src/experimental/xcompile/hxxHip.cpp
+++ b/tests/src/experimental/xcompile/hxxHip.cpp
@@ -33,7 +33,7 @@ class memManager;
 template<typename T>
 __global__ void Add(hipLaunchParm lp, T* Ad, T* Bd, T* Cd, size_t Len)
 {
-    int tx = threadIdx.x + blockIdx.x * blockDim.x;
+    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     if(tx < Len)
     {
         Cd[tx] = Ad[tx] + Bd[tx];
diff --git a/tests/src/hipC.c b/tests/src/hipC.c
index efa03bb909..644df6c98f 100644
--- a/tests/src/hipC.c
+++ b/tests/src/hipC.c
@@ -34,7 +34,7 @@ THE SOFTWARE.
 #define SIZE 1024*1024*sizeof(int)
 
 __global__ void Iter(hipLaunchParm lp, int *Ad){
-    int tx = threadIdx.x + blockIdx.x * blockDim.x;
+    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     if(tx == 0){
         for(int i=0;i<ITER;i++){
             Ad[tx] += 1;
diff --git a/tests/src/hipC.cpp b/tests/src/hipC.cpp
index 6eac543ecf..8abb877808 100644
--- a/tests/src/hipC.cpp
+++ b/tests/src/hipC.cpp
@@ -29,7 +29,7 @@ THE SOFTWARE.
 #define SIZE 1024*1024*sizeof(int)
 
 __global__ void Iter(hipLaunchParm lp, int *Ad){
-    int tx = threadIdx.x + blockIdx.x * blockDim.x;
+    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     if(tx == 0){
         for(int i=0;i<ITER;i++){
             Ad[tx] += 1;
diff --git a/tests/src/hipCKernel.c b/tests/src/hipCKernel.c
index 891165f831..7a72cf84ca 100644
--- a/tests/src/hipCKernel.c
+++ b/tests/src/hipCKernel.c
@@ -2,7 +2,7 @@
 #include "hip/hip_runtime_api.h"
 
 __global__ void Kernel(hipLaunchParm lp, float *Ad){
-    int tx = threadIdx.x + blockIdx.x * blockDim.x;
+    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     Ad[tx] += Ad[tx-1];
 }
 
diff --git a/tests/src/kernel/hipDynamicShared.cpp b/tests/src/kernel/hipDynamicShared.cpp
index 4c16c52884..ba19fcaa0d 100644
--- a/tests/src/kernel/hipDynamicShared.cpp
+++ b/tests/src/kernel/hipDynamicShared.cpp
@@ -40,8 +40,8 @@ __global__ void testExternSharedKernel(hipLaunchParm lp, const T* A_d, const T*
     T *sdata = reinterpret_cast<T *>(my_sdata);
 #endif
 
-    size_t gid = (blockIdx.x * blockDim.x + threadIdx.x);
-    size_t tid = threadIdx.x;
+    size_t gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t tid = hipThreadIdx_x;
 
     // initialize dynamic shared memory
     if (tid < groupElements) {
diff --git a/tests/src/kernel/hipDynamicShared2.cpp b/tests/src/kernel/hipDynamicShared2.cpp
index 4567ff6fc2..95e70a9956 100644
--- a/tests/src/kernel/hipDynamicShared2.cpp
+++ b/tests/src/kernel/hipDynamicShared2.cpp
@@ -34,7 +34,7 @@ THE SOFTWARE.
 
 __global__ void vectorAdd(hipLaunchParm lp, float *Ad, float *Bd) {
   HIP_DYNAMIC_SHARED(float, sBd);
-  int tx = threadIdx.x;
+  int tx = hipThreadIdx_x;
   for(int i=0;i<LEN/64;i++) {
     sBd[tx + i * 64] = Ad[tx + i * 64] + 1.0f;
     Bd[tx + i * 64] = sBd[tx + i * 64];
diff --git a/tests/src/kernel/hipGridLaunch.cpp b/tests/src/kernel/hipGridLaunch.cpp
index dff6527e2c..992dad62f7 100644
--- a/tests/src/kernel/hipGridLaunch.cpp
+++ b/tests/src/kernel/hipGridLaunch.cpp
@@ -48,8 +48,8 @@ vectorADD2( hipLaunchParm lp,
             T *C_d,
             size_t N)
 {
-    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
-    size_t stride = blockDim.x * gridDim.x ;
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;
 
     for (size_t i=offset; i<N; i+=stride) {
         double foo = __hiloint2double(A_d[i], B_d[i]);
diff --git a/tests/src/kernel/hipLanguageExtensions.cpp b/tests/src/kernel/hipLanguageExtensions.cpp
index 5c5397675c..709fe3f1b8 100644
--- a/tests/src/kernel/hipLanguageExtensions.cpp
+++ b/tests/src/kernel/hipLanguageExtensions.cpp
@@ -62,7 +62,7 @@ __global__ void MyKernel (const hipLaunchParm lp, const float *a, const float *b
 {
     //KERNELBEGIN;
 
-    unsigned gid = threadIdx.x;
+    unsigned gid = hipThreadIdx_x;
     if (gid < N) {
         c[gid] = a[gid] + PlusOne(b[gid]);
     }
@@ -96,7 +96,7 @@ vectorADD(const hipLaunchParm lp,
     int zuzu = deviceVar + 1;
 
 
-    int b = threadIdx.x;
+    int b = hipThreadIdx_x;
     int c;
 #ifdef NOT_YET
     int a = __shfl_up(x, 1);
@@ -119,8 +119,8 @@ vectorADD(const hipLaunchParm lp,
     __syncthreads();
 
 
-    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
-    size_t stride = blockDim.x * gridDim.x ;
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;
 
     for (size_t i=offset; i<N; i+=stride) {
 		C_d[i] = A_d[i] + B_d[i];
diff --git a/tests/src/kernel/hipTestConstant.cpp b/tests/src/kernel/hipTestConstant.cpp
index 3922046a2e..e4d187b4d6 100644
--- a/tests/src/kernel/hipTestConstant.cpp
+++ b/tests/src/kernel/hipTestConstant.cpp
@@ -38,7 +38,7 @@ __constant__ int Value[LEN];
 
 __global__ void Get(hipLaunchParm lp, int *Ad)
 {
-    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     Ad[tid] = Value[tid];
 }
 
diff --git a/tests/src/kernel/hipTestMallocKernel.cpp b/tests/src/kernel/hipTestMallocKernel.cpp
index 4aa0eb1cf7..9dd8b053a5 100644
--- a/tests/src/kernel/hipTestMallocKernel.cpp
+++ b/tests/src/kernel/hipTestMallocKernel.cpp
@@ -33,12 +33,12 @@ THE SOFTWARE.
 #define SIZE NUM * 8
 
 __global__ void Alloc(hipLaunchParm lp, uint64_t *Ptr) {
-    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     Ptr[tid] = (uint64_t)malloc(128);
 }
 
 __global__ void Free(hipLaunchParm lp, uint64_t *Ptr) {
-    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     free((void*)Ptr[tid]);
 }
 
diff --git a/tests/src/kernel/hipTestMemKernel.cpp b/tests/src/kernel/hipTestMemKernel.cpp
index c44f275eed..bac1905cd1 100644
--- a/tests/src/kernel/hipTestMemKernel.cpp
+++ b/tests/src/kernel/hipTestMemKernel.cpp
@@ -35,52 +35,52 @@ THE SOFTWARE.
 #define LEN12 12 * 4
 
 __global__ void MemCpy8(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
   memcpy(Out + tid*8, In + tid*8, 8);
 }
 
 __global__ void MemCpy9(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
   memcpy(Out + tid*9, In + tid*9, 9);
 }
 
 __global__ void MemCpy10(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
   memcpy(Out + tid*10, In + tid*10, 10);
 }
 
 __global__ void MemCpy11(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
   memcpy(Out + tid*11, In + tid*11, 11);
 }
 
 __global__ void MemCpy12(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
   memcpy(Out + tid*12, In + tid*12, 12);
 }
 
 __global__ void MemSet8(hipLaunchParm lp, uint8_t *In) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
   memset(In + tid*8, 1, 8);
 }
 
 __global__ void MemSet9(hipLaunchParm lp, uint8_t *In) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
   memset(In + tid*9, 1, 9);
 }
 
 __global__ void MemSet10(hipLaunchParm lp, uint8_t *In) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
   memset(In + tid*10, 1, 10);
 }
 
 __global__ void MemSet11(hipLaunchParm lp, uint8_t *In) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
   memset(In + tid*11, 1, 11);
 }
 
 __global__ void MemSet12(hipLaunchParm lp, uint8_t *In) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
   memset(In + tid*12, 1, 12);
 }
 
diff --git a/tests/src/kernel/inline_asm_vadd.cpp b/tests/src/kernel/inline_asm_vadd.cpp
index 23406eefff..c191401172 100644
--- a/tests/src/kernel/inline_asm_vadd.cpp
+++ b/tests/src/kernel/inline_asm_vadd.cpp
@@ -37,7 +37,7 @@ __global__ void vadd_asm(hipLaunchParm lp,
                                 float *out,
                                 float *in)
 {
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
 
 #ifdef __HIP_PLATFORM_NVCC__
         asm volatile("add.f32 %0,%1,%2;":"=f"(out[i]):"f"(in[i]),"f"(out[i]));
diff --git a/tests/src/kernel/inline_asm_vmac.cpp b/tests/src/kernel/inline_asm_vmac.cpp
index a78da01e74..ba17633fcd 100644
--- a/tests/src/kernel/inline_asm_vmac.cpp
+++ b/tests/src/kernel/inline_asm_vmac.cpp
@@ -35,7 +35,7 @@ __global__ void vmac_asm(hipLaunchParm lp,
                                 float *out,
                                 float *in)
 {
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
 
     asm volatile ("v_mac_f32_e32 %0, %2, %3" : "=v" (out[i]) : "0"(out[i]), "v" (a), "v" (in[i]));
 }
diff --git a/tests/src/kernel/launch_bounds.cpp b/tests/src/kernel/launch_bounds.cpp
index 269b88e34d..3b1476fb11 100644
--- a/tests/src/kernel/launch_bounds.cpp
+++ b/tests/src/kernel/launch_bounds.cpp
@@ -33,7 +33,7 @@ void
 __launch_bounds__(256, 2)
 myKern(hipLaunchParm lp, int *C, const int *A, int N, int xfactor)
 {
-    int tid = (blockIdx.x * blockDim.x + threadIdx.x);
+    int tid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
 
     if (tid < N) {
         C[tid] = A[tid];
diff --git a/tests/src/runtimeApi/device/hipDeviceSynchronize.cpp b/tests/src/runtimeApi/device/hipDeviceSynchronize.cpp
index c8c2e644c3..dac56bf709 100644
--- a/tests/src/runtimeApi/device/hipDeviceSynchronize.cpp
+++ b/tests/src/runtimeApi/device/hipDeviceSynchronize.cpp
@@ -34,7 +34,7 @@ THE SOFTWARE.
 #define NUM_STREAMS 2
 
 __global__ void Iter(hipLaunchParm lp, int *Ad, int num){
-    int tx = threadIdx.x + blockIdx.x * blockDim.x;
+    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     // Kernel loop designed to execute very slowly... ... ...   so we can test timing-related behavior below
     if(tx == 0){
         for(int i = 0; i<num;i++){
diff --git a/tests/src/runtimeApi/memory/hipHostGetFlags.cpp b/tests/src/runtimeApi/memory/hipHostGetFlags.cpp
index f9359fb5da..9fad60aec8 100644
--- a/tests/src/runtimeApi/memory/hipHostGetFlags.cpp
+++ b/tests/src/runtimeApi/memory/hipHostGetFlags.cpp
@@ -33,7 +33,7 @@ THE SOFTWARE.
 #define SIZE LEN*sizeof(float)
 
 __global__ void Add(hipLaunchParm lp, float *Ad, float *Bd, float *Cd){
-int tx = threadIdx.x + blockIdx.x * blockDim.x;
+int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
 Cd[tx] = Ad[tx] + Bd[tx];
 }
 
diff --git a/tests/src/runtimeApi/memory/hipHostMalloc.cpp b/tests/src/runtimeApi/memory/hipHostMalloc.cpp
index 4210ef0654..f086c818f0 100644
--- a/tests/src/runtimeApi/memory/hipHostMalloc.cpp
+++ b/tests/src/runtimeApi/memory/hipHostMalloc.cpp
@@ -33,13 +33,13 @@
 #define SIZE LEN*sizeof(float)
 
 __global__ void Add(float *Ad, float *Bd, float *Cd){
-    int tx = threadIdx.x + blockIdx.x * blockDim.x;
+    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     Cd[tx] = Ad[tx] + Bd[tx];
 }
 
 
 __global__ void Set(int *Ad, int val){
-    int tx = threadIdx.x + blockIdx.x * blockDim.x;
+    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     Ad[tx] = val;
 }
 
diff --git a/tests/src/runtimeApi/memory/hipHostRegister.cpp b/tests/src/runtimeApi/memory/hipHostRegister.cpp
index a65196ba89..001cb84c07 100644
--- a/tests/src/runtimeApi/memory/hipHostRegister.cpp
+++ b/tests/src/runtimeApi/memory/hipHostRegister.cpp
@@ -29,7 +29,7 @@ THE SOFTWARE.
 #include<malloc.h>
 
 __global__ void Inc(hipLaunchParm lp, float *Ad){
-    int tx = threadIdx.x + blockIdx.x * blockDim.x;
+    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     Ad[tx] = Ad[tx] + float(1);
 }
 
diff --git a/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp b/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp
index 5cd46c808a..c4f4b23dc0 100644
--- a/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp
+++ b/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp
@@ -70,8 +70,8 @@ template<typename T>
 __global__ void
 addK (hipLaunchParm lp, T *A, T K, size_t numElements)
 {
-    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
-    size_t stride = blockDim.x * gridDim.x ;
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;
 
     for (size_t i=offset; i<numElements; i+=stride) {
         A[i] = A[i] + K;
diff --git a/tests/src/runtimeApi/memory/hipMemoryAllocateCoherent.cpp b/tests/src/runtimeApi/memory/hipMemoryAllocateCoherent.cpp
index 667d4b404b..6042f538b3 100644
--- a/tests/src/runtimeApi/memory/hipMemoryAllocateCoherent.cpp
+++ b/tests/src/runtimeApi/memory/hipMemoryAllocateCoherent.cpp
@@ -31,7 +31,7 @@ THE SOFTWARE.
 
 __global__ void Kernel(hipLaunchParm lp,volatile float* hostRes)
 {
-    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     hostRes[tid] = tid + 1;
     __threadfence_system();
     // expecting that the data is getting flushed to host here!
diff --git a/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp b/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp
index 81450f1fba..ee3384d813 100644
--- a/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp
+++ b/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp
@@ -63,8 +63,8 @@ int enablePeers(int dev0, int dev1)
 __global__ void
 memsetIntKernel(int * ptr, const int val, size_t numElements)
 {
-    int gid = (blockIdx.x * blockDim.x + threadIdx.x);
-    int stride = blockDim.x * gridDim.x ;
+    int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    int stride = hipBlockDim_x * hipGridDim_x ;
     for (size_t i= gid; i< numElements; i+=stride){
        ptr[i] = val;
     }
@@ -73,8 +73,8 @@ memsetIntKernel(int * ptr, const int val, size_t numElements)
 __global__ void
 memcpyIntKernel(const int * src, int* dst, size_t numElements)
 {
-    int gid = (blockIdx.x * blockDim.x + threadIdx.x);
-    int stride = blockDim.x * gridDim.x ;
+    int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    int stride = hipBlockDim_x * hipGridDim_x ;
     for (size_t i= gid; i< numElements; i+=stride){
        dst[i] = src[i];
     }
diff --git a/tests/src/runtimeApi/module/hipModule.cpp b/tests/src/runtimeApi/module/hipModule.cpp
index f2c2137738..1b7b62cff2 100644
--- a/tests/src/runtimeApi/module/hipModule.cpp
+++ b/tests/src/runtimeApi/module/hipModule.cpp
@@ -34,7 +34,7 @@ THE SOFTWARE.
 #define kernel_name "hello_world"
 
 __global__ void Cpy(hipLaunchParm lp, float *Ad, float* Bd){
-  int tx = threadIdx.x;
+  int tx = hipThreadIdx_x;
   Bd[tx] = Ad[tx];
 }
 
diff --git a/tests/src/runtimeApi/module/vcpy_kernel.cpp b/tests/src/runtimeApi/module/vcpy_kernel.cpp
index 7ee1ad333b..0375eee342 100644
--- a/tests/src/runtimeApi/module/vcpy_kernel.cpp
+++ b/tests/src/runtimeApi/module/vcpy_kernel.cpp
@@ -24,7 +24,7 @@ THE SOFTWARE.
 
 extern "C" __global__ void hello_world(hipLaunchParm lp, float *a, float *b)
 {
-    int tx = threadIdx.x;
+    int tx = hipThreadIdx_x;
     b[tx] = a[tx];
 }
 
diff --git a/tests/src/runtimeApi/multiThread/hipMultiThreadStreams2.cpp b/tests/src/runtimeApi/multiThread/hipMultiThreadStreams2.cpp
index 3727901645..780372f263 100644
--- a/tests/src/runtimeApi/multiThread/hipMultiThreadStreams2.cpp
+++ b/tests/src/runtimeApi/multiThread/hipMultiThreadStreams2.cpp
@@ -35,7 +35,7 @@ THE SOFTWARE.
 
 template<typename T>
 __global__ void Inc(hipLaunchParm lp, T *Array){
-int tx = threadIdx.x + blockIdx.x * blockDim.x;
+int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
 Array[tx] = Array[tx] + T(1);
 }
 
diff --git a/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp b/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp
index 66b93a164f..4e343121ed 100644
--- a/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp
+++ b/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp
@@ -29,7 +29,7 @@ THE SOFTWARE.
 const int NN = 1 << 21;
 
 __global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){
-	int tid = threadIdx.x;
+	int tid = hipThreadIdx_x;
 	if(tid < 1){
 		for(int i=0;i<n;i++){
 			x[i] = sqrt(powf(3.14159,i));
@@ -39,7 +39,7 @@ __global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){
 }
 
 __global__ void nKernel(hipLaunchParm lp, float *y){
-	int tid = threadIdx.x;
+	int tid = hipThreadIdx_x;
 	y[tid] = y[tid] + 1.0f;
 }
 
diff --git a/tests/src/runtimeApi/stream/hipAPIStreamEnable.cpp b/tests/src/runtimeApi/stream/hipAPIStreamEnable.cpp
index e91b62231f..63cde4563e 100644
--- a/tests/src/runtimeApi/stream/hipAPIStreamEnable.cpp
+++ b/tests/src/runtimeApi/stream/hipAPIStreamEnable.cpp
@@ -31,7 +31,7 @@ THE SOFTWARE.
 const int NN = 1 << 21;
 
 __global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){
-	int tid = threadIdx.x;
+	int tid = hipThreadIdx_x;
 	if(tid < 1){
 		for(int i=0;i<n;i++){
 			x[i] = sqrt(powf(3.14159,i));
@@ -41,7 +41,7 @@ __global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){
 }
 
 __global__ void nKernel(hipLaunchParm lp, float *y){
-	int tid = threadIdx.x;
+	int tid = hipThreadIdx_x;
 	y[tid] = y[tid] + 1.0f;
 }
 
diff --git a/tests/src/runtimeApi/stream/hipNullStream.cpp b/tests/src/runtimeApi/stream/hipNullStream.cpp
index 7a4cd978a1..04a232f3bb 100644
--- a/tests/src/runtimeApi/stream/hipNullStream.cpp
+++ b/tests/src/runtimeApi/stream/hipNullStream.cpp
@@ -41,8 +41,8 @@ vectorADDRepeat(hipLaunchParm lp,
             size_t NELEM,
             int repeat)
 {
-    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
-    size_t stride = blockDim.x * gridDim.x ;
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;
 
     for (int j=1; j<=repeat;j++) {
         for (size_t i=offset; i<NELEM; i+=stride) {
diff --git a/tests/src/runtimeApi/stream/hipStream.h b/tests/src/runtimeApi/stream/hipStream.h
index 0ce06bbc3f..583275cc91 100644
--- a/tests/src/runtimeApi/stream/hipStream.h
+++ b/tests/src/runtimeApi/stream/hipStream.h
@@ -73,7 +73,7 @@ void D2H(T *Dst, T *Src, size_t size){
 
 template<typename T>
 __global__ void Inc(hipLaunchParm lp, T *In){
-int tx = threadIdx.x + blockIdx.x * blockDim.x;
+int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
 In[tx] = In[tx] + 1;
 }
 
diff --git a/tests/src/runtimeApi/synchronization/copy_coherency.cpp b/tests/src/runtimeApi/synchronization/copy_coherency.cpp
index b2a66f61e2..e4bfb98206 100644
--- a/tests/src/runtimeApi/synchronization/copy_coherency.cpp
+++ b/tests/src/runtimeApi/synchronization/copy_coherency.cpp
@@ -102,8 +102,8 @@ MemcpyFunction g_moduleMemcpy("memcpyInt.hsaco", "memcpyIntKernel");
 __global__ void
 memsetIntKernel(int * ptr, const int val, size_t numElements)
 {
-    int gid = (blockIdx.x * blockDim.x + threadIdx.x);
-    int stride = blockDim.x * gridDim.x ;
+    int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    int stride = hipBlockDim_x * hipGridDim_x ;
     for (size_t i= gid; i< numElements; i+=stride){
        ptr[i] = val;
     }
@@ -112,8 +112,8 @@ memsetIntKernel(int * ptr, const int val, size_t numElements)
 __global__ void
 memcpyIntKernel(int *dst, const int * src, size_t numElements)
 {
-    int gid = (blockIdx.x * blockDim.x + threadIdx.x);
-    int stride = blockDim.x * gridDim.x ;
+    int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    int stride = hipBlockDim_x * hipGridDim_x ;
     for (size_t i= gid; i< numElements; i+=stride){
        dst[i] = src[i];
     }
diff --git a/tests/src/runtimeApi/synchronization/memcpyInt.device.cpp b/tests/src/runtimeApi/synchronization/memcpyInt.device.cpp
index 2916d51bf9..b34d331682 100644
--- a/tests/src/runtimeApi/synchronization/memcpyInt.device.cpp
+++ b/tests/src/runtimeApi/synchronization/memcpyInt.device.cpp
@@ -5,8 +5,8 @@
 extern "C" __global__ void
 memcpyIntKernel(hipLaunchParm lp, int *dst, const int * src, size_t numElements)
 {
-    int gid = (blockIdx.x * blockDim.x + threadIdx.x);
-    int stride = blockDim.x * gridDim.x ;
+    int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    int stride = hipBlockDim_x * hipGridDim_x ;
     for (size_t i= gid; i< numElements; i+=stride){
        dst[i] = src[i];
     }
diff --git a/tests/src/specialFunc.cu b/tests/src/specialFunc.cu
index 085be062d9..744dcd8926 100644
--- a/tests/src/specialFunc.cu
+++ b/tests/src/specialFunc.cu
@@ -23,7 +23,7 @@ THE SOFTWARE.
 void __global__
 test_kernel(float *A) 
 {
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
 
     float a = __ballot(tid < 16);
     float b = __shfl(tid < 16);
diff --git a/tests/src/stress/hipStressAsync.cpp b/tests/src/stress/hipStressAsync.cpp
index a142b41730..1f8cab1a36 100644
--- a/tests/src/stress/hipStressAsync.cpp
+++ b/tests/src/stress/hipStressAsync.cpp
@@ -30,7 +30,7 @@ THE SOFTWARE.
 #define ITER 1<<10
 
 __global__ void Iter(hipLaunchParm lp, int *Ad, int num){
-    int tx = threadIdx.x + blockIdx.x * blockDim.x;
+    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     if(tx == 0){
         for(int i = 0; i<num;i++){
             Ad[tx] += 1;
diff --git a/tests/src/stress/hipStressChain.cpp b/tests/src/stress/hipStressChain.cpp
index 55582d85e5..97183a97fa 100644
--- a/tests/src/stress/hipStressChain.cpp
+++ b/tests/src/stress/hipStressChain.cpp
@@ -26,7 +26,7 @@ THE SOFTWARE.
 static size_t size[NUM_SIZE];
 
 __global__ void Add(hipLaunchParm lp, int *Ad){
-    int tx = threadIdx.x;
+    int tx = hipThreadIdx_x;
     Ad[tx] = Ad[tx] + tx;
 }
 
diff --git a/tests/src/stress/hipStressKernel.cpp b/tests/src/stress/hipStressKernel.cpp
index 90cc10894f..52d8fa1fe9 100644
--- a/tests/src/stress/hipStressKernel.cpp
+++ b/tests/src/stress/hipStressKernel.cpp
@@ -26,7 +26,7 @@ THE SOFTWARE.
 static size_t size[NUM_SIZE];
 
 __global__ void Add(hipLaunchParm lp, int *Ad){
-    int tx = threadIdx.x;
+    int tx = hipThreadIdx_x;
     Ad[tx] = Ad[tx] + tx;
 }
 
diff --git a/tests/src/stress/hipStressSync.cpp b/tests/src/stress/hipStressSync.cpp
index ef6a340937..8f0252bb61 100644
--- a/tests/src/stress/hipStressSync.cpp
+++ b/tests/src/stress/hipStressSync.cpp
@@ -30,7 +30,7 @@ THE SOFTWARE.
 #define ITER 1<<10
 
 __global__ void Iter(hipLaunchParm lp, int *Ad, int num){
-    int tx = threadIdx.x + blockIdx.x * blockDim.x;
+    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
     if(tx == 0){
         for(int i = 0; i<num;i++){
             Ad[tx] += 1;
diff --git a/tests/src/test_common.h b/tests/src/test_common.h
index 1b9744647b..b176da29d4 100644
--- a/tests/src/test_common.h
+++ b/tests/src/test_common.h
@@ -136,8 +136,8 @@ vectorADD(hipLaunchParm lp,
             T *C_d,
             size_t NELEM)
 {
-    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
-    size_t stride = blockDim.x * gridDim.x ;
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;
 
     for (size_t i=offset; i<NELEM; i+=stride) {
         C_d[i] = A_d[i] + B_d[i];
@@ -153,8 +153,8 @@ vectorADDReverse(hipLaunchParm lp,
             T *C_d,
             size_t NELEM)
 {
-    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
-    size_t stride = blockDim.x * gridDim.x ;
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;
 
     for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) {
         C_d[i] = A_d[i] + B_d[i];
@@ -169,8 +169,8 @@ addCount( const T *A_d,
         size_t NELEM,
         int count)
 {
-    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
-    size_t stride = blockDim.x * gridDim.x ;
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;
 
     // Deliberately do this in an inefficient way to increase kernel runtime
     for (int i=0; i<count; i++) {
@@ -188,8 +188,8 @@ addCountReverse( const T *A_d,
         int64_t NELEM,
         int count)
 {
-    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
-    size_t stride = blockDim.x * gridDim.x ;
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;
 
     // Deliberately do this in an inefficient way to increase kernel runtime
     for (int i=0; i<count; i++) {
@@ -205,8 +205,8 @@ __global__ void
 memsetReverse( T *C_d,  T val,
         int64_t NELEM)
 {
-    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
-    size_t stride = blockDim.x * gridDim.x ;
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;
 
     for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) {
         C_d[i] = val;
diff --git a/tests/src/texture/hipTextureObj2D.cpp b/tests/src/texture/hipTextureObj2D.cpp
index 9ddafd6b1c..443d708418 100644
--- a/tests/src/texture/hipTextureObj2D.cpp
+++ b/tests/src/texture/hipTextureObj2D.cpp
@@ -17,8 +17,8 @@ __global__ void tex2DKernel(float* outputData,
                              int width,
                              int height)
 {
-    int x = blockIdx.x*blockDim.x + threadIdx.x;
-    int y = blockIdx.y*blockDim.y + threadIdx.y;
+    int x = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x;
+    int y = hipBlockIdx_y*hipBlockDim_y + hipThreadIdx_y;
     outputData[y*width + x] = tex2D<float>(textureObject, x, y);
 }
 
diff --git a/tests/src/texture/hipTextureRef2D.cpp b/tests/src/texture/hipTextureRef2D.cpp
index c42f09d5a0..ebc7a04385 100644
--- a/tests/src/texture/hipTextureRef2D.cpp
+++ b/tests/src/texture/hipTextureRef2D.cpp
@@ -20,8 +20,8 @@ __global__ void tex2DKernel(float* outputData,
                              int width,
                              int height)
 {
-    int x = blockIdx.x*blockDim.x + threadIdx.x;
-    int y = blockIdx.y*blockDim.y + threadIdx.y;
+    int x = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x;
+    int y = hipBlockIdx_y*hipBlockDim_y + hipThreadIdx_y;
 #ifdef __HIP_PLATFORM_HCC__
     outputData[y*width + x] = tex2D(tex, textureObject, x, y);
 #else