Revert adoption of CUDA indexing in general - this can only work with later versions of the compiler, just like module based dispatch, and thus must be guarded against usage in earlier (e.g. 1.6) versions.

2017-11-29 21:01:28 +00:00
@@ -381,27 +381,29 @@ __device__ void  __threadfence_system(void) ;
 * @}
 */

-template<typename std::common_type<
-    decltype(hc_get_group_id),
-    decltype(hc_get_group_size),
-    decltype(hc_get_num_groups),
-    decltype(hc_get_workitem_id)>::type f>
-class Coordinates {
-    using R = decltype(f(0));
+#if __hcc_workweek__ >= 17481
+    template<typename std::common_type<
+        decltype(hc_get_group_id),
+        decltype(hc_get_group_size),
+        decltype(hc_get_num_groups),
+        decltype(hc_get_workitem_id)>::type f>
+    class Coordinates {
+        using R = decltype(f(0));

-    struct X { __device__ operator R() const { return f(0); } };
-    struct Y { __device__ operator R() const { return f(1); } };
-    struct Z { __device__ operator R() const { return f(2); } };
-public:
-    static constexpr X x{};
-    static constexpr Y y{};
-    static constexpr Z z{};
-};
+        struct X { __device__ operator R() const { return f(0); } };
+        struct Y { __device__ operator R() const { return f(1); } };
+        struct Z { __device__ operator R() const { return f(2); } };
+    public:
+        static constexpr X x{};
+        static constexpr Y y{};
+        static constexpr Z z{};
+    };

-static constexpr Coordinates<hc_get_group_size> blockDim;
-static constexpr Coordinates<hc_get_group_id> blockIdx;
-static constexpr Coordinates<hc_get_num_groups> gridDim;
-static constexpr Coordinates<hc_get_workitem_id> threadIdx;
+    static constexpr Coordinates<hc_get_group_size> blockDim;
+    static constexpr Coordinates<hc_get_group_id> blockIdx;
+    static constexpr Coordinates<hc_get_num_groups> gridDim;
+    static constexpr Coordinates<hc_get_workitem_id> threadIdx;
+#endif

 #define hipThreadIdx_x (hc_get_workitem_id(0))
 #define hipThreadIdx_y (hc_get_workitem_id(1))
@@ -40,8 +40,8 @@ template <typename T>
 __global__ void
 vector_square(T *C_d, const T *A_d, size_t N)
 {
-    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
-    size_t stride = blockDim.x * gridDim.x ;
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;

    for (size_t i=offset; i<N; i+=stride) {
        C_d[i] = A_d[i] * A_d[i];
@@ -45,8 +45,8 @@ __device__ void *__hip_hc_malloc(size_t size)
    {
        return (void*)nullptr;
    }
-    uint32_t totalThreads = blockDim.x * gridDim.x * blockDim.y * gridDim.y * blockDim.z * gridDim.z;
-    uint32_t currentWorkItem = threadIdx.x + blockDim.x * blockIdx.x;
+    uint32_t totalThreads = hipBlockDim_x * hipGridDim_x * hipBlockDim_y * hipGridDim_y * hipBlockDim_z * hipGridDim_z;
+    uint32_t currentWorkItem = hipThreadIdx_x + hipBlockDim_x * hipBlockIdx_x;

    uint32_t numHeapsPerWorkItem = NUM_PAGES / totalThreads;
    uint32_t heapSizePerWorkItem = SIZE_OF_HEAP / totalThreads;
@@ -1307,9 +1307,9 @@ namespace
    __global__
    void hip_fill_n(RandomAccessIterator f, N n, T value)
    {
-        const uint32_t grid_dim = gridDim.x * blockDim.x;
+        const uint32_t grid_dim = hipGridDim_x * hipBlockDim_x;

-        size_t idx = blockIdx.x * block_dim + threadIdx.x;
+        size_t idx = hipBlockIdx_x * block_dim + hipThreadIdx_x;
        while (idx < n) {
            __builtin_memcpy(
                reinterpret_cast<void*>(&f[idx]),
@@ -34,7 +34,7 @@ THE SOFTWARE.
 #define NUM_STREAMS 2

 __global__ void Iter(hipLaunchParm lp, int *Ad, int num){
-    int tx = threadIdx.x + blockIdx.x * blockDim.x;
+    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
    // Kernel loop designed to execute very slowly... ... ...   so we can test timing-related behavior below
    if(tx == 0){
        for(int i = 0; i<num;i++){
@@ -27,7 +27,7 @@ THE SOFTWARE.
 #define SIZE 64<<2

 __global__  void getSqAbs(hipLaunchParm lp, float *A, float *B, float *C){
-    int tx = threadIdx.x + blockIdx.x * blockDim.x;
+    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
    C[tx] = hipCsqabsf(make_hipFloatComplex(A[tx], B[tx]));
 }

@@ -16,13 +16,13 @@

 __global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In)
 {
-    int tx = threadIdx.x;
+    int tx = hipThreadIdx_x;
    memcpy(Out + tx, In + tx, sizeof(uint32_t));
 }

 __global__ void set(hipLaunchParm lp, uint32_t *ptr, uint8_t val, size_t size)
 {
-    int tx = threadIdx.x;
+    int tx = hipThreadIdx_x;
    memset(ptr + tx, val, sizeof(uint32_t));
 }

@@ -35,7 +35,7 @@ THE SOFTWARE.


 __global__ void floatMath(hipLaunchParm lp, float *In, float *Out) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
  Out[tid] = __cosf(In[tid]);
  Out[tid] = __exp10f(Out[tid]);
  Out[tid] = __expf(Out[tid]);
@@ -217,7 +217,7 @@ int computeGold(int *gpuData, const int len)
 __global__ void testKernel(hipLaunchParm lp,int *g_odata)
 {
    // access thread id
-    const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    const unsigned int tid = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;

    // Test various atomic instructions

@@ -32,82 +32,82 @@ THE SOFTWARE.
 #define SIZE N*sizeof(float)

 __global__ void test_sincosf(hipLaunchParm lp, float* a, float* b, float *c){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
    sincosf(a[tid], b+tid, c+tid);
 }

 __global__ void test_sincospif(hipLaunchParm lp, float* a, float* b, float *c){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
    sincospif(a[tid], b+tid, c+tid);
 }

 __global__ void test_fdividef(hipLaunchParm lp, float *a, float* b, float *c){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
    c[tid] = fdividef(a[tid], b[tid]);
 }

 __global__ void test_llrintf(hipLaunchParm lp, float *a, long long int *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
    b[tid] = llrintf(a[tid]);
 }

 __global__ void test_lrintf(hipLaunchParm lp, float *a, long int *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
    b[tid] = lrintf(a[tid]);
 }

 __global__ void test_rintf(hipLaunchParm lp, float *a, float *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
    b[tid] = rintf(a[tid]);
 }

 __global__ void test_llroundf(hipLaunchParm lp, float *a, long long int *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
    b[tid] = llroundf(a[tid]);
 }

 __global__ void test_lroundf(hipLaunchParm lp, float *a, long int *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
    b[tid] = lroundf(a[tid]);
 }

 __global__ void test_rhypotf(hipLaunchParm lp, float *a, float* b, float *c){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
    c[tid] = rhypotf(a[tid], b[tid]);
 }

 __global__ void test_norm3df(hipLaunchParm lp, float *a, float* b, float *c, float *d){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
    d[tid] = norm3df(a[tid], b[tid], c[tid]);
 }

 __global__ void test_norm4df(hipLaunchParm lp, float *a, float* b, float *c, float *d, float *e){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
    e[tid] = norm4df(a[tid], b[tid], c[tid], d[tid]);
 }

 __global__ void test_normf(hipLaunchParm lp, float *a, float *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
    b[tid] = normf(N, a);
 }

 __global__ void test_rnorm3df(hipLaunchParm lp, float *a, float* b, float *c, float *d){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
    d[tid] = rnorm3df(a[tid], b[tid], c[tid]);
 }

 __global__ void test_rnorm4df(hipLaunchParm lp, float *a, float* b, float *c, float *d, float *e){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
    e[tid] = rnorm4df(a[tid], b[tid], c[tid], d[tid]);
 }

 __global__ void test_rnormf(hipLaunchParm lp, float *a, float *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
    b[tid] = rnormf(N, a);
 }

 __global__ void test_erfinvf(hipLaunchParm lp, float *a, float *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
    b[tid] = erff(erfinvf(a[tid]));
 }

@@ -32,72 +32,72 @@ THE SOFTWARE.
 #define SIZE N*sizeof(double)

 __global__ void test_sincos(hipLaunchParm lp, double* a, double* b, double *c){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
    sincos(a[tid], b+tid, c+tid);
 }

 __global__ void test_sincospi(hipLaunchParm lp, double* a, double* b, double *c){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
    sincospi(a[tid], b+tid, c+tid);
 }

 __global__ void test_llrint(hipLaunchParm lp, double *a, long long int *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
    b[tid] = llrint(a[tid]);
 }

 __global__ void test_lrint(hipLaunchParm lp, double *a, long int *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
    b[tid] = lrint(a[tid]);
 }

 __global__ void test_rint(hipLaunchParm lp, double *a, double *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
    b[tid] = rint(a[tid]);
 }

 __global__ void test_llround(hipLaunchParm lp, double *a, long long int *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
    b[tid] = llround(a[tid]);
 }

 __global__ void test_lround(hipLaunchParm lp, double *a, long int *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
    b[tid] = lround(a[tid]);
 }

 __global__ void test_rhypot(hipLaunchParm lp, double *a, double* b, double *c){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
    c[tid] = rhypot(a[tid], b[tid]);
 }

 __global__ void test_norm3d(hipLaunchParm lp, double *a, double* b, double *c, double *d){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
    d[tid] = norm3d(a[tid], b[tid], c[tid]);
 }

 __global__ void test_norm4d(hipLaunchParm lp, double *a, double* b, double *c, double *d, double *e){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
    e[tid] = norm4d(a[tid], b[tid], c[tid], d[tid]);
 }

 __global__ void test_rnorm3d(hipLaunchParm lp, double *a, double* b, double *c, double *d){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
    d[tid] = rnorm3d(a[tid], b[tid], c[tid]);
 }

 __global__ void test_rnorm4d(hipLaunchParm lp, double *a, double* b, double *c, double *d, double *e){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
    e[tid] = rnorm4d(a[tid], b[tid], c[tid], d[tid]);
 }

 __global__ void test_rnorm(hipLaunchParm lp, double *a, double *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
    b[tid] = rnorm(N, a);
 }

 __global__ void test_erfinv(hipLaunchParm lp, double *a, double *b){
-    int tid = threadIdx.x;
+    int tid = hipThreadIdx_x;
    b[tid] = erf(erfinv(a[tid]));
 }

@@ -36,7 +36,7 @@ __device__ int globalOut[NUM];

 __global__ void Assign(hipLaunchParm lp, int* Out)
 {
-    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
    Out[tid] = globalIn[tid];
    globalOut[tid] = globalIn[tid];
 }
@@ -29,7 +29,7 @@ THE SOFTWARE.
 #if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__

 __global__ void __halfMath(hipLaunchParm lp, __half *A, __half *B, __half *C) {
-  int tx = threadIdx.x;
+  int tx = hipThreadIdx_x;
  __half a = A[tx];
  __half b = B[tx];
  __half c = C[tx];
@@ -45,7 +45,7 @@ __global__ void __halfMath(hipLaunchParm lp, __half *A, __half *B, __half *C) {
 }

 __global__ void __half2Math(hipLaunchParm lp, __half2 *A, __half2 *B, __half2 *C) {
-  int tx = threadIdx.x;
+  int tx = hipThreadIdx_x;
  __half2 a = A[tx];
  __half2 b = B[tx];
  __half2 c = C[tx];
@@ -33,7 +33,7 @@ THE SOFTWARE.

 __global__ void vAdd(hipLaunchParm lp, float *In1, float *In2, float *In3, float *In4, float *Out)
 {
-    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
    In4[tid] = In1[tid] + In2[tid];
    __threadfence();
    In3[tid] = In3[tid] + In4[tid];
@@ -37,9 +37,9 @@ __global__ void
 	warpvote(hipLaunchParm lp, int* device_any, int* device_all , int Num_Warps_per_Block, int pshift)
 {

-   int tid = threadIdx.x + blockIdx.x * blockDim.x;
-   device_any[threadIdx.x>>pshift] = __any(tid -77);
-   device_all[threadIdx.x>>pshift] = __all(tid -77);
+   int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+   device_any[hipThreadIdx_x>>pshift] = __any(tid -77);
+   device_all[hipThreadIdx_x>>pshift] = __all(tid -77);
 }

 int main(int argc, char *argv[])
@@ -34,12 +34,12 @@ __global__ void
 	gpu_ballot(hipLaunchParm lp, unsigned int* device_ballot, int Num_Warps_per_Block,int pshift)
 {

-   int tid = threadIdx.x + blockIdx.x * blockDim.x;
-   const unsigned int warp_num = threadIdx.x >> pshift;
+   int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+   const unsigned int warp_num = hipThreadIdx_x >> pshift;
 #ifdef __HIP_PLATFORM_HCC__
-   atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popcll(__ballot(tid - 245)));
+   atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popcll(__ballot(tid - 245)));
 #else
-	atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popc(__ballot(tid - 245)));
+	atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popc(__ballot(tid - 245)));
 #endif

 }
@@ -72,8 +72,8 @@ HIP_kernel(hipLaunchParm lp,
             unsigned int* a, unsigned int* b,unsigned long long int* c, unsigned long long int* d, int width, int height)
  {

-      int x = blockDim.x * blockIdx.x + threadIdx.x;
-      int y = blockDim.y * blockIdx.y + threadIdx.y;
+      int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+      int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

      int i = y * width + x;
      if ( i < (width * height)) {
@@ -83,8 +83,8 @@ HIP_kernel(hipLaunchParm lp,
    unsigned int* a, unsigned int* b,unsigned int* c, unsigned long long int* d, int width, int height)
  {

-      int x = blockDim.x * blockIdx.x + threadIdx.x;
-      int y = blockDim.y * blockIdx.y + threadIdx.y;
+      int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+      int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

      int i = y * width + x;
      if ( i < (width * height)) {
@@ -73,8 +73,8 @@ HIP_kernel(hipLaunchParm lp,
 			    int width, int height)
  {

-      int x = blockDim.x * blockIdx.x + threadIdx.x;
-      int y = blockDim.y * blockIdx.y + threadIdx.y;
+      int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+      int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

      int i = y * width + x;
      if ( i < (width * height)) {
@@ -64,8 +64,8 @@ HIP_kernel(hipLaunchParm lp,
             unsigned int* a, unsigned int* b,unsigned  int* c, unsigned long long int* d, int width, int height)
  {

-      int x = blockDim.x * blockIdx.x + threadIdx.x;
-      int y = blockDim.y * blockIdx.y + threadIdx.y;
+      int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+      int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

      int i = y * width + x;
      if ( i < (width * height)) {
@@ -57,8 +57,8 @@ vectoradd_float(hipLaunchParm lp,
             T* a, const T*  bm,  int width, int height)

  {
-      int x = blockDim.x * blockIdx.x + threadIdx.x;
-      int y = blockDim.y * blockIdx.y + threadIdx.y;
+      int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+      int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

      int i = y * width + x;
      if ( i < (width * height)) {
@@ -45,8 +45,8 @@ vectoradd_char1(hipLaunchParm lp,
             char1* a, const char1*  bm, const char1* cm, int width, int height)

  {
-      int x = blockDim.x * blockIdx.x + threadIdx.x;
-      int y = blockDim.y * blockIdx.y + threadIdx.y;
+      int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+      int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

      int i = y * width + x;
      if ( i < (width * height)) {
@@ -59,8 +59,8 @@ vectoradd_char2(hipLaunchParm lp,
             char2* a, const char2*  bm, const char2* cm, int width, int height)

  {
-      int x = blockDim.x * blockIdx.x + threadIdx.x;
-      int y = blockDim.y * blockIdx.y + threadIdx.y;
+      int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+      int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

      int i = y * width + x;
      if ( i < (width * height)) {
@@ -73,8 +73,8 @@ vectoradd_char3(hipLaunchParm lp,
             char3* a, const char3*  bm, const char3* cm, int width, int height)

  {
-      int x = blockDim.x * blockIdx.x + threadIdx.x;
-      int y = blockDim.y * blockIdx.y + threadIdx.y;
+      int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+      int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

      int i = y * width + x;
      if ( i < (width * height)) {
@@ -86,8 +86,8 @@ vectoradd_char4(hipLaunchParm lp,
             char4* a, const char4*  bm, const char4* cm, int width, int height)

  {
-      int x = blockDim.x * blockIdx.x + threadIdx.x;
-      int y = blockDim.y * blockIdx.y + threadIdx.y;
+      int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+      int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

      int i = y * width + x;
      if ( i < (width * height)) {
@@ -100,8 +100,8 @@ vectoradd_char4(hipLaunchParm lp,
 __kernel__ void vectoradd_float(float* a, const float* b, const float* c, int width, int height) {


-  int x = blockDimX * blockIdx.x + threadIdx.x;
-  int y = blockDimY * blockIdy.y + threadIdx.y;
+  int x = blockDimX * hipBlockIdx_x + hipThreadIdx_x;
+  int y = blockDimY * blockIdy.y + hipThreadIdx_y;

  int i = y * width + x;
  if ( i < (width * height)) {
@@ -36,7 +36,7 @@ THE SOFTWARE.
 #define SIZE LEN<<2

 __global__ void kernel_trig(hipLaunchParm lp, float *In, float *sin_d, float *cos_d, float *tan_d, float *sin_pd, float *cos_pd){
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
  sin_d[tid] = __sinf(In[tid]);
  cos_d[tid] = __cosf(In[tid]);
  tan_d[tid] = __tanf(In[tid]);
@@ -29,7 +29,7 @@ THE SOFTWARE.

 __global__ void Add(hipLaunchParm lp, float *Ad, float *Bd, float *Cd, size_t len)
 {
-    int tx = threadIdx.x + blockIdx.x * blockDim.x;
+    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
    if(tx < len)
    {
        Cd[tx] = Ad[tx] + Bd[tx];
@@ -30,7 +30,7 @@ THE SOFTWARE.

 __global__ void Kern(hipLaunchParm lp, float *A)
 {
-	int tx = threadIdx.x + blockIdx.x * blockDim.x;
+	int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
 	A[tx] += 1.0f;
 }

@@ -33,7 +33,7 @@ class memManager;
 template<typename T>
 __global__ void Add(hipLaunchParm lp, T* Ad, T* Bd, T* Cd, size_t Len)
 {
-    int tx = threadIdx.x + blockIdx.x * blockDim.x;
+    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
    if(tx < Len)
    {
        Cd[tx] = Ad[tx] + Bd[tx];
@@ -34,7 +34,7 @@ THE SOFTWARE.
 #define SIZE 1024*1024*sizeof(int)

 __global__ void Iter(hipLaunchParm lp, int *Ad){
-    int tx = threadIdx.x + blockIdx.x * blockDim.x;
+    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
    if(tx == 0){
        for(int i=0;i<ITER;i++){
            Ad[tx] += 1;
@@ -29,7 +29,7 @@ THE SOFTWARE.
 #define SIZE 1024*1024*sizeof(int)

 __global__ void Iter(hipLaunchParm lp, int *Ad){
-    int tx = threadIdx.x + blockIdx.x * blockDim.x;
+    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
    if(tx == 0){
        for(int i=0;i<ITER;i++){
            Ad[tx] += 1;
@@ -2,7 +2,7 @@
 #include "hip/hip_runtime_api.h"

 __global__ void Kernel(hipLaunchParm lp, float *Ad){
-    int tx = threadIdx.x + blockIdx.x * blockDim.x;
+    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
    Ad[tx] += Ad[tx-1];
 }

@@ -40,8 +40,8 @@ __global__ void testExternSharedKernel(hipLaunchParm lp, const T* A_d, const T*
    T *sdata = reinterpret_cast<T *>(my_sdata);
 #endif

-    size_t gid = (blockIdx.x * blockDim.x + threadIdx.x);
-    size_t tid = threadIdx.x;
+    size_t gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t tid = hipThreadIdx_x;

    // initialize dynamic shared memory
    if (tid < groupElements) {
@@ -34,7 +34,7 @@ THE SOFTWARE.

 __global__ void vectorAdd(hipLaunchParm lp, float *Ad, float *Bd) {
  HIP_DYNAMIC_SHARED(float, sBd);
-  int tx = threadIdx.x;
+  int tx = hipThreadIdx_x;
  for(int i=0;i<LEN/64;i++) {
    sBd[tx + i * 64] = Ad[tx + i * 64] + 1.0f;
    Bd[tx + i * 64] = sBd[tx + i * 64];
@@ -48,8 +48,8 @@ vectorADD2( hipLaunchParm lp,
            T *C_d,
            size_t N)
 {
-    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
-    size_t stride = blockDim.x * gridDim.x ;
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;

    for (size_t i=offset; i<N; i+=stride) {
        double foo = __hiloint2double(A_d[i], B_d[i]);
@@ -62,7 +62,7 @@ __global__ void MyKernel (const hipLaunchParm lp, const float *a, const float *b
 {
    //KERNELBEGIN;

-    unsigned gid = threadIdx.x;
+    unsigned gid = hipThreadIdx_x;
    if (gid < N) {
        c[gid] = a[gid] + PlusOne(b[gid]);
    }
@@ -96,7 +96,7 @@ vectorADD(const hipLaunchParm lp,
    int zuzu = deviceVar + 1;


-    int b = threadIdx.x;
+    int b = hipThreadIdx_x;
    int c;
 #ifdef NOT_YET
    int a = __shfl_up(x, 1);
@@ -119,8 +119,8 @@ vectorADD(const hipLaunchParm lp,
    __syncthreads();


-    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
-    size_t stride = blockDim.x * gridDim.x ;
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;

    for (size_t i=offset; i<N; i+=stride) {
 		C_d[i] = A_d[i] + B_d[i];
@@ -38,7 +38,7 @@ __constant__ int Value[LEN];

 __global__ void Get(hipLaunchParm lp, int *Ad)
 {
-    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
    Ad[tid] = Value[tid];
 }

@@ -33,12 +33,12 @@ THE SOFTWARE.
 #define SIZE NUM * 8

 __global__ void Alloc(hipLaunchParm lp, uint64_t *Ptr) {
-    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
    Ptr[tid] = (uint64_t)malloc(128);
 }

 __global__ void Free(hipLaunchParm lp, uint64_t *Ptr) {
-    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
    free((void*)Ptr[tid]);
 }

@@ -35,52 +35,52 @@ THE SOFTWARE.
 #define LEN12 12 * 4

 __global__ void MemCpy8(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
  memcpy(Out + tid*8, In + tid*8, 8);
 }

 __global__ void MemCpy9(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
  memcpy(Out + tid*9, In + tid*9, 9);
 }

 __global__ void MemCpy10(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
  memcpy(Out + tid*10, In + tid*10, 10);
 }

 __global__ void MemCpy11(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
  memcpy(Out + tid*11, In + tid*11, 11);
 }

 __global__ void MemCpy12(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
  memcpy(Out + tid*12, In + tid*12, 12);
 }

 __global__ void MemSet8(hipLaunchParm lp, uint8_t *In) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
  memset(In + tid*8, 1, 8);
 }

 __global__ void MemSet9(hipLaunchParm lp, uint8_t *In) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
  memset(In + tid*9, 1, 9);
 }

 __global__ void MemSet10(hipLaunchParm lp, uint8_t *In) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
  memset(In + tid*10, 1, 10);
 }

 __global__ void MemSet11(hipLaunchParm lp, uint8_t *In) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
  memset(In + tid*11, 1, 11);
 }

 __global__ void MemSet12(hipLaunchParm lp, uint8_t *In) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
  memset(In + tid*12, 1, 12);
 }

@@ -37,7 +37,7 @@ __global__ void vadd_asm(hipLaunchParm lp,
                                float *out,
                                float *in)
 {
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;

 #ifdef __HIP_PLATFORM_NVCC__
        asm volatile("add.f32 %0,%1,%2;":"=f"(out[i]):"f"(in[i]),"f"(out[i]));
@@ -35,7 +35,7 @@ __global__ void vmac_asm(hipLaunchParm lp,
                                float *out,
                                float *in)
 {
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;

    asm volatile ("v_mac_f32_e32 %0, %2, %3" : "=v" (out[i]) : "0"(out[i]), "v" (a), "v" (in[i]));
 }
@@ -33,7 +33,7 @@ void
 __launch_bounds__(256, 2)
 myKern(hipLaunchParm lp, int *C, const int *A, int N, int xfactor)
 {
-    int tid = (blockIdx.x * blockDim.x + threadIdx.x);
+    int tid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);

    if (tid < N) {
        C[tid] = A[tid];
@@ -34,7 +34,7 @@ THE SOFTWARE.
 #define NUM_STREAMS 2

 __global__ void Iter(hipLaunchParm lp, int *Ad, int num){
-    int tx = threadIdx.x + blockIdx.x * blockDim.x;
+    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
    // Kernel loop designed to execute very slowly... ... ...   so we can test timing-related behavior below
    if(tx == 0){
        for(int i = 0; i<num;i++){
@@ -33,7 +33,7 @@ THE SOFTWARE.
 #define SIZE LEN*sizeof(float)

 __global__ void Add(hipLaunchParm lp, float *Ad, float *Bd, float *Cd){
-int tx = threadIdx.x + blockIdx.x * blockDim.x;
+int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
 Cd[tx] = Ad[tx] + Bd[tx];
 }

@@ -33,13 +33,13 @@
 #define SIZE LEN*sizeof(float)

 __global__ void Add(float *Ad, float *Bd, float *Cd){
-    int tx = threadIdx.x + blockIdx.x * blockDim.x;
+    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
    Cd[tx] = Ad[tx] + Bd[tx];
 }


 __global__ void Set(int *Ad, int val){
-    int tx = threadIdx.x + blockIdx.x * blockDim.x;
+    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
    Ad[tx] = val;
 }

@@ -29,7 +29,7 @@ THE SOFTWARE.
 #include<malloc.h>

 __global__ void Inc(hipLaunchParm lp, float *Ad){
-    int tx = threadIdx.x + blockIdx.x * blockDim.x;
+    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
    Ad[tx] = Ad[tx] + float(1);
 }

@@ -70,8 +70,8 @@ template<typename T>
 __global__ void
 addK (hipLaunchParm lp, T *A, T K, size_t numElements)
 {
-    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
-    size_t stride = blockDim.x * gridDim.x ;
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;

    for (size_t i=offset; i<numElements; i+=stride) {
        A[i] = A[i] + K;
@@ -31,7 +31,7 @@ THE SOFTWARE.

 __global__ void Kernel(hipLaunchParm lp,volatile float* hostRes)
 {
-    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
    hostRes[tid] = tid + 1;
    __threadfence_system();
    // expecting that the data is getting flushed to host here!
@@ -63,8 +63,8 @@ int enablePeers(int dev0, int dev1)
 __global__ void
 memsetIntKernel(int * ptr, const int val, size_t numElements)
 {
-    int gid = (blockIdx.x * blockDim.x + threadIdx.x);
-    int stride = blockDim.x * gridDim.x ;
+    int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    int stride = hipBlockDim_x * hipGridDim_x ;
    for (size_t i= gid; i< numElements; i+=stride){
       ptr[i] = val;
    }
@@ -73,8 +73,8 @@ memsetIntKernel(int * ptr, const int val, size_t numElements)
 __global__ void
 memcpyIntKernel(const int * src, int* dst, size_t numElements)
 {
-    int gid = (blockIdx.x * blockDim.x + threadIdx.x);
-    int stride = blockDim.x * gridDim.x ;
+    int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    int stride = hipBlockDim_x * hipGridDim_x ;
    for (size_t i= gid; i< numElements; i+=stride){
       dst[i] = src[i];
    }
@@ -34,7 +34,7 @@ THE SOFTWARE.
 #define kernel_name "hello_world"

 __global__ void Cpy(hipLaunchParm lp, float *Ad, float* Bd){
-  int tx = threadIdx.x;
+  int tx = hipThreadIdx_x;
  Bd[tx] = Ad[tx];
 }

@@ -24,7 +24,7 @@ THE SOFTWARE.

 extern "C" __global__ void hello_world(hipLaunchParm lp, float *a, float *b)
 {
-    int tx = threadIdx.x;
+    int tx = hipThreadIdx_x;
    b[tx] = a[tx];
 }

@@ -35,7 +35,7 @@ THE SOFTWARE.

 template<typename T>
 __global__ void Inc(hipLaunchParm lp, T *Array){
-int tx = threadIdx.x + blockIdx.x * blockDim.x;
+int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
 Array[tx] = Array[tx] + T(1);
 }

@@ -29,7 +29,7 @@ THE SOFTWARE.
 const int NN = 1 << 21;

 __global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){
-	int tid = threadIdx.x;
+	int tid = hipThreadIdx_x;
 	if(tid < 1){
 		for(int i=0;i<n;i++){
 			x[i] = sqrt(powf(3.14159,i));
@@ -39,7 +39,7 @@ __global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){
 }

 __global__ void nKernel(hipLaunchParm lp, float *y){
-	int tid = threadIdx.x;
+	int tid = hipThreadIdx_x;
 	y[tid] = y[tid] + 1.0f;
 }

@@ -31,7 +31,7 @@ THE SOFTWARE.
 const int NN = 1 << 21;

 __global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){
-	int tid = threadIdx.x;
+	int tid = hipThreadIdx_x;
 	if(tid < 1){
 		for(int i=0;i<n;i++){
 			x[i] = sqrt(powf(3.14159,i));
@@ -41,7 +41,7 @@ __global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){
 }

 __global__ void nKernel(hipLaunchParm lp, float *y){
-	int tid = threadIdx.x;
+	int tid = hipThreadIdx_x;
 	y[tid] = y[tid] + 1.0f;
 }

@@ -41,8 +41,8 @@ vectorADDRepeat(hipLaunchParm lp,
            size_t NELEM,
            int repeat)
 {
-    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
-    size_t stride = blockDim.x * gridDim.x ;
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;

    for (int j=1; j<=repeat;j++) {
        for (size_t i=offset; i<NELEM; i+=stride) {
@@ -73,7 +73,7 @@ void D2H(T *Dst, T *Src, size_t size){

 template<typename T>
 __global__ void Inc(hipLaunchParm lp, T *In){
-int tx = threadIdx.x + blockIdx.x * blockDim.x;
+int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
 In[tx] = In[tx] + 1;
 }

@@ -102,8 +102,8 @@ MemcpyFunction g_moduleMemcpy("memcpyInt.hsaco", "memcpyIntKernel");
 __global__ void
 memsetIntKernel(int * ptr, const int val, size_t numElements)
 {
-    int gid = (blockIdx.x * blockDim.x + threadIdx.x);
-    int stride = blockDim.x * gridDim.x ;
+    int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    int stride = hipBlockDim_x * hipGridDim_x ;
    for (size_t i= gid; i< numElements; i+=stride){
       ptr[i] = val;
    }
@@ -112,8 +112,8 @@ memsetIntKernel(int * ptr, const int val, size_t numElements)
 __global__ void
 memcpyIntKernel(int *dst, const int * src, size_t numElements)
 {
-    int gid = (blockIdx.x * blockDim.x + threadIdx.x);
-    int stride = blockDim.x * gridDim.x ;
+    int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    int stride = hipBlockDim_x * hipGridDim_x ;
    for (size_t i= gid; i< numElements; i+=stride){
       dst[i] = src[i];
    }
@@ -5,8 +5,8 @@
 extern "C" __global__ void
 memcpyIntKernel(hipLaunchParm lp, int *dst, const int * src, size_t numElements)
 {
-    int gid = (blockIdx.x * blockDim.x + threadIdx.x);
-    int stride = blockDim.x * gridDim.x ;
+    int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    int stride = hipBlockDim_x * hipGridDim_x ;
    for (size_t i= gid; i< numElements; i+=stride){
       dst[i] = src[i];
    }
@@ -23,7 +23,7 @@ THE SOFTWARE.
 void __global__
 test_kernel(float *A) 
 {
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    int tid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;

    float a = __ballot(tid < 16);
    float b = __shfl(tid < 16);
@@ -30,7 +30,7 @@ THE SOFTWARE.
 #define ITER 1<<10

 __global__ void Iter(hipLaunchParm lp, int *Ad, int num){
-    int tx = threadIdx.x + blockIdx.x * blockDim.x;
+    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
    if(tx == 0){
        for(int i = 0; i<num;i++){
            Ad[tx] += 1;
@@ -26,7 +26,7 @@ THE SOFTWARE.
 static size_t size[NUM_SIZE];

 __global__ void Add(hipLaunchParm lp, int *Ad){
-    int tx = threadIdx.x;
+    int tx = hipThreadIdx_x;
    Ad[tx] = Ad[tx] + tx;
 }

@@ -26,7 +26,7 @@ THE SOFTWARE.
 static size_t size[NUM_SIZE];

 __global__ void Add(hipLaunchParm lp, int *Ad){
-    int tx = threadIdx.x;
+    int tx = hipThreadIdx_x;
    Ad[tx] = Ad[tx] + tx;
 }

@@ -30,7 +30,7 @@ THE SOFTWARE.
 #define ITER 1<<10

 __global__ void Iter(hipLaunchParm lp, int *Ad, int num){
-    int tx = threadIdx.x + blockIdx.x * blockDim.x;
+    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
    if(tx == 0){
        for(int i = 0; i<num;i++){
            Ad[tx] += 1;
@@ -136,8 +136,8 @@ vectorADD(hipLaunchParm lp,
            T *C_d,
            size_t NELEM)
 {
-    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
-    size_t stride = blockDim.x * gridDim.x ;
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;

    for (size_t i=offset; i<NELEM; i+=stride) {
        C_d[i] = A_d[i] + B_d[i];
@@ -153,8 +153,8 @@ vectorADDReverse(hipLaunchParm lp,
            T *C_d,
            size_t NELEM)
 {
-    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
-    size_t stride = blockDim.x * gridDim.x ;
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;

    for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) {
        C_d[i] = A_d[i] + B_d[i];
@@ -169,8 +169,8 @@ addCount( const T *A_d,
        size_t NELEM,
        int count)
 {
-    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
-    size_t stride = blockDim.x * gridDim.x ;
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;

    // Deliberately do this in an inefficient way to increase kernel runtime
    for (int i=0; i<count; i++) {
@@ -188,8 +188,8 @@ addCountReverse( const T *A_d,
        int64_t NELEM,
        int count)
 {
-    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
-    size_t stride = blockDim.x * gridDim.x ;
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;

    // Deliberately do this in an inefficient way to increase kernel runtime
    for (int i=0; i<count; i++) {
@@ -205,8 +205,8 @@ __global__ void
 memsetReverse( T *C_d,  T val,
        int64_t NELEM)
 {
-    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
-    size_t stride = blockDim.x * gridDim.x ;
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    size_t stride = hipBlockDim_x * hipGridDim_x ;

    for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) {
        C_d[i] = val;
@@ -17,8 +17,8 @@ __global__ void tex2DKernel(float* outputData,
                             int width,
                             int height)
 {
-    int x = blockIdx.x*blockDim.x + threadIdx.x;
-    int y = blockIdx.y*blockDim.y + threadIdx.y;
+    int x = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x;
+    int y = hipBlockIdx_y*hipBlockDim_y + hipThreadIdx_y;
    outputData[y*width + x] = tex2D<float>(textureObject, x, y);
 }

@@ -20,8 +20,8 @@ __global__ void tex2DKernel(float* outputData,
                             int width,
                             int height)
 {
-    int x = blockIdx.x*blockDim.x + threadIdx.x;
-    int y = blockIdx.y*blockDim.y + threadIdx.y;
+    int x = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x;
+    int y = hipBlockIdx_y*hipBlockDim_y + hipThreadIdx_y;
 #ifdef __HIP_PLATFORM_HCC__
    outputData[y*width + x] = tex2D(tex, textureObject, x, y);
 #else