This implements the trivial change needed to move back from the hip{Something}_{x, y, z} macros to the natural CUDA syntax of Something.{x, y, z}. This is contained in lines 384-404 in hip_runtime.h. All of the other changes have to do with changing unit tests to use this syntax. The macros are retained for backwards compatibility.

2017-11-19 01:54:12 +00:00
@@ -381,6 +381,27 @@ __device__ void  __threadfence_system(void) ;
 * @}
 */

+template<typename std::common_type<
+    decltype(hc_get_group_id),
+    decltype(hc_get_group_size),
+    decltype(hc_get_num_groups),
+    decltype(hc_get_workitem_id)>::type f>
+class Coordinates {
+    using R = decltype(f(0));
+
+    struct X { __device__ operator R() const { return f(0); } };
+    struct Y { __device__ operator R() const { return f(1); } };
+    struct Z { __device__ operator R() const { return f(2); } };
+public:
+    static constexpr X x{};
+    static constexpr Y y{};
+    static constexpr Z z{};
+};
+
+static constexpr Coordinates<hc_get_group_size> blockDim;
+static constexpr Coordinates<hc_get_group_id> blockIdx;
+static constexpr Coordinates<hc_get_num_groups> gridDim;
+static constexpr Coordinates<hc_get_workitem_id> threadIdx;

 #define hipThreadIdx_x (hc_get_workitem_id(0))
 #define hipThreadIdx_y (hc_get_workitem_id(1))
@@ -45,8 +45,8 @@ __device__ void *__hip_hc_malloc(size_t size)
    {
        return (void*)nullptr;
    }
-    uint32_t totalThreads = hipBlockDim_x * hipGridDim_x * hipBlockDim_y * hipGridDim_y * hipBlockDim_z * hipGridDim_z;
-    uint32_t currentWorkItem = hipThreadIdx_x + hipBlockDim_x * hipBlockIdx_x;
+    uint32_t totalThreads = blockDim.x * gridDim.x * blockDim.y * gridDim.y * blockDim.z * gridDim.z;
+    uint32_t currentWorkItem = threadIdx.x + blockDim.x * blockIdx.x;

    uint32_t numHeapsPerWorkItem = NUM_PAGES / totalThreads;
    uint32_t heapSizePerWorkItem = SIZE_OF_HEAP / totalThreads;
@@ -932,7 +932,7 @@ __device__  unsigned long long int atomicMax(unsigned long long int* address,
 template<typename T>
 __device__ T atomicCAS_impl(T* address, T compare, T val)
 {
-  // the implementation assumes the atomic is lock-free and 
+  // the implementation assumes the atomic is lock-free and
  // has the same size as the non-atmoic equivalent type
  static_assert(sizeof(T) == sizeof(std::atomic<T>)
                , "size mismatch between atomic and non-atomic types");
@@ -945,7 +945,7 @@ __device__ T atomicCAS_impl(T* address, T compare, T val)

  T expected = compare;

-  // hcc should generate a system scope atomic CAS 
+  // hcc should generate a system scope atomic CAS
  std::atomic_compare_exchange_weak_explicit(u.atomic_address
                                            , &expected, val
                                            , std::memory_order_acq_rel
@@ -1110,8 +1110,8 @@ __device__ void* __get_dynamicgroupbaseptr() {
  return hc::get_dynamic_group_segment_base_pointer();
 }

-__host__ void* __get_dynamicgroupbaseptr() { 
-  return nullptr; 
+__host__ void* __get_dynamicgroupbaseptr() {
+  return nullptr;
 }

 // Precise Math Functions
@@ -34,7 +34,7 @@ THE SOFTWARE.
 #define NUM_STREAMS 2

 __global__ void Iter(hipLaunchParm lp, int *Ad, int num){
-    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+    int tx = threadIdx.x + blockIdx.x * blockDim.x;
    // Kernel loop designed to execute very slowly... ... ...   so we can test timing-related behavior below
    if(tx == 0){
        for(int i = 0; i<num;i++){
@@ -27,7 +27,7 @@ THE SOFTWARE.
 #define SIZE 64<<2

 __global__  void getSqAbs(hipLaunchParm lp, float *A, float *B, float *C){
-    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+    int tx = threadIdx.x + blockIdx.x * blockDim.x;
    C[tx] = hipCsqabsf(make_hipFloatComplex(A[tx], B[tx]));
 }

@@ -16,13 +16,13 @@

 __global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In)
 {
-    int tx = hipThreadIdx_x;
+    int tx = threadIdx.x;
    memcpy(Out + tx, In + tx, sizeof(uint32_t));
 }

 __global__ void set(hipLaunchParm lp, uint32_t *ptr, uint8_t val, size_t size)
 {
-    int tx = hipThreadIdx_x;
+    int tx = threadIdx.x;
    memset(ptr + tx, val, (sizeof(uint32_t)*(size/LEN)));
 }

@@ -58,6 +58,6 @@ int main()
            return 0;
        }
    }
- 
+
    passed();
 }
@@ -35,7 +35,7 @@ THE SOFTWARE.


 __global__ void floatMath(hipLaunchParm lp, float *In, float *Out) {
-  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  Out[tid] = __cosf(In[tid]);
  Out[tid] = __exp10f(Out[tid]);
  Out[tid] = __expf(Out[tid]);
@@ -217,7 +217,7 @@ int computeGold(int *gpuData, const int len)
 __global__ void testKernel(hipLaunchParm lp,int *g_odata)
 {
    // access thread id
-    const unsigned int tid = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+    const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;

    // Test various atomic instructions

@@ -240,7 +240,7 @@ __global__ void testKernel(hipLaunchParm lp,int *g_odata)

    // Atomic increment (modulo 17+1)
    atomicInc((unsigned int *)&g_odata[5], 17);
- 
+
    // Atomic decrement
    atomicDec((unsigned int *)&g_odata[6], 137);

@@ -32,82 +32,82 @@ THE SOFTWARE.
 #define SIZE N*sizeof(float)

 __global__ void test_sincosf(hipLaunchParm lp, float* a, float* b, float *c){
-    int tid = hipThreadIdx_x;
+    int tid = threadIdx.x;
    sincosf(a[tid], b+tid, c+tid);
 }

 __global__ void test_sincospif(hipLaunchParm lp, float* a, float* b, float *c){
-    int tid = hipThreadIdx_x;
+    int tid = threadIdx.x;
    sincospif(a[tid], b+tid, c+tid);
 }

 __global__ void test_fdividef(hipLaunchParm lp, float *a, float* b, float *c){
-    int tid = hipThreadIdx_x;
+    int tid = threadIdx.x;
    c[tid] = fdividef(a[tid], b[tid]);
 }

 __global__ void test_llrintf(hipLaunchParm lp, float *a, long long int *b){
-    int tid = hipThreadIdx_x;
+    int tid = threadIdx.x;
    b[tid] = llrintf(a[tid]);
 }

 __global__ void test_lrintf(hipLaunchParm lp, float *a, long int *b){
-    int tid = hipThreadIdx_x;
+    int tid = threadIdx.x;
    b[tid] = lrintf(a[tid]);
 }

 __global__ void test_rintf(hipLaunchParm lp, float *a, float *b){
-    int tid = hipThreadIdx_x;
+    int tid = threadIdx.x;
    b[tid] = rintf(a[tid]);
 }

 __global__ void test_llroundf(hipLaunchParm lp, float *a, long long int *b){
-    int tid = hipThreadIdx_x;
+    int tid = threadIdx.x;
    b[tid] = llroundf(a[tid]);
 }

 __global__ void test_lroundf(hipLaunchParm lp, float *a, long int *b){
-    int tid = hipThreadIdx_x;
+    int tid = threadIdx.x;
    b[tid] = lroundf(a[tid]);
 }

 __global__ void test_rhypotf(hipLaunchParm lp, float *a, float* b, float *c){
-    int tid = hipThreadIdx_x;
+    int tid = threadIdx.x;
    c[tid] = rhypotf(a[tid], b[tid]);
 }

 __global__ void test_norm3df(hipLaunchParm lp, float *a, float* b, float *c, float *d){
-    int tid = hipThreadIdx_x;
+    int tid = threadIdx.x;
    d[tid] = norm3df(a[tid], b[tid], c[tid]);
 }

 __global__ void test_norm4df(hipLaunchParm lp, float *a, float* b, float *c, float *d, float *e){
-    int tid = hipThreadIdx_x;
+    int tid = threadIdx.x;
    e[tid] = norm4df(a[tid], b[tid], c[tid], d[tid]);
 }

 __global__ void test_normf(hipLaunchParm lp, float *a, float *b){
-    int tid = hipThreadIdx_x;
+    int tid = threadIdx.x;
    b[tid] = normf(N, a);
 }

 __global__ void test_rnorm3df(hipLaunchParm lp, float *a, float* b, float *c, float *d){
-    int tid = hipThreadIdx_x;
+    int tid = threadIdx.x;
    d[tid] = rnorm3df(a[tid], b[tid], c[tid]);
 }

 __global__ void test_rnorm4df(hipLaunchParm lp, float *a, float* b, float *c, float *d, float *e){
-    int tid = hipThreadIdx_x;
+    int tid = threadIdx.x;
    e[tid] = rnorm4df(a[tid], b[tid], c[tid], d[tid]);
 }

 __global__ void test_rnormf(hipLaunchParm lp, float *a, float *b){
-    int tid = hipThreadIdx_x;
+    int tid = threadIdx.x;
    b[tid] = rnormf(N, a);
 }

 __global__ void test_erfinvf(hipLaunchParm lp, float *a, float *b){
-    int tid = hipThreadIdx_x;
+    int tid = threadIdx.x;
    b[tid] = erff(erfinvf(a[tid]));
 }

@@ -32,72 +32,72 @@ THE SOFTWARE.
 #define SIZE N*sizeof(double)

 __global__ void test_sincos(hipLaunchParm lp, double* a, double* b, double *c){
-    int tid = hipThreadIdx_x;
+    int tid = threadIdx.x;
    sincos(a[tid], b+tid, c+tid);
 }

 __global__ void test_sincospi(hipLaunchParm lp, double* a, double* b, double *c){
-    int tid = hipThreadIdx_x;
+    int tid = threadIdx.x;
    sincospi(a[tid], b+tid, c+tid);
 }

 __global__ void test_llrint(hipLaunchParm lp, double *a, long long int *b){
-    int tid = hipThreadIdx_x;
+    int tid = threadIdx.x;
    b[tid] = llrint(a[tid]);
 }

 __global__ void test_lrint(hipLaunchParm lp, double *a, long int *b){
-    int tid = hipThreadIdx_x;
+    int tid = threadIdx.x;
    b[tid] = lrint(a[tid]);
 }

 __global__ void test_rint(hipLaunchParm lp, double *a, double *b){
-    int tid = hipThreadIdx_x;
+    int tid = threadIdx.x;
    b[tid] = rint(a[tid]);
 }

 __global__ void test_llround(hipLaunchParm lp, double *a, long long int *b){
-    int tid = hipThreadIdx_x;
+    int tid = threadIdx.x;
    b[tid] = llround(a[tid]);
 }

 __global__ void test_lround(hipLaunchParm lp, double *a, long int *b){
-    int tid = hipThreadIdx_x;
+    int tid = threadIdx.x;
    b[tid] = lround(a[tid]);
 }

 __global__ void test_rhypot(hipLaunchParm lp, double *a, double* b, double *c){
-    int tid = hipThreadIdx_x;
+    int tid = threadIdx.x;
    c[tid] = rhypot(a[tid], b[tid]);
 }

 __global__ void test_norm3d(hipLaunchParm lp, double *a, double* b, double *c, double *d){
-    int tid = hipThreadIdx_x;
+    int tid = threadIdx.x;
    d[tid] = norm3d(a[tid], b[tid], c[tid]);
 }

 __global__ void test_norm4d(hipLaunchParm lp, double *a, double* b, double *c, double *d, double *e){
-    int tid = hipThreadIdx_x;
+    int tid = threadIdx.x;
    e[tid] = norm4d(a[tid], b[tid], c[tid], d[tid]);
 }

 __global__ void test_rnorm3d(hipLaunchParm lp, double *a, double* b, double *c, double *d){
-    int tid = hipThreadIdx_x;
+    int tid = threadIdx.x;
    d[tid] = rnorm3d(a[tid], b[tid], c[tid]);
 }

 __global__ void test_rnorm4d(hipLaunchParm lp, double *a, double* b, double *c, double *d, double *e){
-    int tid = hipThreadIdx_x;
+    int tid = threadIdx.x;
    e[tid] = rnorm4d(a[tid], b[tid], c[tid], d[tid]);
 }

 __global__ void test_rnorm(hipLaunchParm lp, double *a, double *b){
-    int tid = hipThreadIdx_x;
+    int tid = threadIdx.x;
    b[tid] = rnorm(N, a);
 }

 __global__ void test_erfinv(hipLaunchParm lp, double *a, double *b){
-    int tid = hipThreadIdx_x;
+    int tid = threadIdx.x;
    b[tid] = erf(erfinv(a[tid]));
 }

@@ -36,7 +36,7 @@ __device__ int globalOut[NUM];

 __global__ void Assign(hipLaunchParm lp, int* Out)
 {
-    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    Out[tid] = globalIn[tid];
    globalOut[tid] = globalIn[tid];
 }
@@ -28,7 +28,7 @@ THE SOFTWARE.
 #if __HIP_ARCH_GFX803__ > 0

 __global__ void __halfMath(hipLaunchParm lp, __half *A, __half *B, __half *C) {
-  int tx = hipThreadIdx_x;
+  int tx = threadIdx.x;
  __half a = A[tx];
  __half b = B[tx];
  __half c = C[tx];
@@ -44,7 +44,7 @@ __global__ void __halfMath(hipLaunchParm lp, __half *A, __half *B, __half *C) {
 }

 __global__ void __half2Math(hipLaunchParm lp, __half2 *A, __half2 *B, __half2 *C) {
-  int tx = hipThreadIdx_x;
+  int tx = threadIdx.x;
  __half2 a = A[tx];
  __half2 b = B[tx];
  __half2 c = C[tx];
@@ -33,7 +33,7 @@ THE SOFTWARE.

 __global__ void vAdd(hipLaunchParm lp, float *In1, float *In2, float *In3, float *In4, float *Out)
 {
-    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    In4[tid] = In1[tid] + In2[tid];
    __threadfence();
    In3[tid] = In3[tid] + In4[tid];
@@ -37,9 +37,9 @@ __global__ void
 	warpvote(hipLaunchParm lp, int* device_any, int* device_all , int Num_Warps_per_Block, int pshift)
 {

-   int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
-   device_any[hipThreadIdx_x>>pshift] = __any(tid -77);
-   device_all[hipThreadIdx_x>>pshift] = __all(tid -77);
+   int tid = threadIdx.x + blockIdx.x * blockDim.x;
+   device_any[threadIdx.x>>pshift] = __any(tid -77);
+   device_all[threadIdx.x>>pshift] = __all(tid -77);
 }

 int main(int argc, char *argv[])
@@ -49,7 +49,7 @@ int main(int argc, char *argv[])
  warpSize = devProp.warpSize;

  int w = warpSize;
-  pshift = 0; 
+  pshift = 0;
  while (w >>= 1) ++pshift;

  printf ("warpSize=%d pshift=%d\n", warpSize, pshift);
@@ -34,12 +34,12 @@ __global__ void
 	gpu_ballot(hipLaunchParm lp, unsigned int* device_ballot, int Num_Warps_per_Block,int pshift)
 {

-   int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
-   const unsigned int warp_num = hipThreadIdx_x >> pshift;
+   int tid = threadIdx.x + blockIdx.x * blockDim.x;
+   const unsigned int warp_num = threadIdx.x >> pshift;
 #ifdef __HIP_PLATFORM_HCC__
-   atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popcll(__ballot(tid - 245)));
+   atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popcll(__ballot(tid - 245)));
 #else
-	atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popc(__ballot(tid - 245)));
+	atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popc(__ballot(tid - 245)));
 #endif

 }
@@ -72,8 +72,8 @@ HIP_kernel(hipLaunchParm lp,
             unsigned int* a, unsigned int* b,unsigned long long int* c, unsigned long long int* d, int width, int height)
  {

-      int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
-      int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+      int x = blockDim.x * blockIdx.x + threadIdx.x;
+      int y = blockDim.y * blockIdx.y + threadIdx.y;

      int i = y * width + x;
      if ( i < (width * height)) {
@@ -83,8 +83,8 @@ HIP_kernel(hipLaunchParm lp,
    unsigned int* a, unsigned int* b,unsigned int* c, unsigned long long int* d, int width, int height)
  {

-      int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
-      int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+      int x = blockDim.x * blockIdx.x + threadIdx.x;
+      int y = blockDim.y * blockIdx.y + threadIdx.y;

      int i = y * width + x;
      if ( i < (width * height)) {
@@ -73,8 +73,8 @@ HIP_kernel(hipLaunchParm lp,
 			    int width, int height)
  {

-      int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
-      int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+      int x = blockDim.x * blockIdx.x + threadIdx.x;
+      int y = blockDim.y * blockIdx.y + threadIdx.y;

      int i = y * width + x;
      if ( i < (width * height)) {
@@ -64,8 +64,8 @@ HIP_kernel(hipLaunchParm lp,
             unsigned int* a, unsigned int* b,unsigned  int* c, unsigned long long int* d, int width, int height)
  {

-      int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
-      int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+      int x = blockDim.x * blockIdx.x + threadIdx.x;
+      int y = blockDim.y * blockIdx.y + threadIdx.y;

      int i = y * width + x;
      if ( i < (width * height)) {
@@ -57,8 +57,8 @@ vectoradd_float(hipLaunchParm lp,
             T* a, const T*  bm,  int width, int height)

  {
-      int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
-      int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+      int x = blockDim.x * blockIdx.x + threadIdx.x;
+      int y = blockDim.y * blockIdx.y + threadIdx.y;

      int i = y * width + x;
      if ( i < (width * height)) {
@@ -40,13 +40,13 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z  1


-__global__ void 
+__global__ void
 vectoradd_char1(hipLaunchParm lp,
-             char1* a, const char1*  bm, const char1* cm, int width, int height) 
+             char1* a, const char1*  bm, const char1* cm, int width, int height)

  {
-      int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
-      int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+      int x = blockDim.x * blockIdx.x + threadIdx.x;
+      int y = blockDim.y * blockIdx.y + threadIdx.y;

      int i = y * width + x;
      if ( i < (width * height)) {
@@ -54,40 +54,40 @@ vectoradd_char1(hipLaunchParm lp,
      }
  }

-__global__ void 
+__global__ void
 vectoradd_char2(hipLaunchParm lp,
-             char2* a, const char2*  bm, const char2* cm, int width, int height) 
+             char2* a, const char2*  bm, const char2* cm, int width, int height)

  {
-      int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
-      int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+      int x = blockDim.x * blockIdx.x + threadIdx.x;
+      int y = blockDim.y * blockIdx.y + threadIdx.y;

      int i = y * width + x;
      if ( i < (width * height)) {
        a[i] = make_char2(bm[i].x, bm[i].y) + make_char2(cm[i].x, cm[i].y);
      }
-} 
+}

-__global__ void 
+__global__ void
 vectoradd_char3(hipLaunchParm lp,
-             char3* a, const char3*  bm, const char3* cm, int width, int height) 
+             char3* a, const char3*  bm, const char3* cm, int width, int height)

  {
-      int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
-      int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+      int x = blockDim.x * blockIdx.x + threadIdx.x;
+      int y = blockDim.y * blockIdx.y + threadIdx.y;

      int i = y * width + x;
      if ( i < (width * height)) {
        a[i] = make_char3(bm[i].x, bm[i].y, bm[i].z) + make_char3(cm[i].x, cm[i].y, cm[i].z);
      }
 }
-__global__ void 
+__global__ void
 vectoradd_char4(hipLaunchParm lp,
-             char4* a, const char4*  bm, const char4* cm, int width, int height) 
+             char4* a, const char4*  bm, const char4* cm, int width, int height)

  {
-      int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
-      int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+      int x = blockDim.x * blockIdx.x + threadIdx.x;
+      int y = blockDim.y * blockIdx.y + threadIdx.y;

      int i = y * width + x;
      if ( i < (width * height)) {
@@ -99,7 +99,7 @@ vectoradd_char4(hipLaunchParm lp,
 #if 0
 __kernel__ void vectoradd_float(float* a, const float* b, const float* c, int width, int height) {

-  
+
  int x = blockDimX * blockIdx.x + threadIdx.x;
  int y = blockDimY * blockIdy.y + threadIdx.y;

@@ -128,21 +128,21 @@ bool dataTypesRun(){
  hostA = (T*)malloc(NUM * sizeof(T));
  hostB = (T*)malloc(NUM * sizeof(T));
  hostC = (T*)malloc(NUM * sizeof(T));
-  
+
  // initialize the input data
  for (i = 0; i < NUM; i++) {
    hostB[i] = (T)i;
    hostC[i] = (T)i;
  }
-  
+
  HIP_ASSERT(hipMalloc((void**)&deviceA, NUM * sizeof(T)));
  HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T)));
  HIP_ASSERT(hipMalloc((void**)&deviceC, NUM * sizeof(T)));
-  
+
  HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice));
  HIP_ASSERT(hipMemcpy(deviceC, hostC, NUM*sizeof(T), hipMemcpyHostToDevice));

-  hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1), 
+  hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1),
                  dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
                  0, 0,
@@ -192,21 +192,21 @@ bool dataTypesRun(){
  hostA = (T*)malloc(NUM * sizeof(T));
  hostB = (T*)malloc(NUM * sizeof(T));
  hostC = (T*)malloc(NUM * sizeof(T));
-  
+
  // initialize the input data
  for (i = 0; i < NUM; i++) {
    hostB[i] = (T)i;
    hostC[i] = (T)i;
  }
-  
+
  HIP_ASSERT(hipMalloc((void**)&deviceA, NUM * sizeof(T)));
  HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T)));
  HIP_ASSERT(hipMalloc((void**)&deviceC, NUM * sizeof(T)));
-  
+
  HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice));
  HIP_ASSERT(hipMemcpy(deviceC, hostC, NUM*sizeof(T), hipMemcpyHostToDevice));

-  hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1), 
+  hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1),
                  dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
                  0, 0,
@@ -256,21 +256,21 @@ bool dataTypesRun(){
  hostA = (T*)malloc(NUM * sizeof(T));
  hostB = (T*)malloc(NUM * sizeof(T));
  hostC = (T*)malloc(NUM * sizeof(T));
-  
+
  // initialize the input data
  for (i = 0; i < NUM; i++) {
    hostB[i] = (T)i;
    hostC[i] = (T)i;
  }
-  
+
  HIP_ASSERT(hipMalloc((void**)&deviceA, NUM * sizeof(T)));
  HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T)));
  HIP_ASSERT(hipMalloc((void**)&deviceC, NUM * sizeof(T)));
-  
+
  HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice));
  HIP_ASSERT(hipMemcpy(deviceC, hostC, NUM*sizeof(T), hipMemcpyHostToDevice));

-  hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1), 
+  hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1),
                  dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
                  0, 0,
@@ -319,21 +319,21 @@ bool dataTypesRunChar4(){
  hostA = (T*)malloc(NUM * sizeof(T));
  hostB = (T*)malloc(NUM * sizeof(T));
  hostC = (T*)malloc(NUM * sizeof(T));
-  
+
  // initialize the input data
  for (i = 0; i < NUM; i++) {
    hostB[i] = (T)i;
    hostC[i] = (T)i;
  }
-  
+
  HIP_ASSERT(hipMalloc((void**)&deviceA, NUM * sizeof(T)));
  HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T)));
  HIP_ASSERT(hipMalloc((void**)&deviceC, NUM * sizeof(T)));
-  
+
  HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice));
  HIP_ASSERT(hipMemcpy(deviceC, hostC, NUM*sizeof(T), hipMemcpyHostToDevice));

-  hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1), 
+  hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1),
                  dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
                  0, 0,
@@ -368,7 +368,7 @@ bool dataTypesRunChar4(){
 }

 int main() {
-  
+
  hipDeviceProp_t devProp;
  hipGetDeviceProperties(&devProp, 0);
  cout << " System minor " << devProp.minor << endl;
@@ -36,7 +36,7 @@ THE SOFTWARE.
 #define SIZE LEN<<2

 __global__ void kernel_trig(hipLaunchParm lp, float *In, float *sin_d, float *cos_d, float *tan_d, float *sin_pd, float *cos_pd){
-  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  sin_d[tid] = __sinf(In[tid]);
  cos_d[tid] = __cosf(In[tid]);
  tan_d[tid] = __tanf(In[tid]);
@@ -29,7 +29,7 @@ THE SOFTWARE.

 __global__ void Add(hipLaunchParm lp, float *Ad, float *Bd, float *Cd, size_t len)
 {
-    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+    int tx = threadIdx.x + blockIdx.x * blockDim.x;
    if(tx < len)
    {
        Cd[tx] = Ad[tx] + Bd[tx];
@@ -30,7 +30,7 @@ THE SOFTWARE.

 __global__ void Kern(hipLaunchParm lp, float *A)
 {
-	int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+	int tx = threadIdx.x + blockIdx.x * blockDim.x;
 	A[tx] += 1.0f;
 }

@@ -33,7 +33,7 @@ class memManager;
 template<typename T>
 __global__ void Add(hipLaunchParm lp, T* Ad, T* Bd, T* Cd, size_t Len)
 {
-    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+    int tx = threadIdx.x + blockIdx.x * blockDim.x;
    if(tx < Len)
    {
        Cd[tx] = Ad[tx] + Bd[tx];
@@ -34,7 +34,7 @@ THE SOFTWARE.
 #define SIZE 1024*1024*sizeof(int)

 __global__ void Iter(hipLaunchParm lp, int *Ad){
-    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+    int tx = threadIdx.x + blockIdx.x * blockDim.x;
    if(tx == 0){
        for(int i=0;i<ITER;i++){
            Ad[tx] += 1;
@@ -29,7 +29,7 @@ THE SOFTWARE.
 #define SIZE 1024*1024*sizeof(int)

 __global__ void Iter(hipLaunchParm lp, int *Ad){
-    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+    int tx = threadIdx.x + blockIdx.x * blockDim.x;
    if(tx == 0){
        for(int i=0;i<ITER;i++){
            Ad[tx] += 1;
@@ -2,7 +2,7 @@
 #include "hip/hip_runtime_api.h"

 __global__ void Kernel(hipLaunchParm lp, float *Ad){
-    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+    int tx = threadIdx.x + blockIdx.x * blockDim.x;
    Ad[tx] += Ad[tx-1];
 }

@@ -40,8 +40,8 @@ __global__ void testExternSharedKernel(hipLaunchParm lp, const T* A_d, const T*
    T *sdata = reinterpret_cast<T *>(my_sdata);
 #endif

-    size_t gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
-    size_t tid = hipThreadIdx_x;
+    size_t gid = (blockIdx.x * blockDim.x + threadIdx.x);
+    size_t tid = threadIdx.x;

    // initialize dynamic shared memory
    if (tid < groupElements) {
@@ -34,7 +34,7 @@ THE SOFTWARE.

 __global__ void vectorAdd(hipLaunchParm lp, float *Ad, float *Bd) {
  HIP_DYNAMIC_SHARED(float, sBd);
-  int tx = hipThreadIdx_x;
+  int tx = threadIdx.x;
  for(int i=0;i<LEN/64;i++) {
    sBd[tx + i * 64] = Ad[tx + i * 64] + 1.0f;
    Bd[tx + i * 64] = sBd[tx + i * 64];
@@ -48,11 +48,12 @@ vectorADD2( hipLaunchParm lp,
            T *C_d,
            size_t N)
 {
-    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
-    size_t stride = hipBlockDim_x * hipGridDim_x ;
+    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
+    size_t stride = blockDim.x * gridDim.x ;

    for (size_t i=offset; i<N; i+=stride) {
-        C_d[i] = A_d[i] + B_d[i] ;
+        auto foo = __hiloint2double(A_d[i], B_d[i]);
+        C_d[i] = __double2loint(foo) + __double2hiint(foo);//A_d[i] + B_d[i] ;
    }
 }

@@ -62,7 +62,7 @@ __global__ void MyKernel (const hipLaunchParm lp, const float *a, const float *b
 {
    //KERNELBEGIN;

-    unsigned gid = hipThreadIdx_x;
+    unsigned gid = threadIdx.x;
    if (gid < N) {
        c[gid] = a[gid] + PlusOne(b[gid]);
    }
@@ -96,7 +96,7 @@ vectorADD(const hipLaunchParm lp,
    int zuzu = deviceVar + 1;


-    int b = hipThreadIdx_x;
+    int b = threadIdx.x;
    int c;
 #ifdef NOT_YET
    int a = __shfl_up(x, 1);
@@ -119,8 +119,8 @@ vectorADD(const hipLaunchParm lp,
    __syncthreads();


-    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
-    size_t stride = hipBlockDim_x * hipGridDim_x ;
+    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
+    size_t stride = blockDim.x * gridDim.x ;

    for (size_t i=offset; i<N; i+=stride) {
 		C_d[i] = A_d[i] + B_d[i];
@@ -18,7 +18,7 @@ THE SOFTWARE.
 */

 /* HIT_START
- * BUILD: %t %s ../test_common.cpp 
+ * BUILD: %t %s ../test_common.cpp
 * RUN: %t
 * HIT_END
 */
@@ -38,7 +38,7 @@ __constant__ int Value[LEN];

 __global__ void Get(hipLaunchParm lp, int *Ad)
 {
-    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    Ad[tid] = Value[tid];
 }

@@ -33,12 +33,12 @@ THE SOFTWARE.
 #define SIZE NUM * 8

 __global__ void Alloc(hipLaunchParm lp, uint64_t *Ptr) {
-    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    Ptr[tid] = (uint64_t)malloc(128);
 }

 __global__ void Free(hipLaunchParm lp, uint64_t *Ptr) {
-    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    free((void*)Ptr[tid]);
 }

@@ -35,52 +35,52 @@ THE SOFTWARE.
 #define LEN12 12 * 4

 __global__ void MemCpy8(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
-  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  memcpy(Out + tid*8, In + tid*8, 8);
 }

 __global__ void MemCpy9(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
-  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  memcpy(Out + tid*9, In + tid*9, 9);
 }

 __global__ void MemCpy10(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
-  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  memcpy(Out + tid*10, In + tid*10, 10);
 }

 __global__ void MemCpy11(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
-  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  memcpy(Out + tid*11, In + tid*11, 11);
 }

 __global__ void MemCpy12(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
-  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  memcpy(Out + tid*12, In + tid*12, 12);
 }

 __global__ void MemSet8(hipLaunchParm lp, uint8_t *In) {
-  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  memset(In + tid*8, 1, 8);
 }

 __global__ void MemSet9(hipLaunchParm lp, uint8_t *In) {
-  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  memset(In + tid*9, 1, 9);
 }

 __global__ void MemSet10(hipLaunchParm lp, uint8_t *In) {
-  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  memset(In + tid*10, 1, 10);
 }

 __global__ void MemSet11(hipLaunchParm lp, uint8_t *In) {
-  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  memset(In + tid*11, 1, 11);
 }

 __global__ void MemSet12(hipLaunchParm lp, uint8_t *In) {
-  int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  memset(In + tid*12, 1, 12);
 }

@@ -37,7 +37,7 @@ __global__ void vadd_asm(hipLaunchParm lp,
                                float *out,
                                float *in)
 {
-    int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;

    asm volatile ("v_add_f32_e32 %0, %1, %2" : "=v" (out[i]) : "v"(in[i]),"v" (out[i]));
 }
@@ -49,7 +49,7 @@ void addCPUReference(
 {
    for(unsigned int j=0; j < NUM; j++)
    {
-        
+
        output[j]= input[j] + output[j];
    }
 }
@@ -35,7 +35,7 @@ __global__ void vmac_asm(hipLaunchParm lp,
                                float *out,
                                float *in)
 {
-    int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;

    asm volatile ("v_mac_f32_e32 %0, %2, %3" : "=v" (out[i]) : "0"(out[i]), "v" (a), "v" (in[i]));
 }
@@ -47,7 +47,7 @@ void CPUReference(
 {
    for(unsigned int j=0; j < NUM; j++)
    {
-        
+
        output[j]= a*input[j] + output[j];
    }
 }
@@ -33,7 +33,7 @@ void
 __launch_bounds__(256, 2)
 myKern(hipLaunchParm lp, int *C, const int *A, int N, int xfactor)
 {
-    int tid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
+    int tid = (blockIdx.x * blockDim.x + threadIdx.x);

    if (tid < N) {
        C[tid] = A[tid];
@@ -34,7 +34,7 @@ THE SOFTWARE.
 #define NUM_STREAMS 2

 __global__ void Iter(hipLaunchParm lp, int *Ad, int num){
-    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+    int tx = threadIdx.x + blockIdx.x * blockDim.x;
    // Kernel loop designed to execute very slowly... ... ...   so we can test timing-related behavior below
    if(tx == 0){
        for(int i = 0; i<num;i++){
@@ -33,7 +33,7 @@ THE SOFTWARE.
 #define SIZE LEN*sizeof(float)

 __global__ void Add(hipLaunchParm lp, float *Ad, float *Bd, float *Cd){
-int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+int tx = threadIdx.x + blockIdx.x * blockDim.x;
 Cd[tx] = Ad[tx] + Bd[tx];
 }

@@ -33,13 +33,13 @@
 #define SIZE LEN*sizeof(float)

 __global__ void Add(float *Ad, float *Bd, float *Cd){
-    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+    int tx = threadIdx.x + blockIdx.x * blockDim.x;
    Cd[tx] = Ad[tx] + Bd[tx];
 }


 __global__ void Set(int *Ad, int val){
-    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+    int tx = threadIdx.x + blockIdx.x * blockDim.x;
    Ad[tx] = val;
 }

@@ -52,13 +52,13 @@ std::vector<std::string> syncMsg = {"event", "stream", "device"};

 void CheckHostPointer(int numElements, int *ptr, unsigned eventFlags, int syncMethod, std::string msg)
 {
-    std::cerr << "test: CheckHostPointer " << msg 
+    std::cerr << "test: CheckHostPointer " << msg
              //<< " HIP_COHERENT_HOST_ALLOC=" << HIP_COHERENT_HOST_ALLOC
              //<< " HIP_EVENT_SYS_RELEASE=" <<   HIP_EVENT_SYS_RELEASE
-              << " eventFlags = " << std::hex << eventFlags 
-              << ((eventFlags & hipEventReleaseToDevice) ?  " hipEventReleaseToDevice" : "")  
-              << ((eventFlags & hipEventReleaseToSystem) ? " hipEventReleaseToSystem" : "") 
-              << " ptr=" << ptr 
+              << " eventFlags = " << std::hex << eventFlags
+              << ((eventFlags & hipEventReleaseToDevice) ?  " hipEventReleaseToDevice" : "")
+              << ((eventFlags & hipEventReleaseToSystem) ? " hipEventReleaseToSystem" : "")
+              << " ptr=" << ptr
              << " syncMethod=" << syncMsg[syncMethod] << "\n";

    hipStream_t s;
@@ -93,7 +93,7 @@ void CheckHostPointer(int numElements, int *ptr, unsigned eventFlags, int syncMe
        default:
            assert(0);
    };
-            
+
    for (int i=0; i<numElements; i++) {
        if (ptr[i] != expected) {
            printf ("mismatch at %d: %d != %d\n", i, ptr[i], expected);
@@ -153,7 +153,7 @@ int main(){
        size_t sizeBytes = numElements * sizeof (int);

 #ifdef __HIP_PLATFORM_HCC__
-        { 
+        {
            // Stimulate error condition:
            int *A = &numElements;
            HIPCHECK_API(hipHostMalloc((void**)&A, sizeBytes, hipHostMallocCoherent|hipHostMallocNonCoherent), hipErrorInvalidValue);
@@ -174,7 +174,7 @@ int main(){
            // agent-scope releases don't provide host visibility, don't use them here:
        }

-        if (1) { 
+        if (1) {
            int *A = nullptr;
            HIPCHECK(hipHostMalloc((void**)&A, sizeBytes, hipHostMallocCoherent));
            const char *ptrType = "coherent";
@@ -189,14 +189,14 @@ int main(){


        // Check defaults:
-        if (1) { 
+        if (1) {
            int *A = nullptr;
            HIPCHECK(hipHostMalloc((void**)&A, sizeBytes));
            const char *ptrType = "default";
            CheckHostPointer(numElements, A, 0, SYNC_DEVICE,   ptrType);
            CheckHostPointer(numElements, A, 0, SYNC_STREAM,  ptrType);
            CheckHostPointer(numElements, A, 0, SYNC_EVENT,   ptrType);
-            
+
            CheckHostPointer(numElements, A, 0, SYNC_DEVICE,   ptrType);
            CheckHostPointer(numElements, A, 0, SYNC_STREAM,  ptrType);
            CheckHostPointer(numElements, A, 0, SYNC_EVENT,   ptrType);
@@ -206,7 +206,7 @@ int main(){


    }
-        
+
    passed();

 }
@@ -29,13 +29,13 @@ THE SOFTWARE.
 #include<malloc.h>

 __global__ void Inc(hipLaunchParm lp, float *Ad){
-    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+    int tx = threadIdx.x + blockIdx.x * blockDim.x;
    Ad[tx] = Ad[tx] + float(1);
 }


 template<typename T>
-void doMemCopy(size_t numElements, int offset, T *A, T *Bh, T *Bd, bool internalRegister) 
+void doMemCopy(size_t numElements, int offset, T *A, T *Bh, T *Bd, bool internalRegister)
 {
    A = A + offset;
    numElements -= offset;
@@ -56,7 +56,7 @@ void doMemCopy(size_t numElements, int offset, T *A, T *Bh, T *Bd, bool internal
    HIPCHECK(hipMemset(Bd, 13.0f, sizeBytes));


-    // 
+    //
    HIPCHECK(hipMemcpy(Bd, A, sizeBytes,  hipMemcpyHostToDevice));
    HIPCHECK(hipMemcpy(Bh, Bd, sizeBytes, hipMemcpyDeviceToHost));

@@ -81,7 +81,7 @@ int main(int argc, char *argv[])

    const size_t size = N * sizeof(float);

-    if (p_tests & 0x1) { 
+    if (p_tests & 0x1) {
        float *A, **Ad;
        int num_devices;
        HIPCHECK(hipGetDeviceCount(&num_devices));
@@ -115,7 +115,7 @@ int main(int argc, char *argv[])
    }


-    if (p_tests & 0x6) { 
+    if (p_tests & 0x6) {
        // Sensitize HIP bug if device does not match where the memory was registered.
        HIPCHECK(hipSetDevice(0));

@@ -129,7 +129,7 @@ int main(int argc, char *argv[])
        HIPCHECK(hipMalloc(&Bd, size));

        // TODO - set to 128
-#define OFFSETS_TO_TRY 128 
+#define OFFSETS_TO_TRY 128
        assert (N>OFFSETS_TO_TRY);

        if (p_tests & 0x2) {
@@ -59,7 +59,7 @@ struct HostTraits<Pinned>
    static const char *Name() { return "Pinned"; } ;

    static void *Alloc(size_t sizeBytes) {
-        void *p; 
+        void *p;
        HIPCHECK(hipHostMalloc((void**)&p, sizeBytes, hipHostMallocDefault));
        return p;
    };
@@ -67,11 +67,11 @@ struct HostTraits<Pinned>


 template<typename T>
-__global__ void 
+__global__ void
 addK (hipLaunchParm lp, T *A, T K, size_t numElements)
 {
-    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
-    size_t stride = hipBlockDim_x * hipGridDim_x ;
+    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
+    size_t stride = blockDim.x * gridDim.x ;

    for (size_t i=offset; i<numElements; i+=stride) {
        A[i] = A[i] + K;
@@ -85,7 +85,7 @@ addK (hipLaunchParm lp, T *A, T K, size_t numElements)
 //IN: numInflight : number of copies inflight at any time:
 //IN: numPongs = number of iterations to run (iteration)
 template<typename T, class AllocType>
-void test_pingpong(hipStream_t stream, size_t numElements, int numInflight, int numPongs, bool doHostSide) 
+void test_pingpong(hipStream_t stream, size_t numElements, int numInflight, int numPongs, bool doHostSide)
 {
    HIPASSERT(numElements % numInflight == 0); // Must be evenly divisible.
    size_t Nbytes = numElements*sizeof(T);
@@ -95,7 +95,7 @@ void test_pingpong(hipStream_t stream, size_t numElements, int numInflight, int
    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);

    printf ("-----------------------------------------------------------------------------------------------\n");
-    printf ("testing: %s<%s>  Nbytes=%zu (%6.1f MB) numPongs=%d numInflight=%d eachCopyElements=%zu eachCopyBytes=%zu\n", 
+    printf ("testing: %s<%s>  Nbytes=%zu (%6.1f MB) numPongs=%d numInflight=%d eachCopyElements=%zu eachCopyBytes=%zu\n",
            __func__, HostTraits<AllocType>::Name(), Nbytes, (double)(Nbytes)/1024.0/1024.0, numPongs, numInflight, eachCopyElements, eachCopyBytes);

    T *A_h = NULL;
@@ -176,7 +176,7 @@ void test_manyInflightCopies(hipStream_t stream, int numElements, int numCopies,
    size_t eachCopyBytes = eachCopyElements * sizeof(T);

    printf ("-----------------------------------------------------------------------------------------------\n");
-    printf ("testing: %s  Nbytes=%zu (%6.1f MB) numCopies=%d eachCopyElements=%zu eachCopyBytes=%zu\n", 
+    printf ("testing: %s  Nbytes=%zu (%6.1f MB) numCopies=%d eachCopyElements=%zu eachCopyBytes=%zu\n",
            __func__, Nbytes, (double)(Nbytes)/1024.0/1024.0, numCopies, eachCopyElements, eachCopyBytes);

    T *A_d;
@@ -194,7 +194,7 @@ void test_manyInflightCopies(hipStream_t stream, int numElements, int numCopies,
    //stream=0; // fixme TODO


-    for (int i=0; i<numCopies; i++) 
+    for (int i=0; i<numCopies; i++)
    {
        HIPASSERT(A_d + i*eachCopyElements < A_d + Nbytes);
        HIPCHECK(hipMemcpyAsync(&A_d[i*eachCopyElements], &A_h1[i*eachCopyElements], eachCopyBytes, hipMemcpyHostToDevice, stream));
@@ -204,7 +204,7 @@ void test_manyInflightCopies(hipStream_t stream, int numElements, int numCopies,
        HIPCHECK(hipDeviceSynchronize());
    }

-    for (int i=0; i<numCopies; i++) 
+    for (int i=0; i<numCopies; i++)
    {
        HIPASSERT(A_d + i*eachCopyElements < A_d + Nbytes);
        HIPCHECK(hipMemcpyAsync(&A_h2[i*eachCopyElements], &A_d[i*eachCopyElements], eachCopyBytes, hipMemcpyDeviceToHost, stream));
@@ -252,7 +252,7 @@ void test_chunkedAsyncExample(int nStreams, bool useNullStream, bool useSyncMemc


    hipStream_t *stream = (hipStream_t*)malloc(sizeof(hipStream_t) * nStreams);
-    if (useNullStream) { 
+    if (useNullStream) {
        nStreams = 1;
        stream[0] = NULL;
    } else  {
@@ -262,7 +262,7 @@ void test_chunkedAsyncExample(int nStreams, bool useNullStream, bool useSyncMemc
    }


-    size_t workLeft = N; 
+    size_t workLeft = N;
    size_t workPerStream = N / nStreams;
    for (int i = 0; i < nStreams; ++i) {
        size_t work = (workLeft < workPerStream) ? workLeft : workPerStream;
@@ -287,7 +287,7 @@ void test_chunkedAsyncExample(int nStreams, bool useNullStream, bool useSyncMemc
        } else {
            HIPCHECK ( hipMemcpyAsync(&C_h[offset], &C_d[offset], workBytes, hipMemcpyDeviceToHost, stream[i]));
        }
-    } 
+    }


    HIPCHECK (hipDeviceSynchronize());
@@ -31,7 +31,7 @@ THE SOFTWARE.

 __global__ void Kernel(hipLaunchParm lp,volatile float* hostRes)
 {
-    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    hostRes[tid] = tid + 1;
    __threadfence_system();
    // expecting that the data is getting flushed to host here!
@@ -24,7 +24,7 @@ THE SOFTWARE.

 /* HIT_START
 * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
- * RUN: %t EXCLUDE_HIP_PLATFORM all 
+ * RUN: %t EXCLUDE_HIP_PLATFORM all
 * HIT_END
 */

@@ -63,8 +63,8 @@ int enablePeers(int dev0, int dev1)
 __global__ void
 memsetIntKernel(int * ptr, const int val, size_t numElements)
 {
-    int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
-    int stride = hipBlockDim_x * hipGridDim_x ;
+    int gid = (blockIdx.x * blockDim.x + threadIdx.x);
+    int stride = blockDim.x * gridDim.x ;
    for (size_t i= gid; i< numElements; i+=stride){
       ptr[i] = val;
    }
@@ -73,15 +73,15 @@ memsetIntKernel(int * ptr, const int val, size_t numElements)
 __global__ void
 memcpyIntKernel(const int * src, int* dst, size_t numElements)
 {
-    int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
-    int stride = hipBlockDim_x * hipGridDim_x ;
+    int gid = (blockIdx.x * blockDim.x + threadIdx.x);
+    int stride = blockDim.x * gridDim.x ;
    for (size_t i= gid; i< numElements; i+=stride){
       dst[i] = src[i];
    }
 };


-// CHeck arrays in reverse order, to more easily detect cases where 
+// CHeck arrays in reverse order, to more easily detect cases where
 // the copy is "partially" done.
 void checkReverse(const int *ptr, int numElements, int expected) {
    for (int i=numElements-1; i>=0; i--) {
@@ -34,7 +34,7 @@ THE SOFTWARE.
 #define kernel_name "hello_world"

 __global__ void Cpy(hipLaunchParm lp, float *Ad, float* Bd){
-  int tx = hipThreadIdx_x;
+  int tx = threadIdx.x;
  Bd[tx] = Ad[tx];
 }

@@ -24,7 +24,7 @@ THE SOFTWARE.

 extern "C" __global__ void hello_world(hipLaunchParm lp, float *a, float *b)
 {
-    int tx = hipThreadIdx_x;
+    int tx = threadIdx.x;
    b[tx] = a[tx];
 }

@@ -35,7 +35,7 @@ THE SOFTWARE.

 template<typename T>
 __global__ void Inc(hipLaunchParm lp, T *Array){
-int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+int tx = threadIdx.x + blockIdx.x * blockDim.x;
 Array[tx] = Array[tx] + T(1);
 }

@@ -116,7 +116,7 @@ int main(int argc, char **argv)
 	}

 	const size_t size = N * sizeof(float);
-    
+
    for (int i=0; i< iterations; i++) {

 	std::thread t1(run1, size, stream[0]);
@@ -126,7 +126,7 @@ int main(int argc, char **argv)
 //	std::cout<<"T1"<<std::endl;
 	t2.join();
 //	std::cout<<"T2"<<std::endl;
-	t3.join(); 
+	t3.join();
    }
 	passed();
 }
@@ -29,7 +29,7 @@ THE SOFTWARE.
 const int NN = 1 << 21;

 __global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){
-	int tid = hipThreadIdx_x;
+	int tid = threadIdx.x;
 	if(tid < 1){
 		for(int i=0;i<n;i++){
 			x[i] = sqrt(powf(3.14159,i));
@@ -39,7 +39,7 @@ __global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){
 }

 __global__ void nKernel(hipLaunchParm lp, float *y){
-	int tid = hipThreadIdx_x;
+	int tid = threadIdx.x;
 	y[tid] = y[tid] + 1.0f;
 }

@@ -31,7 +31,7 @@ THE SOFTWARE.
 const int NN = 1 << 21;

 __global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){
-	int tid = hipThreadIdx_x;
+	int tid = threadIdx.x;
 	if(tid < 1){
 		for(int i=0;i<n;i++){
 			x[i] = sqrt(powf(3.14159,i));
@@ -41,7 +41,7 @@ __global__ void kernel(hipLaunchParm lp, float *x, float *y, int n){
 }

 __global__ void nKernel(hipLaunchParm lp, float *y){
-	int tid = hipThreadIdx_x;
+	int tid = threadIdx.x;
 	y[tid] = y[tid] + 1.0f;
 }

@@ -41,8 +41,8 @@ vectorADDRepeat(hipLaunchParm lp,
            size_t NELEM,
            int repeat)
 {
-    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
-    size_t stride = hipBlockDim_x * hipGridDim_x ;
+    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
+    size_t stride = blockDim.x * gridDim.x ;

    for (int j=1; j<=repeat;j++) {
        for (size_t i=offset; i<NELEM; i+=stride) {
@@ -73,7 +73,7 @@ void D2H(T *Dst, T *Src, size_t size){

 template<typename T>
 __global__ void Inc(hipLaunchParm lp, T *In){
-int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+int tx = threadIdx.x + blockIdx.x * blockDim.x;
 In[tx] = In[tx] + 1;
 }

@@ -94,12 +94,12 @@ void initArrays(T **Ad, T **Ah,
 }

 template<typename T>
-void initArrays(T **Ad, size_t N, 
-		bool deviceMemory = false, 
+void initArrays(T **Ad, size_t N,
+		bool deviceMemory = false,
 		bool usePinnedHost = false){
 	size_t NBytes = N * sizeof(T);
 	if(deviceMemory){
-		HIPCHECK( hipMalloc(Ad, NBytes)); 
+		HIPCHECK( hipMalloc(Ad, NBytes));
 	}else{
 		if(usePinnedHost){
 			HIPCHECK(hipHostMalloc((void**)Ad, NBytes, hipHostMallocDefault));
@@ -27,14 +27,14 @@ THE SOFTWARE.
 // TODO - add code object support here.
 /* HIT_START
 * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
- * RUN: %t 
+ * RUN: %t
 * HIT_END
 */


 // Test cache management (fences) and synchronization between kernel and copy commands.
-// Exhaustively tests 3 command types (copy, kernel, module kernel), 
-// many sync types (see SyncType), followed by another command, across a sweep 
+// Exhaustively tests 3 command types (copy, kernel, module kernel),
+// many sync types (see SyncType), followed by another command, across a sweep
 // of data sizes designed to stress various levels of the memory hierarchy.

 #include "hip/hip_runtime.h"
@@ -102,8 +102,8 @@ MemcpyFunction g_moduleMemcpy("memcpyInt.hsaco", "memcpyIntKernel");
 __global__ void
 memsetIntKernel(int * ptr, const int val, size_t numElements)
 {
-    int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
-    int stride = hipBlockDim_x * hipGridDim_x ;
+    int gid = (blockIdx.x * blockDim.x + threadIdx.x);
+    int stride = blockDim.x * gridDim.x ;
    for (size_t i= gid; i< numElements; i+=stride){
       ptr[i] = val;
    }
@@ -112,15 +112,15 @@ memsetIntKernel(int * ptr, const int val, size_t numElements)
 __global__ void
 memcpyIntKernel(int *dst, const int * src, size_t numElements)
 {
-    int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
-    int stride = hipBlockDim_x * hipGridDim_x ;
+    int gid = (blockIdx.x * blockDim.x + threadIdx.x);
+    int stride = blockDim.x * gridDim.x ;
    for (size_t i= gid; i< numElements; i+=stride){
       dst[i] = src[i];
    }
 };


-// CHeck arrays in reverse order, to more easily detect cases where 
+// CHeck arrays in reverse order, to more easily detect cases where
 // the copy is "partially" done.
 void checkReverse(const int *ptr, int numElements, int expected) {
    int mismatchCnt = 0;
@@ -157,7 +157,7 @@ const char * CmdTypeStr(CmdType c)
        ENUM_CASE_STR(MODULE_KERNEL);
        default: return "UNKNOWN";
    };
-} 
+}


 enum SyncType {
@@ -223,16 +223,16 @@ void resetInputs( int * Ad, int * Bd, int *Cd, int *Ch, size_t numElements, int

 // Intended to test proper synchronization and cache flushing between CMDA and CMDB.
 // CMD are of type CmdType. All command copy memory, using either hipMemcpyAsync or kernel implementations.
-// CmdA copies from Ad to Bd, 
+// CmdA copies from Ad to Bd,
 // Some form of synchronization is applied.
 // Then cmdB copies from Bd to Cd.
 //
 // Cd is then copied to host Ch using a memory copy.
 //
 // Correct result at the end is that Ch contains the contents originally in Ad (integer 0x42)
-void runTestImpl(CmdType cmdAType, SyncType syncType, CmdType cmdBType, 
+void runTestImpl(CmdType cmdAType, SyncType syncType, CmdType cmdBType,
            hipStream_t stream1, hipStream_t stream2, int numElements,
-             int * Ad, int * Bd, int *Cd, int *Ch, 
+             int * Ad, int * Bd, int *Cd, int *Ch,
             int expected)
 {
    hipEvent_t e;
@@ -241,14 +241,14 @@ void runTestImpl(CmdType cmdAType, SyncType syncType, CmdType cmdBType,
    resetInputs(Ad, Bd, Cd, Ch, numElements, expected);

    const size_t sizeElements = numElements * sizeof(int);
-    fprintf (stderr, "test: runTest with %zu bytes (%6.2f MB) cmdA=%s; sync=%s; cmdB=%s\n", 
+    fprintf (stderr, "test: runTest with %zu bytes (%6.2f MB) cmdA=%s; sync=%s; cmdB=%s\n",
            sizeElements, (double) (sizeElements/1024.0), CmdTypeStr(cmdAType), SyncTypeStr(syncType), CmdTypeStr(cmdBType));

    if (SKIP_MODULE_KERNEL && ((cmdAType == MODULE_KERNEL) || (cmdBType == MODULE_KERNEL))) {
        fprintf (stderr, "warn: skipping since test infra does not yet support modules\n");
        return;
    }
-            
+

    // Step A:
    runCmd(cmdAType, Bd, Ad, stream1, numElements);
@@ -334,7 +334,7 @@ void testWrapper(size_t numElements)
    fprintf (stderr, "test: init complete, start running tests\n");


-    runTestImpl(COPY, EVENT_SYNC, KERNEL, stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected); 
+    runTestImpl(COPY, EVENT_SYNC, KERNEL, stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected);

    for (int cmdA=0; cmdA<MAX_CmdType; cmdA++) {
        for (int cmdB=0; cmdB<MAX_CmdType; cmdB++) {
@@ -347,7 +347,7 @@ void testWrapper(size_t numElements)
                    //case STREAM_QUERY:
                    case STREAM_SYNC:
                    case DEVICE_SYNC:
-                        runTestImpl(CmdType(cmdA), SyncType(syncMode), CmdType(cmdB), stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected); 
+                        runTestImpl(CmdType(cmdA), SyncType(syncMode), CmdType(cmdB), stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected);
                        break;
                    default:
                        break;
@@ -358,11 +358,11 @@ void testWrapper(size_t numElements)
    }

 #if 0
-    runTestImpl(COPY, STREAM_SYNC, MODULE_KERNEL, stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected); 
-    runTestImpl(COPY, STREAM_SYNC, KERNEL, stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected); 
-    runTestImpl(COPY, STREAM_WAIT_EVENT, MODULE_KERNEL, stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected); 
+    runTestImpl(COPY, STREAM_SYNC, MODULE_KERNEL, stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected);
+    runTestImpl(COPY, STREAM_SYNC, KERNEL, stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected);
+    runTestImpl(COPY, STREAM_WAIT_EVENT, MODULE_KERNEL, stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected);

-    runTestImpl(COPY, STREAM_WAIT_EVENT, KERNEL, stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected); 
+    runTestImpl(COPY, STREAM_WAIT_EVENT, KERNEL, stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected);
 #endif

    HIPCHECK(hipFree(Ad));
@@ -1,12 +1,12 @@
 #include <hip/hip_runtime.h>


-    
+
 extern "C" __global__ void
 memcpyIntKernel(hipLaunchParm lp, int *dst, const int * src, size_t numElements)
 {
-    int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
-    int stride = hipBlockDim_x * hipGridDim_x ;
+    int gid = (blockIdx.x * blockDim.x + threadIdx.x);
+    int stride = blockDim.x * gridDim.x ;
    for (size_t i= gid; i< numElements; i+=stride){
       dst[i] = src[i];
    }
@@ -30,7 +30,7 @@ THE SOFTWARE.
 #define ITER 1<<10

 __global__ void Iter(hipLaunchParm lp, int *Ad, int num){
-    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+    int tx = threadIdx.x + blockIdx.x * blockDim.x;
    if(tx == 0){
        for(int i = 0; i<num;i++){
            Ad[tx] += 1;
@@ -58,7 +58,7 @@ int main(){
    }

    std::cout<<"Waitin..."<<std::endl;
-    
+
    hipDeviceSynchronize();
 }

@@ -26,7 +26,7 @@ THE SOFTWARE.
 static size_t size[NUM_SIZE];

 __global__ void Add(hipLaunchParm lp, int *Ad){
-    int tx = hipThreadIdx_x;
+    int tx = threadIdx.x;
    Ad[tx] = Ad[tx] + tx;
 }

@@ -26,7 +26,7 @@ THE SOFTWARE.
 static size_t size[NUM_SIZE];

 __global__ void Add(hipLaunchParm lp, int *Ad){
-    int tx = hipThreadIdx_x;
+    int tx = threadIdx.x;
    Ad[tx] = Ad[tx] + tx;
 }

@@ -30,7 +30,7 @@ THE SOFTWARE.
 #define ITER 1<<10

 __global__ void Iter(hipLaunchParm lp, int *Ad, int num){
-    int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
+    int tx = threadIdx.x + blockIdx.x * blockDim.x;
    if(tx == 0){
        for(int i = 0; i<num;i++){
            Ad[tx] += 1;
@@ -57,7 +57,7 @@ int main(){
    }

    std::cout<<"Waitin..."<<std::endl;
-    
+
    hipDeviceSynchronize();
 }

@@ -136,8 +136,8 @@ vectorADD(hipLaunchParm lp,
            T *C_d,
            size_t NELEM)
 {
-    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
-    size_t stride = hipBlockDim_x * hipGridDim_x ;
+    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
+    size_t stride = blockDim.x * gridDim.x ;

    for (size_t i=offset; i<NELEM; i+=stride) {
        C_d[i] = A_d[i] + B_d[i];
@@ -153,8 +153,8 @@ vectorADDReverse(hipLaunchParm lp,
            T *C_d,
            size_t NELEM)
 {
-    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
-    size_t stride = hipBlockDim_x * hipGridDim_x ;
+    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
+    size_t stride = blockDim.x * gridDim.x ;

    for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) {
        C_d[i] = A_d[i] + B_d[i];
@@ -169,8 +169,8 @@ addCount( const T *A_d,
        size_t NELEM,
        int count)
 {
-    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
-    size_t stride = hipBlockDim_x * hipGridDim_x ;
+    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
+    size_t stride = blockDim.x * gridDim.x ;

    // Deliberately do this in an inefficient way to increase kernel runtime
    for (int i=0; i<count; i++) {
@@ -188,8 +188,8 @@ addCountReverse( const T *A_d,
        int64_t NELEM,
        int count)
 {
-    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
-    size_t stride = hipBlockDim_x * hipGridDim_x ;
+    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
+    size_t stride = blockDim.x * gridDim.x ;

    // Deliberately do this in an inefficient way to increase kernel runtime
    for (int i=0; i<count; i++) {
@@ -205,8 +205,8 @@ __global__ void
 memsetReverse( T *C_d,  T val,
        int64_t NELEM)
 {
-    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
-    size_t stride = hipBlockDim_x * hipGridDim_x ;
+    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
+    size_t stride = blockDim.x * gridDim.x ;

    for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) {
        C_d[i] = val;
@@ -219,12 +219,12 @@ void setDefaultData(size_t numElements, T *A_h, T* B_h, T *C_h)
 {
    // Initialize the host data:
    for (size_t i=0; i<numElements; i++) {
-        if (A_h) 
+        if (A_h)
            (A_h)[i] = 3.146f + i; // Pi
-        if (B_h) 
+        if (B_h)
            (B_h)[i] = 1.618f + i; // Phi
-        if (C_h) 
-            (C_h)[i] = 0.0f + i;  
+        if (C_h)
+            (C_h)[i] = 0.0f + i;
    }
 }

@@ -268,8 +268,8 @@ void initArraysForHost(T **A_h, T **B_h, T **C_h,

 template <typename T>
 void initArrays(T **A_d, T **B_d, T **C_d,
-                T **A_h, T **B_h, T **C_h, 
-                size_t N, bool usePinnedHost=false) 
+                T **A_h, T **B_h, T **C_h,
+                size_t N, bool usePinnedHost=false)
 {
    size_t Nbytes = N*sizeof(T);

@@ -317,7 +317,7 @@ void freeArraysForHost(T *A_h, T *B_h, T *C_h, bool usePinnedHost)

 template <typename T>
 void freeArrays(T *A_d, T *B_d, T *C_d,
-                T *A_h, T *B_h, T *C_h, bool usePinnedHost) 
+                T *A_h, T *B_h, T *C_h, bool usePinnedHost)
 {
    if (A_d) {
        HIPCHECK ( hipFree(A_d) );
@@ -453,9 +453,9 @@ struct Pinned {
 	static const bool isPinned = true;
 	static const char *str() { return "Pinned"; };

-    static void *Alloc(size_t sizeBytes) 
+    static void *Alloc(size_t sizeBytes)
 	{
-        void *p; 
+        void *p;
        HIPCHECK(hipHostMalloc((void**)&p, sizeBytes));
        return p;
    };
@@ -463,12 +463,12 @@ struct Pinned {


 //---
-struct Unpinned 
+struct Unpinned
 {
 	static const bool isPinned = false;
 	static const char *str() { return "Unpinned"; };

-    static void *Alloc(size_t sizeBytes) 
+    static void *Alloc(size_t sizeBytes)
 	{
        void *p  = malloc (sizeBytes);
        HIPASSERT(p);
@@ -496,7 +496,7 @@ template<>
 struct MemTraits<Memcpy>
 {

-    static void Copy(void *dest, const void *src, size_t sizeBytes, hipMemcpyKind kind,  hipStream_t stream) 
+    static void Copy(void *dest, const void *src, size_t sizeBytes, hipMemcpyKind kind,  hipStream_t stream)
 	{
 		HIPCHECK(hipMemcpy(dest, src, sizeBytes, kind));
 	}
@@ -507,7 +507,7 @@ template<>
 struct MemTraits<MemcpyAsync>
 {

-    static void Copy(void *dest, const void *src, size_t sizeBytes, hipMemcpyKind kind,  hipStream_t stream) 
+    static void Copy(void *dest, const void *src, size_t sizeBytes, hipMemcpyKind kind,  hipStream_t stream)
 	{
 		HIPCHECK(hipMemcpyAsync(dest, src, sizeBytes, kind, stream));
 	}
@@ -17,8 +17,8 @@ __global__ void tex2DKernel(float* outputData,
                             int width,
                             int height)
 {
-    int x = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x;
-    int y = hipBlockIdx_y*hipBlockDim_y + hipThreadIdx_y;
+    int x = blockIdx.x*blockDim.x + threadIdx.x;
+    int y = blockIdx.y*blockDim.y + threadIdx.y;
    outputData[y*width + x] = tex2D<float>(textureObject, x, y);
 }

@@ -18,8 +18,8 @@ __global__ void tex2DKernel(float* outputData,
                             int width,
                             int height)
 {
-    int x = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x;
-    int y = hipBlockIdx_y*hipBlockDim_y + hipThreadIdx_y;
+    int x = blockIdx.x*blockDim.x + threadIdx.x;
+    int y = blockIdx.y*blockDim.y + threadIdx.y;
 #ifdef __HIP_PLATFORM_HCC__
    outputData[y*width + x] = tex2D(tex, textureObject, x, y);
 #else