From e6ded458987e53366259dcefcb0c29122b0ac450 Mon Sep 17 00:00:00 2001
From: ROCm CI Service Account <66695075+rocm-ci@users.noreply.github.com>
Date: Thu, 2 Jun 2022 17:07:52 +0530
Subject: [PATCH] SWDEV-339113 - Update sampl codes with correct kernel
 coordinate (#2704)

Change-Id: Ibfc0fd285441cd3d79b312d2b739729a039a6f84
---
 samples/0_Intro/bit_extract/bit_extract.cpp   |  4 +--
 samples/0_Intro/module_api/vcpy_kernel.cpp    |  2 +-
 .../0_Intro/module_api_global/vcpy_kernel.cpp |  4 +--
 samples/0_Intro/square/square.hipref.cpp      |  4 +--
 .../0_MatrixTranspose/MatrixTranspose.cpp     |  4 +--
 .../2_Cookbook/0_MatrixTranspose/Readme.md    | 12 +++----
 .../2_Cookbook/10_inline_asm/inline_asm.cpp   |  4 +--
 .../11_texture_driver/tex2dKernel.cpp         | 32 +++++++++----------
 .../MatrixTranspose.cpp                       |  4 +--
 samples/2_Cookbook/14_gpu_arch/gpuarch.cpp    |  2 +-
 .../6_dynamic_shared/dynamic_shared.cpp       |  4 +--
 samples/2_Cookbook/7_streams/stream.cpp       |  8 ++---
 samples/2_Cookbook/8_peer2peer/peer2peer.cpp  |  8 ++---
 samples/2_Cookbook/9_unroll/unroll.cpp        |  2 +-
 14 files changed, 47 insertions(+), 47 deletions(-)
diff --git a/samples/0_Intro/bit_extract/bit_extract.cpp b/samples/0_Intro/bit_extract/bit_extract.cpp
index c0d1f84486..cf0440dd57 100644
--- a/samples/0_Intro/bit_extract/bit_extract.cpp
+++ b/samples/0_Intro/bit_extract/bit_extract.cpp
@@ -35,8 +35,8 @@ THE SOFTWARE.
     }
 
 __global__ void bit_extract_kernel(uint32_t* C_d, const uint32_t* A_d, size_t N) {
-    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
-    size_t stride = hipBlockDim_x * hipGridDim_x;
+    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
+    size_t stride = blockDim.x * gridDim.x;
 
     for (size_t i = offset; i < N; i += stride) {
 #ifdef __HIP_PLATFORM_AMD__
diff --git a/samples/0_Intro/module_api/vcpy_kernel.cpp b/samples/0_Intro/module_api/vcpy_kernel.cpp
index 4e1fa558f6..214a869b22 100644
--- a/samples/0_Intro/module_api/vcpy_kernel.cpp
+++ b/samples/0_Intro/module_api/vcpy_kernel.cpp
@@ -23,6 +23,6 @@ THE SOFTWARE.
 #include "hip/hip_runtime.h"
 
 extern "C" __global__ void hello_world(float* a, float* b) {
-    int tx = hipThreadIdx_x;
+    int tx = threadIdx.x;
     b[tx] = a[tx];
 }
diff --git a/samples/0_Intro/module_api_global/vcpy_kernel.cpp b/samples/0_Intro/module_api_global/vcpy_kernel.cpp
index e7886d0b8e..c0e820a4b7 100644
--- a/samples/0_Intro/module_api_global/vcpy_kernel.cpp
+++ b/samples/0_Intro/module_api_global/vcpy_kernel.cpp
@@ -28,11 +28,11 @@ __device__ float myDeviceGlobal;
 __device__ float myDeviceGlobalArray[16];
 
 extern "C" __global__ void hello_world(const float* a, float* b) {
-    int tx = hipThreadIdx_x;
+    int tx = threadIdx.x;
     b[tx] = a[tx];
 }
 
 extern "C" __global__ void test_globals(const float* a, float* b) {
-    int tx = hipThreadIdx_x;
+    int tx = threadIdx.x;
     b[tx] = a[tx] + myDeviceGlobal + myDeviceGlobalArray[tx % ARRAY_SIZE];
 }
diff --git a/samples/0_Intro/square/square.hipref.cpp b/samples/0_Intro/square/square.hipref.cpp
index 7c2f794e8b..b8213e0074 100644
--- a/samples/0_Intro/square/square.hipref.cpp
+++ b/samples/0_Intro/square/square.hipref.cpp
@@ -38,8 +38,8 @@ THE SOFTWARE.
  */
 template <typename T>
 __global__ void vector_square(T* C_d, const T* A_d, size_t N) {
-    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
-    size_t stride = hipBlockDim_x * hipGridDim_x;
+    size_t offset = (blockIdx.x * blockDim_x + threadIdx.x);
+    size_t stride = blockDim.x * gridDim.x;
 
     for (size_t i = offset; i < N; i += stride) {
         C_d[i] = A_d[i] * A_d[i];
diff --git a/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp b/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp
index 2457505908..8444cff851 100644
--- a/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp
+++ b/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp
@@ -37,8 +37,8 @@ THE SOFTWARE.
 
 // Device (Kernel) function, it must be void
 __global__ void matrixTranspose(float* out, float* in, const int width) {
-    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
-    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+    int x = blockDim.x * blockIdx.x + threadIdx.x;
+    int y = blockDim.y * blockIdx.y + threadIdx.y;
 
     out[y * width + x] = in[x * width + y];
 }
diff --git a/samples/2_Cookbook/0_MatrixTranspose/Readme.md b/samples/2_Cookbook/0_MatrixTranspose/Readme.md
index 432f9180dc..b53549ada7 100644
--- a/samples/2_Cookbook/0_MatrixTranspose/Readme.md
+++ b/samples/2_Cookbook/0_MatrixTranspose/Readme.md
@@ -27,8 +27,8 @@ __global__ void matrixTranspose(float *out,
                                 const int width,
                                 const int height)
 {
-    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
-    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+    int x = blockDim.x * blockIdx.x + bhreadIdx.x;
+    int y = blockDim.y * blockIdx.y + bhreadIdx.y;
 
     out[y * width + x] = in[x * height + y];
 }
@@ -39,7 +39,7 @@ other function-type qualifiers are:
 `__device__` functions are Executed on the device and Called from the device only
 `__host__` functions are Executed on the host and Called from the host
 
-`__host__` can combine with `__device__`, in which case the function compiles for both the host and device. These functions cannot use the HIP grid coordinate functions (for example, "hipThreadIdx_x", will talk about it latter). A possible workaround is to pass the necessary coordinate info as an argument to the function.
+`__host__` can combine with `__device__`, in which case the function compiles for both the host and device. These functions cannot use the HIP grid coordinate functions (for example, "threadIdx.x", will talk about it latter). A possible workaround is to pass the necessary coordinate info as an argument to the function.
 `__host__` cannot combine with `__global__`.
 
 `__global__` functions are often referred to as *kernels*, and calling one is termed *launching the kernel*.
@@ -47,9 +47,9 @@ other function-type qualifiers are:
 Next keyword is `void`. HIP `__global__` functions must have a `void` return type. Global functions require the caller to specify an "execution configuration" that includes the grid and block dimensions. The execution configuration can also include other information for the launch, such as the amount of additional shared memory to allocate and the stream where the kernel should execute.
 
 The kernel function begins with
-`    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;`
-`    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;`
-here the keyword hipBlockIdx_x, hipBlockIdx_y and hipBlockIdx_z(not used here) are the built-in functions to identify the threads in a block. The keyword hipBlockDim_x, hipBlockDim_y and hipBlockDim_z(not used here) are to identify the dimensions of the block.
+`    int x = blockDim.x * blockIdx.x + threadIdx.x;`
+`    int y = blockDim.y * blockIdx.y + threadIdx.y;`
+here the keyword blockIdx.x, blockIdx.y and blockIdx.z(not used here) are the built-in functions to identify the threads in a block. The keyword blockDim.x, blockDim.y and blockDim.z(not used here) are to identify the dimensions of the block.
 
 We are familiar with rest of the code on device-side.
 
diff --git a/samples/2_Cookbook/10_inline_asm/inline_asm.cpp b/samples/2_Cookbook/10_inline_asm/inline_asm.cpp
index 7c4b77da34..8145b3c86e 100644
--- a/samples/2_Cookbook/10_inline_asm/inline_asm.cpp
+++ b/samples/2_Cookbook/10_inline_asm/inline_asm.cpp
@@ -35,8 +35,8 @@ THE SOFTWARE.
 
 // Device (Kernel) function, it must be void
 __global__ void matrixTranspose(float* out, float* in, const int width) {
-    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
-    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+    int x = blockDim.x * blockIdx.x + threadIdx.x;
+    int y = blockDim.y * blockIdx.y + threadIdx.y;
 
     asm volatile("v_mov_b32_e32 %0, %1" : "=v"(out[x * width + y]) : "v"(in[y * width + x]));
 }
diff --git a/samples/2_Cookbook/11_texture_driver/tex2dKernel.cpp b/samples/2_Cookbook/11_texture_driver/tex2dKernel.cpp
index 120f31c610..a1d3985de5 100644
--- a/samples/2_Cookbook/11_texture_driver/tex2dKernel.cpp
+++ b/samples/2_Cookbook/11_texture_driver/tex2dKernel.cpp
@@ -32,49 +32,49 @@ texture<int4, hipTextureType2D, hipReadModeElementType> texInt4;
 texture<float4, hipTextureType2D, hipReadModeElementType> texFloat4;
 
 extern "C" __global__ void tex2dKernelChar(char* outputData, int width, int height) {
-    int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
-    int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
     outputData[y * width + x] = tex2D(texChar, x, y);
 }
 
 extern "C" __global__ void tex2dKernelShort(short* outputData, int width, int height) {
-    int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
-    int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
     outputData[y * width + x] = tex2D(texShort, x, y);
 }
 
 extern "C" __global__ void tex2dKernelInt(int* outputData, int width, int height) {
-    int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
-    int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
     outputData[y * width + x] = tex2D(texInt, x, y);
 }
 
 extern "C" __global__ void tex2dKernelFloat(float* outputData, int width, int height) {
-    int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
-    int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
     outputData[y * width + x] = tex2D(texFloat, x, y);
 }
 
 extern "C" __global__ void tex2dKernelChar4(char4* outputData, int width, int height) {
-    int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
-    int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
     outputData[y * width + x] = tex2D(texChar4, x, y);
 }
 
 extern "C" __global__ void tex2dKernelShort4(short4* outputData, int width, int height) {
-    int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
-    int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
     outputData[y * width + x] = tex2D(texShort4, x, y);
 }
 
 extern "C" __global__ void tex2dKernelInt4(int4* outputData, int width, int height) {
-    int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
-    int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
     outputData[y * width + x] = tex2D(texInt4, x, y);
 }
 
 extern "C" __global__ void tex2dKernelFloat4(float4* outputData, int width, int height) {
-    int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
-    int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
     outputData[y * width + x] = tex2D(texFloat4, x, y);
 }
diff --git a/samples/2_Cookbook/12_cmake_hip_add_executable/MatrixTranspose.cpp b/samples/2_Cookbook/12_cmake_hip_add_executable/MatrixTranspose.cpp
index 2457505908..8444cff851 100644
--- a/samples/2_Cookbook/12_cmake_hip_add_executable/MatrixTranspose.cpp
+++ b/samples/2_Cookbook/12_cmake_hip_add_executable/MatrixTranspose.cpp
@@ -37,8 +37,8 @@ THE SOFTWARE.
 
 // Device (Kernel) function, it must be void
 __global__ void matrixTranspose(float* out, float* in, const int width) {
-    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
-    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+    int x = blockDim.x * blockIdx.x + threadIdx.x;
+    int y = blockDim.y * blockIdx.y + threadIdx.y;
 
     out[y * width + x] = in[x * width + y];
 }
diff --git a/samples/2_Cookbook/14_gpu_arch/gpuarch.cpp b/samples/2_Cookbook/14_gpu_arch/gpuarch.cpp
index b4c8487b67..f1b521fcd1 100644
--- a/samples/2_Cookbook/14_gpu_arch/gpuarch.cpp
+++ b/samples/2_Cookbook/14_gpu_arch/gpuarch.cpp
@@ -36,7 +36,7 @@ THE SOFTWARE.
 // 'out'
 //              but it will update with "NOT_SUPPORTED" for any other gfx archs.
 __global__ void incrementKernel(int32_t* in, int32_t* out, int32_t value, size_t buffSize) {
-  int index = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+  int index = blockDim.x * blockIdx.x + threadIdx.x;
   if (index < buffSize) {
 #if defined(__gfx908__)
     out[index] = in[index] + value;
diff --git a/samples/2_Cookbook/6_dynamic_shared/dynamic_shared.cpp b/samples/2_Cookbook/6_dynamic_shared/dynamic_shared.cpp
index 3e1c0f4b8c..531d94c5be 100644
--- a/samples/2_Cookbook/6_dynamic_shared/dynamic_shared.cpp
+++ b/samples/2_Cookbook/6_dynamic_shared/dynamic_shared.cpp
@@ -37,8 +37,8 @@ THE SOFTWARE.
 __global__ void matrixTranspose(float* out, float* in, const int width) {
     extern __shared__ float sharedMem[];
 
-    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
-    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+    int x = blockDim.x * blockIdx.x + threadIdx.x;
+    int y = blockDim.y * blockIdx.y + threadIdx.y;
 
     sharedMem[y * width + x] = in[x * width + y];
 
diff --git a/samples/2_Cookbook/7_streams/stream.cpp b/samples/2_Cookbook/7_streams/stream.cpp
index b534b46b02..06da516444 100644
--- a/samples/2_Cookbook/7_streams/stream.cpp
+++ b/samples/2_Cookbook/7_streams/stream.cpp
@@ -37,8 +37,8 @@ __global__ void matrixTranspose_static_shared(float* out, float* in,
                                               const int width) {
     __shared__ float sharedMem[WIDTH * WIDTH];
 
-    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
-    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+    int x = blockDim.x * blockIdx.x + threadIdx.x;
+    int y = blockDim.y * blockIdx.y + threadIdx.y;
 
     sharedMem[y * width + x] = in[x * width + y];
 
@@ -51,8 +51,8 @@ __global__ void matrixTranspose_dynamic_shared(float* out, float* in,
                                                const int width) {
     extern __shared__ float sharedMem[];
 
-    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
-    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+    int x = blockDim.x * blockIdx.x + threadIdx.x;
+    int y = blockDim.y * blockIdx.y + threadIdx.y;
 
     sharedMem[y * width + x] = in[x * width + y];
 
diff --git a/samples/2_Cookbook/8_peer2peer/peer2peer.cpp b/samples/2_Cookbook/8_peer2peer/peer2peer.cpp
index 4ec6372108..6b5c390b17 100644
--- a/samples/2_Cookbook/8_peer2peer/peer2peer.cpp
+++ b/samples/2_Cookbook/8_peer2peer/peer2peer.cpp
@@ -110,8 +110,8 @@ __global__ void matrixTranspose_static_shared(float* out, float* in,
                                               const int width) {
     __shared__ float sharedMem[WIDTH * WIDTH];
 
-    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
-    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+    int x = blockDim.x * blockIdx.x + threadIdx.x;
+    int y = blockDim.y * blockIdx.y + threadIdx.y;
 
     sharedMem[y * width + x] = in[x * width + y];
 
@@ -124,8 +124,8 @@ __global__ void matrixTranspose_dynamic_shared(float* out, float* in,
                                                const int width) {
     extern __shared__ float sharedMem[];
 
-    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
-    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
+    int x = blockDim.x * blockIdx.x + threadIdx.x;
+    int y = blockDim.y * blockIdx.y + threadIdx.y;
 
     sharedMem[y * width + x] = in[x * width + y];
 
diff --git a/samples/2_Cookbook/9_unroll/unroll.cpp b/samples/2_Cookbook/9_unroll/unroll.cpp
index 8d659840d4..18f910a5dd 100644
--- a/samples/2_Cookbook/9_unroll/unroll.cpp
+++ b/samples/2_Cookbook/9_unroll/unroll.cpp
@@ -43,7 +43,7 @@ void matrixRowSum(int* input, int* output, int width) {
 
 // Device (kernel) function
 __global__ void gpuMatrixRowSum(int* input, int* output, int width) {
-    int index = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
 #pragma unroll
     for (int i = 0; i < width; i++) {
         output[index] += input[index * width + i];