Replace hipLaunchKernel -> hipLaunchKernelGGL

Change-Id: I4d99009e1199811d417becf1e1b934ec4d4e30be
2018-10-17 12:01:44 +05:30
parent 9e167ab02e
commit bfceb14751
23 changed files with 56 additions and 69 deletions
@@ -38,7 +38,7 @@ THE SOFTWARE.
        }                                                                                          \
    }

-__global__ void bit_extract_kernel(hipLaunchParm lp, uint32_t* C_d, const uint32_t* A_d, size_t N) {
+__global__ void bit_extract_kernel(uint32_t* C_d, const uint32_t* A_d, size_t N) {
    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
    size_t stride = hipBlockDim_x * hipGridDim_x;

@@ -85,7 +85,7 @@ int main(int argc, char* argv[]) {
    printf("info: launch 'bit_extract_kernel' \n");
    const unsigned blocks = 512;
    const unsigned threadsPerBlock = 256;
-    hipLaunchKernel(bit_extract_kernel, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);
+    hipLaunchKernelGGL(bit_extract_kernel, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);

    printf("info: copy Device2Host\n");
    CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
@@ -22,7 +22,7 @@ THE SOFTWARE.

 #include "hip/hip_runtime.h"

-__global__ void vadd_hip(hipLaunchParm lp, const float* a, const float* b, float* c, int N) {
+__global__ void vadd_hip(const float* a, const float* b, float* c, int N) {
    int idx = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);

    if (idx < N) {
@@ -60,7 +60,7 @@ int main(int argc, char* argv[]) {
    // Launch kernel onto default accelerator
    int blockSize = 256;                                      // pick arbitrary block size
    int blocks = (sizeElements + blockSize - 1) / blockSize;  // round up to launch enough blocks
-    hipLaunchKernel(vadd_hip, dim3(blocks), dim3(blockSize), 0, 0, A_d, B_d, C_d, sizeElements);
+    hipLaunchKernelGGL(vadd_hip, dim3(blocks), dim3(blockSize), 0, 0, A_d, B_d, C_d, sizeElements);

    // D2H Copy
    hipMemcpy(C_h, C_d, sizeBytes, hipMemcpyDeviceToHost);
@@ -37,7 +37,7 @@ THE SOFTWARE.
 * Square each element in the array A and write to array C.
 */
 template <typename T>
-__global__ void vector_square(hipLaunchParm lp, T* C_d, const T* A_d, size_t N) {
+__global__ void vector_square(T* C_d, const T* A_d, size_t N) {
    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
    size_t stride = hipBlockDim_x * hipGridDim_x;

@@ -81,7 +81,7 @@ int main(int argc, char* argv[]) {
    const unsigned threadsPerBlock = 256;

    printf("info: launch 'vector_square' kernel\n");
-    hipLaunchKernel(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);
+    hipLaunchKernelGGL(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);

    printf("info: copy Device2Host\n");
    CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
@@ -434,7 +434,7 @@ class KernelCommand : public Command {

        switch (_kind) {
            case Null:
-                hipLaunchKernel(NullKernel, dim3(gridX / groupX), dim3(gridX), 0, _stream, nullptr);
+                hipLaunchKernelGGL(NullKernel, dim3(gridX / groupX), dim3(gridX), 0, _stream, nullptr);
                break;
            case VectorAdd:
                assert(0);  // TODO
@@ -1,6 +1,6 @@
 #include "hip/hip_runtime.h"

-extern "C" __global__ void NullKernel(hipLaunchParm lp, float* Ad) {
+extern "C" __global__ void NullKernel(float* Ad) {
    if (Ad) {
        Ad[0] = 42;
    }
@@ -3,7 +3,7 @@
 static const int BLOCKSIZEX = 32;
 static const int BLOCKSIZEY = 16;

-__global__ void fails(hipLaunchParm lp, float* pErrorI) {
+__global__ void fails(float* pErrorI) {
    if (pErrorI != 0) {
        pErrorI[0] = 1;
    }
@@ -14,5 +14,5 @@ int main() {
    dim3 threads(BLOCKSIZEX, BLOCKSIZEY);
    float error;

-    hipLaunchKernel(HIP_KERNEL_NAME(fails), blocks, threads, 0, 0, &error);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(fails), blocks, threads, 0, 0, &error);
 }
@@ -48,7 +48,7 @@ const unsigned p_tests = 0xfffffff;


 // HCC optimizes away fully NULL kernel calls, so run one that is nearly null:
-__global__ void NearlyNull(hipLaunchParm lp, float* Ad) {
+__global__ void NearlyNull(float* Ad) {
    if (Ad) {
        Ad[0] = 42;
    }
@@ -94,14 +94,14 @@ int main() {

    if (p_tests & 0x1) {
        hipEventRecord(start);
-        hipLaunchKernel(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
+        hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
        stopTest(start, stop, "FirstKernelLaunch", 1);
    }


    if (p_tests & 0x2) {
        hipEventRecord(start);
-        hipLaunchKernel(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
+        hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
        stopTest(start, stop, "SecondKernelLaunch", 1);
    }

@@ -110,7 +110,7 @@ int main() {
        for (int t = 0; t < TEST_ITERS; t++) {
            hipEventRecord(start);
            for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
-                hipLaunchKernel(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
+                hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
                hipEventRecord(sync);
                hipEventSynchronize(sync);
            }
@@ -123,7 +123,7 @@ int main() {
        for (int t = 0; t < TEST_ITERS; t++) {
            hipEventRecord(start);
            for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
-                hipLaunchKernel(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream, Ad);
+                hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream, Ad);
                hipEventRecord(sync);
                hipEventSynchronize(sync);
            }
@@ -137,7 +137,7 @@ int main() {
        for (int t = 0; t < TEST_ITERS; t++) {
            hipEventRecord(start);
            for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
-                hipLaunchKernel(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
+                hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
            }
            stopTest(start, stop, "NullStreamASyncDispatchNoWait", DISPATCHES_PER_TEST);
        }
@@ -147,7 +147,7 @@ int main() {
        for (int t = 0; t < TEST_ITERS; t++) {
            hipEventRecord(start);
            for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
-                hipLaunchKernel(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream, Ad);
+                hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream, Ad);
            }
            stopTest(start, stop, "StreamASyncDispatchNoWait", DISPATCHES_PER_TEST);
        }
@@ -36,8 +36,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

@@ -86,7 +85,7 @@ int main() {
    hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);

    // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
                    gpuMatrix, WIDTH);

@@ -21,8 +21,7 @@ In order to use the HIP framework, we need to add the "hip_runtime.h" header fil
 ## Device-side code
 We will work on device side code first, Here is simple example showing a snippet of HIP device side code:

-`__global__ void matrixTranspose(hipLaunchParm lp,            `
-`                                float *out,                  `
+`__global__ void matrixTranspose(float *out,                  `
 `                                float *in,                   `
 `                                const int width,             `
 `                                const int height)            `
@@ -41,11 +40,9 @@ other function-type qualifiers are:
 `__host__` can combine with `__device__`, in which case the function compiles for both the host and device. These functions cannot use the HIP grid coordinate functions (for example, "hipThreadIdx_x", will talk about it latter). A possible workaround is to pass the necessary coordinate info as an argument to the function.
 `__host__` cannot combine with `__global__`.

-`__global__` functions are often referred to as *kernels, and calling one is termed *launching the kernel*. 
+`__global__` functions are often referred to as *kernels*, and calling one is termed *launching the kernel*. 

-Next keyword is `void`. HIP `__global__` functions must have a `void` return type, and the first parameter to a HIP `__global__` function must have the type `hipLaunchParm`, which is for execution configuration. Global functions require the caller to specify an "execution configuration" that includes the grid and block dimensions. The execution configuration can also include other information for the launch, such as the amount of additional shared memory to allocate and the stream where the kernel should execute.
-
-After `hipLaunchParm`, Kernel arguments follows next(i.e.,  `float *out, float *in, const int width, const int height`).
+Next keyword is `void`. HIP `__global__` functions must have a `void` return type. Global functions require the caller to specify an "execution configuration" that includes the grid and block dimensions. The execution configuration can also include other information for the launch, such as the amount of additional shared memory to allocate and the stream where the kernel should execute.

 The kernel function begins with 
 `    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;`
@@ -63,15 +60,15 @@ We allocated memory to the Matrix on host side by using malloc and initiallized
 here the first parameter is the destination pointer, second is the source pointer, third is the size of memory copy and the last specify the direction on memory copy(which is in this case froom host to device). While in order to transfer memory from device to host, use `hipMemcpyDeviceToHost` and for device to device memory copy use `hipMemcpyDeviceToDevice`.

 Now, we'll see how to launch the kernel.
-`  hipLaunchKernel(matrixTranspose,                                                         `
+`  hipLaunchKernelGGL(matrixTranspose,                                                      `
 `                  dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),             `
 `                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),                          `
 `                  0, 0,                                                                    `
 `                  gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT);                          `

 HIP introduces a standard C++ calling convention to pass the execution configuration to the kernel (this convention replaces the `Cuda <<< >>>` syntax). In HIP,
- Kernels launch with the `"hipLaunchKernel"` function
- The first five parameters to hipLaunchKernel are the following:
+- Kernels launch with the `"hipLaunchKernelGGL"` function
+- The first five parameters to hipLaunchKernelGGL are the following:
   - **symbol kernelName**: the name of the kernel to launch.  To support template kernels which contains "," use the HIP_KERNEL_NAME macro. In current application it's "matrixTranspose".
   - **dim3 gridDim**: 3D-grid dimensions specifying the number of blocks to launch. In MatrixTranspose sample, it's "dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y)". 
   - **dim3 blockDim**: 3D-block dimensions specifying the number of threads in each block.In MatrixTranspose sample, it's "dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y)". 
@@ -34,8 +34,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

@@ -103,7 +102,7 @@ int main() {
    hipEventRecord(start, NULL);

    // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
                    gpuMatrix, WIDTH);

@@ -36,8 +36,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

@@ -86,7 +85,7 @@ int main() {
    hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);

    // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
                    gpuMatrix, WIDTH);

@@ -41,7 +41,7 @@ Now, we'll have the operation for which we need to compute the time taken. For t
 `  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);`

 and for kernel execution time we'll use `hipKernelLaunch`:
-`  hipLaunchKernel(matrixTranspose,                                                    `
+`  hipLaunchKernelGGL(matrixTranspose,                                                    `
 `                  dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),        `
 `                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),                     `
 `                  0, 0,                                                               `
@@ -34,8 +34,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

@@ -103,7 +102,7 @@ int main() {
    hipEventRecord(start, NULL);

    // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
                    gpuMatrix, WIDTH);

@@ -41,8 +41,7 @@ int startTriggerIteration = -1;
 int stopTriggerIteration = -1;

 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

@@ -98,7 +97,7 @@ void runGPU(float* Matrix, float* TransposeMatrix, float* gpuMatrix, float* gpuT
        hipEventRecord(start, NULL);

        // Lauching kernel from host
-        hipLaunchKernel(matrixTranspose,
+        hipLaunchKernelGGL(matrixTranspose,
                        dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                        dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
                        gpuMatrix, WIDTH);
@@ -35,8 +35,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
    __shared__ float sharedMem[WIDTH * WIDTH];

    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
@@ -91,7 +90,7 @@ int main() {
    hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);

    // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
                    gpuMatrix, WIDTH);

@@ -35,8 +35,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;

    float val = in[x];
@@ -88,7 +87,7 @@ int main() {
    hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);

    // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(1), dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y), 0, 0,
+    hipLaunchKernelGGL(matrixTranspose, dim3(1), dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y), 0, 0,
                    gpuTransposeMatrix, gpuMatrix, WIDTH);

    // Memory transfer from device to host
@@ -35,8 +35,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
    float val = in[y * width + x];
@@ -86,7 +85,7 @@ int main() {
    hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);

    // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(1), dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0,
+    hipLaunchKernelGGL(matrixTranspose, dim3(1), dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0,
                    gpuTransposeMatrix, gpuMatrix, WIDTH);

    // Memory transfer from device to host
@@ -25,7 +25,7 @@ Shared memory is way more faster than that of global and constant memory and acc
 here the first parameter is the data type while the second one is the variable name.

 The other important change is:
-`  hipLaunchKernel(matrixTranspose,                                                   `
+`  hipLaunchKernelGGL(matrixTranspose,                                                   `
                  dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
                  sizeof(float)*WIDTH*WIDTH, 0,
@@ -33,8 +33,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
    // declare dynamic shared memory
    HIP_DYNAMIC_SHARED(float, sharedMem);

@@ -90,7 +89,7 @@ int main() {
    hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);

    // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), sizeof(float) * WIDTH * WIDTH,
                    0, gpuTransposeMatrix, gpuMatrix, WIDTH);

@@ -26,15 +26,15 @@ and create stream using `hipStreamCreate` as follows:
 `  for(int i=0;i<num_streams;i++)                                                `
 `  hipStreamCreate(&streams[i]);                                                 `

-and while kernel launch, we make the following changes in 5th parameter to hipLaunchKernel(having 0 as the default stream value):
+and while kernel launch, we make the following changes in 5th parameter to hipLaunchKernelGGL(having 0 as the default stream value):

-`  hipLaunchKernel(matrixTranspose_static_shared,                               `
+`  hipLaunchKernelGGL(matrixTranspose_static_shared,                             `
                    dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
                    0, streams[0],
                    gpuTransposeMatrix[0], data[0], width);

-`  hipLaunchKernel(matrixTranspose_dynamic_shared,                              `
+`  hipLaunchKernelGGL(matrixTranspose_dynamic_shared,                            `
                    dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
                    sizeof(float)*WIDTH*WIDTH, streams[1],
@@ -30,7 +30,7 @@ THE SOFTWARE.

 using namespace std;

-__global__ void matrixTranspose_static_shared(hipLaunchParm lp, float* out, float* in,
+__global__ void matrixTranspose_static_shared(float* out, float* in,
                                              const int width) {
    __shared__ float sharedMem[WIDTH * WIDTH];

@@ -44,7 +44,7 @@ __global__ void matrixTranspose_static_shared(hipLaunchParm lp, float* out, floa
    out[y * width + x] = sharedMem[y * width + x];
 }

-__global__ void matrixTranspose_dynamic_shared(hipLaunchParm lp, float* out, float* in,
+__global__ void matrixTranspose_dynamic_shared(float* out, float* in,
                                               const int width) {
    // declare dynamic shared memory
    HIP_DYNAMIC_SHARED(float, sharedMem)
@@ -71,12 +71,12 @@ void MultipleStream(float** data, float* randArray, float** gpuTransposeMatrix,
        hipMemcpyAsync(data[i], randArray, NUM * sizeof(float), hipMemcpyHostToDevice, streams[i]);
    }

-    hipLaunchKernel(matrixTranspose_static_shared,
+    hipLaunchKernelGGL(matrixTranspose_static_shared,
                    dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, streams[0],
                    gpuTransposeMatrix[0], data[0], width);

-    hipLaunchKernel(matrixTranspose_dynamic_shared,
+    hipLaunchKernelGGL(matrixTranspose_dynamic_shared,
                    dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), sizeof(float) * WIDTH * WIDTH,
                    streams[1], gpuTransposeMatrix[1], data[1], width);
@@ -105,7 +105,7 @@ void disablePeer2Peer(int currentGpu, int peerGpu) {
 }


-__global__ void matrixTranspose_static_shared(hipLaunchParm lp, float* out, float* in,
+__global__ void matrixTranspose_static_shared(float* out, float* in,
                                              const int width) {
    __shared__ float sharedMem[WIDTH * WIDTH];

@@ -119,7 +119,7 @@ __global__ void matrixTranspose_static_shared(hipLaunchParm lp, float* out, floa
    out[y * width + x] = sharedMem[y * width + x];
 }

-__global__ void matrixTranspose_dynamic_shared(hipLaunchParm lp, float* out, float* in,
+__global__ void matrixTranspose_dynamic_shared(float* out, float* in,
                                               const int width) {
    // declare dynamic shared memory
    HIP_DYNAMIC_SHARED(float, sharedMem)
@@ -170,7 +170,7 @@ int main() {
    hipMalloc((void**)&data[0], NUM * sizeof(float));
    hipMemcpy(data[0], randArray, NUM * sizeof(float), hipMemcpyHostToDevice);

-    hipLaunchKernel(matrixTranspose_static_shared,
+    hipLaunchKernelGGL(matrixTranspose_static_shared,
                    dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix[0],
                    data[0], width);
@@ -181,7 +181,7 @@ int main() {
    hipMalloc((void**)&data[1], NUM * sizeof(float));
    hipMemcpy(data[1], gpuTransposeMatrix[0], NUM * sizeof(float), hipMemcpyDeviceToDevice);

-    hipLaunchKernel(matrixTranspose_dynamic_shared,
+    hipLaunchKernelGGL(matrixTranspose_dynamic_shared,
                    dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), sizeof(float) * WIDTH * WIDTH,
                    0, gpuTransposeMatrix[1], data[1], width);
@@ -35,8 +35,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    float val = in[x];

@@ -88,7 +87,7 @@ int main() {
    hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);

    // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(1), dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y), 0, 0,
+    hipLaunchKernelGGL(matrixTranspose, dim3(1), dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y), 0, 0,
                    gpuTransposeMatrix, gpuMatrix, WIDTH);

    // Memory transfer from device to host