From bfceb147514bdfdfe0198207c9fbd531087a12cb Mon Sep 17 00:00:00 2001
From: Maneesh Gupta <maneesh.gupta@amd.com>
Date: Wed, 17 Oct 2018 12:01:44 +0530
Subject: [PATCH] Replace hipLaunchKernel -> hipLaunchKernelGGL

Change-Id: I4d99009e1199811d417becf1e1b934ec4d4e30be
---
 samples/0_Intro/bit_extract/bit_extract.cpp       |  4 ++--
 samples/0_Intro/hcc_dialects/vadd_hip.cpp         |  4 ++--
 samples/0_Intro/square/square.hipref.cpp          |  4 ++--
 samples/1_Utils/hipCommander/hipCommander.cpp     |  2 +-
 samples/1_Utils/hipCommander/nullkernel.hip.cpp   |  2 +-
 samples/1_Utils/hipCommander/testcase.cpp         |  4 ++--
 .../hipDispatchLatency/hipDispatchLatency.cpp     | 14 +++++++-------
 .../0_MatrixTranspose/MatrixTranspose.cpp         |  5 ++---
 samples/2_Cookbook/0_MatrixTranspose/Readme.md    | 15 ++++++---------
 samples/2_Cookbook/10_inline_asm/inline_asm.cpp   |  5 ++---
 .../MatrixTranspose.cpp                           |  5 ++---
 samples/2_Cookbook/1_hipEvent/Readme.md           |  2 +-
 samples/2_Cookbook/1_hipEvent/hipEvent.cpp        |  5 ++---
 samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp |  5 ++---
 .../2_Cookbook/3_shared_memory/sharedMemory.cpp   |  5 ++---
 samples/2_Cookbook/4_shfl/shfl.cpp                |  5 ++---
 samples/2_Cookbook/5_2dshfl/2dshfl.cpp            |  5 ++---
 samples/2_Cookbook/6_dynamic_shared/Readme.md     |  2 +-
 .../6_dynamic_shared/dynamic_shared.cpp           |  5 ++---
 samples/2_Cookbook/7_streams/Readme.md            |  6 +++---
 samples/2_Cookbook/7_streams/stream.cpp           |  8 ++++----
 samples/2_Cookbook/8_peer2peer/peer2peer.cpp      |  8 ++++----
 samples/2_Cookbook/9_unroll/unroll.cpp            |  5 ++---
 23 files changed, 56 insertions(+), 69 deletions(-)

diff --git a/samples/0_Intro/bit_extract/bit_extract.cpp b/samples/0_Intro/bit_extract/bit_extract.cpp
index be7c5b020f..ab7a4b35a6 100644
--- a/samples/0_Intro/bit_extract/bit_extract.cpp
+++ b/samples/0_Intro/bit_extract/bit_extract.cpp
@@ -38,7 +38,7 @@ THE SOFTWARE.
         }                                                                                          \
     }
 
-__global__ void bit_extract_kernel(hipLaunchParm lp, uint32_t* C_d, const uint32_t* A_d, size_t N) {
+__global__ void bit_extract_kernel(uint32_t* C_d, const uint32_t* A_d, size_t N) {
     size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
     size_t stride = hipBlockDim_x * hipGridDim_x;
 
@@ -85,7 +85,7 @@ int main(int argc, char* argv[]) {
     printf("info: launch 'bit_extract_kernel' \n");
     const unsigned blocks = 512;
     const unsigned threadsPerBlock = 256;
-    hipLaunchKernel(bit_extract_kernel, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);
+    hipLaunchKernelGGL(bit_extract_kernel, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);
 
     printf("info: copy Device2Host\n");
     CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
diff --git a/samples/0_Intro/hcc_dialects/vadd_hip.cpp b/samples/0_Intro/hcc_dialects/vadd_hip.cpp
index 5022ef823e..366ab17d99 100644
--- a/samples/0_Intro/hcc_dialects/vadd_hip.cpp
+++ b/samples/0_Intro/hcc_dialects/vadd_hip.cpp
@@ -22,7 +22,7 @@ THE SOFTWARE.
 
 #include "hip/hip_runtime.h"
 
-__global__ void vadd_hip(hipLaunchParm lp, const float* a, const float* b, float* c, int N) {
+__global__ void vadd_hip(const float* a, const float* b, float* c, int N) {
     int idx = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
 
     if (idx < N) {
@@ -60,7 +60,7 @@ int main(int argc, char* argv[]) {
     // Launch kernel onto default accelerator
     int blockSize = 256;                                      // pick arbitrary block size
     int blocks = (sizeElements + blockSize - 1) / blockSize;  // round up to launch enough blocks
-    hipLaunchKernel(vadd_hip, dim3(blocks), dim3(blockSize), 0, 0, A_d, B_d, C_d, sizeElements);
+    hipLaunchKernelGGL(vadd_hip, dim3(blocks), dim3(blockSize), 0, 0, A_d, B_d, C_d, sizeElements);
 
     // D2H Copy
     hipMemcpy(C_h, C_d, sizeBytes, hipMemcpyDeviceToHost);
diff --git a/samples/0_Intro/square/square.hipref.cpp b/samples/0_Intro/square/square.hipref.cpp
index cc8758bd5c..9bc41b59ab 100644
--- a/samples/0_Intro/square/square.hipref.cpp
+++ b/samples/0_Intro/square/square.hipref.cpp
@@ -37,7 +37,7 @@ THE SOFTWARE.
  * Square each element in the array A and write to array C.
  */
 template <typename T>
-__global__ void vector_square(hipLaunchParm lp, T* C_d, const T* A_d, size_t N) {
+__global__ void vector_square(T* C_d, const T* A_d, size_t N) {
     size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
     size_t stride = hipBlockDim_x * hipGridDim_x;
 
@@ -81,7 +81,7 @@ int main(int argc, char* argv[]) {
     const unsigned threadsPerBlock = 256;
 
     printf("info: launch 'vector_square' kernel\n");
-    hipLaunchKernel(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);
+    hipLaunchKernelGGL(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);
 
     printf("info: copy Device2Host\n");
     CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
diff --git a/samples/1_Utils/hipCommander/hipCommander.cpp b/samples/1_Utils/hipCommander/hipCommander.cpp
index 345c2d5006..457cdfa7d3 100644
--- a/samples/1_Utils/hipCommander/hipCommander.cpp
+++ b/samples/1_Utils/hipCommander/hipCommander.cpp
@@ -434,7 +434,7 @@ class KernelCommand : public Command {
 
         switch (_kind) {
             case Null:
-                hipLaunchKernel(NullKernel, dim3(gridX / groupX), dim3(gridX), 0, _stream, nullptr);
+                hipLaunchKernelGGL(NullKernel, dim3(gridX / groupX), dim3(gridX), 0, _stream, nullptr);
                 break;
             case VectorAdd:
                 assert(0);  // TODO
diff --git a/samples/1_Utils/hipCommander/nullkernel.hip.cpp b/samples/1_Utils/hipCommander/nullkernel.hip.cpp
index 410796c7e6..8016f109c7 100644
--- a/samples/1_Utils/hipCommander/nullkernel.hip.cpp
+++ b/samples/1_Utils/hipCommander/nullkernel.hip.cpp
@@ -1,6 +1,6 @@
 #include "hip/hip_runtime.h"
 
-extern "C" __global__ void NullKernel(hipLaunchParm lp, float* Ad) {
+extern "C" __global__ void NullKernel(float* Ad) {
     if (Ad) {
         Ad[0] = 42;
     }
diff --git a/samples/1_Utils/hipCommander/testcase.cpp b/samples/1_Utils/hipCommander/testcase.cpp
index 93ebcf40c1..9be1c0c644 100644
--- a/samples/1_Utils/hipCommander/testcase.cpp
+++ b/samples/1_Utils/hipCommander/testcase.cpp
@@ -3,7 +3,7 @@
 static const int BLOCKSIZEX = 32;
 static const int BLOCKSIZEY = 16;
 
-__global__ void fails(hipLaunchParm lp, float* pErrorI) {
+__global__ void fails(float* pErrorI) {
     if (pErrorI != 0) {
         pErrorI[0] = 1;
     }
@@ -14,5 +14,5 @@ int main() {
     dim3 threads(BLOCKSIZEX, BLOCKSIZEY);
     float error;
 
-    hipLaunchKernel(HIP_KERNEL_NAME(fails), blocks, threads, 0, 0, &error);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(fails), blocks, threads, 0, 0, &error);
 }
diff --git a/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp b/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp
index 7aa8fa4992..d2abd9023e 100644
--- a/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp
+++ b/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp
@@ -48,7 +48,7 @@ const unsigned p_tests = 0xfffffff;
 
 
 // HCC optimizes away fully NULL kernel calls, so run one that is nearly null:
-__global__ void NearlyNull(hipLaunchParm lp, float* Ad) {
+__global__ void NearlyNull(float* Ad) {
     if (Ad) {
         Ad[0] = 42;
     }
@@ -94,14 +94,14 @@ int main() {
 
     if (p_tests & 0x1) {
         hipEventRecord(start);
-        hipLaunchKernel(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
+        hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
         stopTest(start, stop, "FirstKernelLaunch", 1);
     }
 
 
     if (p_tests & 0x2) {
         hipEventRecord(start);
-        hipLaunchKernel(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
+        hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
         stopTest(start, stop, "SecondKernelLaunch", 1);
     }
 
@@ -110,7 +110,7 @@ int main() {
         for (int t = 0; t < TEST_ITERS; t++) {
             hipEventRecord(start);
             for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
-                hipLaunchKernel(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
+                hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
                 hipEventRecord(sync);
                 hipEventSynchronize(sync);
             }
@@ -123,7 +123,7 @@ int main() {
         for (int t = 0; t < TEST_ITERS; t++) {
             hipEventRecord(start);
             for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
-                hipLaunchKernel(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream, Ad);
+                hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream, Ad);
                 hipEventRecord(sync);
                 hipEventSynchronize(sync);
             }
@@ -137,7 +137,7 @@ int main() {
         for (int t = 0; t < TEST_ITERS; t++) {
             hipEventRecord(start);
             for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
-                hipLaunchKernel(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
+                hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
             }
             stopTest(start, stop, "NullStreamASyncDispatchNoWait", DISPATCHES_PER_TEST);
         }
@@ -147,7 +147,7 @@ int main() {
         for (int t = 0; t < TEST_ITERS; t++) {
             hipEventRecord(start);
             for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
-                hipLaunchKernel(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream, Ad);
+                hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream, Ad);
             }
             stopTest(start, stop, "StreamASyncDispatchNoWait", DISPATCHES_PER_TEST);
         }
diff --git a/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp b/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp
index bd56dbe992..79fd72bcf9 100644
--- a/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp
+++ b/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp
@@ -36,8 +36,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1
 
 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
     int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
     int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
 
@@ -86,7 +85,7 @@ int main() {
     hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);
 
     // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
                     gpuMatrix, WIDTH);
 
diff --git a/samples/2_Cookbook/0_MatrixTranspose/Readme.md b/samples/2_Cookbook/0_MatrixTranspose/Readme.md
index ab5dbdc958..4a52b862a0 100644
--- a/samples/2_Cookbook/0_MatrixTranspose/Readme.md
+++ b/samples/2_Cookbook/0_MatrixTranspose/Readme.md
@@ -21,8 +21,7 @@ In order to use the HIP framework, we need to add the "hip_runtime.h" header fil
 ## Device-side code
 We will work on device side code first, Here is simple example showing a snippet of HIP device side code:
 
-`__global__ void matrixTranspose(hipLaunchParm lp,            `
-`                                float *out,                  `
+`__global__ void matrixTranspose(float *out,                  `
 `                                float *in,                   `
 `                                const int width,             `
 `                                const int height)            `
@@ -41,11 +40,9 @@ other function-type qualifiers are:
 `__host__` can combine with `__device__`, in which case the function compiles for both the host and device. These functions cannot use the HIP grid coordinate functions (for example, "hipThreadIdx_x", will talk about it latter). A possible workaround is to pass the necessary coordinate info as an argument to the function.
 `__host__` cannot combine with `__global__`.
 
-`__global__` functions are often referred to as *kernels, and calling one is termed *launching the kernel*. 
+`__global__` functions are often referred to as *kernels*, and calling one is termed *launching the kernel*. 
 
-Next keyword is `void`. HIP `__global__` functions must have a `void` return type, and the first parameter to a HIP `__global__` function must have the type `hipLaunchParm`, which is for execution configuration. Global functions require the caller to specify an "execution configuration" that includes the grid and block dimensions. The execution configuration can also include other information for the launch, such as the amount of additional shared memory to allocate and the stream where the kernel should execute.
-
-After `hipLaunchParm`, Kernel arguments follows next(i.e.,  `float *out, float *in, const int width, const int height`).
+Next keyword is `void`. HIP `__global__` functions must have a `void` return type. Global functions require the caller to specify an "execution configuration" that includes the grid and block dimensions. The execution configuration can also include other information for the launch, such as the amount of additional shared memory to allocate and the stream where the kernel should execute.
 
 The kernel function begins with 
 `    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;`
@@ -63,15 +60,15 @@ We allocated memory to the Matrix on host side by using malloc and initiallized
 here the first parameter is the destination pointer, second is the source pointer, third is the size of memory copy and the last specify the direction on memory copy(which is in this case froom host to device). While in order to transfer memory from device to host, use `hipMemcpyDeviceToHost` and for device to device memory copy use `hipMemcpyDeviceToDevice`.
 
 Now, we'll see how to launch the kernel.
-`  hipLaunchKernel(matrixTranspose,                                                         `
+`  hipLaunchKernelGGL(matrixTranspose,                                                      `
 `                  dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),             `
 `                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),                          `
 `                  0, 0,                                                                    `
 `                  gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT);                          `
 
 HIP introduces a standard C++ calling convention to pass the execution configuration to the kernel (this convention replaces the `Cuda <<< >>>` syntax). In HIP,
-- Kernels launch with the `"hipLaunchKernel"` function
-- The first five parameters to hipLaunchKernel are the following:
+- Kernels launch with the `"hipLaunchKernelGGL"` function
+- The first five parameters to hipLaunchKernelGGL are the following:
    - **symbol kernelName**: the name of the kernel to launch.  To support template kernels which contains "," use the HIP_KERNEL_NAME macro. In current application it's "matrixTranspose".
    - **dim3 gridDim**: 3D-grid dimensions specifying the number of blocks to launch. In MatrixTranspose sample, it's "dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y)". 
    - **dim3 blockDim**: 3D-block dimensions specifying the number of threads in each block.In MatrixTranspose sample, it's "dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y)". 
diff --git a/samples/2_Cookbook/10_inline_asm/inline_asm.cpp b/samples/2_Cookbook/10_inline_asm/inline_asm.cpp
index d9aee9c0a8..c47486e91a 100644
--- a/samples/2_Cookbook/10_inline_asm/inline_asm.cpp
+++ b/samples/2_Cookbook/10_inline_asm/inline_asm.cpp
@@ -34,8 +34,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1
 
 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
     int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
     int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
 
@@ -103,7 +102,7 @@ int main() {
     hipEventRecord(start, NULL);
 
     // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
                     gpuMatrix, WIDTH);
 
diff --git a/samples/2_Cookbook/12_cmake_hip_add_executable/MatrixTranspose.cpp b/samples/2_Cookbook/12_cmake_hip_add_executable/MatrixTranspose.cpp
index bd56dbe992..79fd72bcf9 100644
--- a/samples/2_Cookbook/12_cmake_hip_add_executable/MatrixTranspose.cpp
+++ b/samples/2_Cookbook/12_cmake_hip_add_executable/MatrixTranspose.cpp
@@ -36,8 +36,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1
 
 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
     int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
     int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
 
@@ -86,7 +85,7 @@ int main() {
     hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);
 
     // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
                     gpuMatrix, WIDTH);
 
diff --git a/samples/2_Cookbook/1_hipEvent/Readme.md b/samples/2_Cookbook/1_hipEvent/Readme.md
index ea4f3a67e9..7f276f628f 100644
--- a/samples/2_Cookbook/1_hipEvent/Readme.md
+++ b/samples/2_Cookbook/1_hipEvent/Readme.md
@@ -41,7 +41,7 @@ Now, we'll have the operation for which we need to compute the time taken. For t
 `  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);`
 
 and for kernel execution time we'll use `hipKernelLaunch`:
-`  hipLaunchKernel(matrixTranspose,                                                    `
+`  hipLaunchKernelGGL(matrixTranspose,                                                    `
 `                  dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),        `
 `                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),                     `
 `                  0, 0,                                                               `
diff --git a/samples/2_Cookbook/1_hipEvent/hipEvent.cpp b/samples/2_Cookbook/1_hipEvent/hipEvent.cpp
index e9d4e4dd7a..6e778d3b2b 100644
--- a/samples/2_Cookbook/1_hipEvent/hipEvent.cpp
+++ b/samples/2_Cookbook/1_hipEvent/hipEvent.cpp
@@ -34,8 +34,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1
 
 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
     int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
     int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
 
@@ -103,7 +102,7 @@ int main() {
     hipEventRecord(start, NULL);
 
     // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
                     gpuMatrix, WIDTH);
 
diff --git a/samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp b/samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp
index edf4fe2f51..94174529c1 100644
--- a/samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp
+++ b/samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp
@@ -41,8 +41,7 @@ int startTriggerIteration = -1;
 int stopTriggerIteration = -1;
 
 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
     int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
     int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
 
@@ -98,7 +97,7 @@ void runGPU(float* Matrix, float* TransposeMatrix, float* gpuMatrix, float* gpuT
         hipEventRecord(start, NULL);
 
         // Lauching kernel from host
-        hipLaunchKernel(matrixTranspose,
+        hipLaunchKernelGGL(matrixTranspose,
                         dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                         dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
                         gpuMatrix, WIDTH);
diff --git a/samples/2_Cookbook/3_shared_memory/sharedMemory.cpp b/samples/2_Cookbook/3_shared_memory/sharedMemory.cpp
index d2b6758757..e5f2d1f62c 100644
--- a/samples/2_Cookbook/3_shared_memory/sharedMemory.cpp
+++ b/samples/2_Cookbook/3_shared_memory/sharedMemory.cpp
@@ -35,8 +35,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1
 
 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
     __shared__ float sharedMem[WIDTH * WIDTH];
 
     int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
@@ -91,7 +90,7 @@ int main() {
     hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);
 
     // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
                     gpuMatrix, WIDTH);
 
diff --git a/samples/2_Cookbook/4_shfl/shfl.cpp b/samples/2_Cookbook/4_shfl/shfl.cpp
index c48065ab27..c0223b8156 100644
--- a/samples/2_Cookbook/4_shfl/shfl.cpp
+++ b/samples/2_Cookbook/4_shfl/shfl.cpp
@@ -35,8 +35,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1
 
 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
     int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
 
     float val = in[x];
@@ -88,7 +87,7 @@ int main() {
     hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);
 
     // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(1), dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y), 0, 0,
+    hipLaunchKernelGGL(matrixTranspose, dim3(1), dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y), 0, 0,
                     gpuTransposeMatrix, gpuMatrix, WIDTH);
 
     // Memory transfer from device to host
diff --git a/samples/2_Cookbook/5_2dshfl/2dshfl.cpp b/samples/2_Cookbook/5_2dshfl/2dshfl.cpp
index 15201e9ace..9e267803ca 100644
--- a/samples/2_Cookbook/5_2dshfl/2dshfl.cpp
+++ b/samples/2_Cookbook/5_2dshfl/2dshfl.cpp
@@ -35,8 +35,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1
 
 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
     int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
     int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
     float val = in[y * width + x];
@@ -86,7 +85,7 @@ int main() {
     hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);
 
     // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(1), dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0,
+    hipLaunchKernelGGL(matrixTranspose, dim3(1), dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0,
                     gpuTransposeMatrix, gpuMatrix, WIDTH);
 
     // Memory transfer from device to host
diff --git a/samples/2_Cookbook/6_dynamic_shared/Readme.md b/samples/2_Cookbook/6_dynamic_shared/Readme.md
index 15ea299a9c..9622ac85fe 100644
--- a/samples/2_Cookbook/6_dynamic_shared/Readme.md
+++ b/samples/2_Cookbook/6_dynamic_shared/Readme.md
@@ -25,7 +25,7 @@ Shared memory is way more faster than that of global and constant memory and acc
 here the first parameter is the data type while the second one is the variable name.
 
 The other important change is:
-`  hipLaunchKernel(matrixTranspose,                                                   `
+`  hipLaunchKernelGGL(matrixTranspose,                                                   `
                   dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
                   dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
                   sizeof(float)*WIDTH*WIDTH, 0,
diff --git a/samples/2_Cookbook/6_dynamic_shared/dynamic_shared.cpp b/samples/2_Cookbook/6_dynamic_shared/dynamic_shared.cpp
index 5462f89dca..47df51f82b 100644
--- a/samples/2_Cookbook/6_dynamic_shared/dynamic_shared.cpp
+++ b/samples/2_Cookbook/6_dynamic_shared/dynamic_shared.cpp
@@ -33,8 +33,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1
 
 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
     // declare dynamic shared memory
     HIP_DYNAMIC_SHARED(float, sharedMem);
 
@@ -90,7 +89,7 @@ int main() {
     hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);
 
     // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), sizeof(float) * WIDTH * WIDTH,
                     0, gpuTransposeMatrix, gpuMatrix, WIDTH);
 
diff --git a/samples/2_Cookbook/7_streams/Readme.md b/samples/2_Cookbook/7_streams/Readme.md
index ca295d3f49..456deada1f 100644
--- a/samples/2_Cookbook/7_streams/Readme.md
+++ b/samples/2_Cookbook/7_streams/Readme.md
@@ -26,15 +26,15 @@ and create stream using `hipStreamCreate` as follows:
 `  for(int i=0;i<num_streams;i++)                                                `
 `  hipStreamCreate(&streams[i]);                                                 `
 
-and while kernel launch, we make the following changes in 5th parameter to hipLaunchKernel(having 0 as the default stream value):
+and while kernel launch, we make the following changes in 5th parameter to hipLaunchKernelGGL(having 0 as the default stream value):
 
-`  hipLaunchKernel(matrixTranspose_static_shared,                               `
+`  hipLaunchKernelGGL(matrixTranspose_static_shared,                             `
                     dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
                     0, streams[0],
                     gpuTransposeMatrix[0], data[0], width);
 
-`  hipLaunchKernel(matrixTranspose_dynamic_shared,                              `
+`  hipLaunchKernelGGL(matrixTranspose_dynamic_shared,                            `
                     dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
                     sizeof(float)*WIDTH*WIDTH, streams[1],
diff --git a/samples/2_Cookbook/7_streams/stream.cpp b/samples/2_Cookbook/7_streams/stream.cpp
index eb19aa9e58..69983630ef 100644
--- a/samples/2_Cookbook/7_streams/stream.cpp
+++ b/samples/2_Cookbook/7_streams/stream.cpp
@@ -30,7 +30,7 @@ THE SOFTWARE.
 
 using namespace std;
 
-__global__ void matrixTranspose_static_shared(hipLaunchParm lp, float* out, float* in,
+__global__ void matrixTranspose_static_shared(float* out, float* in,
                                               const int width) {
     __shared__ float sharedMem[WIDTH * WIDTH];
 
@@ -44,7 +44,7 @@ __global__ void matrixTranspose_static_shared(hipLaunchParm lp, float* out, floa
     out[y * width + x] = sharedMem[y * width + x];
 }
 
-__global__ void matrixTranspose_dynamic_shared(hipLaunchParm lp, float* out, float* in,
+__global__ void matrixTranspose_dynamic_shared(float* out, float* in,
                                                const int width) {
     // declare dynamic shared memory
     HIP_DYNAMIC_SHARED(float, sharedMem)
@@ -71,12 +71,12 @@ void MultipleStream(float** data, float* randArray, float** gpuTransposeMatrix,
         hipMemcpyAsync(data[i], randArray, NUM * sizeof(float), hipMemcpyHostToDevice, streams[i]);
     }
 
-    hipLaunchKernel(matrixTranspose_static_shared,
+    hipLaunchKernelGGL(matrixTranspose_static_shared,
                     dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, streams[0],
                     gpuTransposeMatrix[0], data[0], width);
 
-    hipLaunchKernel(matrixTranspose_dynamic_shared,
+    hipLaunchKernelGGL(matrixTranspose_dynamic_shared,
                     dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), sizeof(float) * WIDTH * WIDTH,
                     streams[1], gpuTransposeMatrix[1], data[1], width);
diff --git a/samples/2_Cookbook/8_peer2peer/peer2peer.cpp b/samples/2_Cookbook/8_peer2peer/peer2peer.cpp
index 898d308a9e..1870c016a0 100644
--- a/samples/2_Cookbook/8_peer2peer/peer2peer.cpp
+++ b/samples/2_Cookbook/8_peer2peer/peer2peer.cpp
@@ -105,7 +105,7 @@ void disablePeer2Peer(int currentGpu, int peerGpu) {
 }
 
 
-__global__ void matrixTranspose_static_shared(hipLaunchParm lp, float* out, float* in,
+__global__ void matrixTranspose_static_shared(float* out, float* in,
                                               const int width) {
     __shared__ float sharedMem[WIDTH * WIDTH];
 
@@ -119,7 +119,7 @@ __global__ void matrixTranspose_static_shared(hipLaunchParm lp, float* out, floa
     out[y * width + x] = sharedMem[y * width + x];
 }
 
-__global__ void matrixTranspose_dynamic_shared(hipLaunchParm lp, float* out, float* in,
+__global__ void matrixTranspose_dynamic_shared(float* out, float* in,
                                                const int width) {
     // declare dynamic shared memory
     HIP_DYNAMIC_SHARED(float, sharedMem)
@@ -170,7 +170,7 @@ int main() {
     hipMalloc((void**)&data[0], NUM * sizeof(float));
     hipMemcpy(data[0], randArray, NUM * sizeof(float), hipMemcpyHostToDevice);
 
-    hipLaunchKernel(matrixTranspose_static_shared,
+    hipLaunchKernelGGL(matrixTranspose_static_shared,
                     dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix[0],
                     data[0], width);
@@ -181,7 +181,7 @@ int main() {
     hipMalloc((void**)&data[1], NUM * sizeof(float));
     hipMemcpy(data[1], gpuTransposeMatrix[0], NUM * sizeof(float), hipMemcpyDeviceToDevice);
 
-    hipLaunchKernel(matrixTranspose_dynamic_shared,
+    hipLaunchKernelGGL(matrixTranspose_dynamic_shared,
                     dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), sizeof(float) * WIDTH * WIDTH,
                     0, gpuTransposeMatrix[1], data[1], width);
diff --git a/samples/2_Cookbook/9_unroll/unroll.cpp b/samples/2_Cookbook/9_unroll/unroll.cpp
index 512a6d3f22..cd32feff54 100644
--- a/samples/2_Cookbook/9_unroll/unroll.cpp
+++ b/samples/2_Cookbook/9_unroll/unroll.cpp
@@ -35,8 +35,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1
 
 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
     int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
     float val = in[x];
 
@@ -88,7 +87,7 @@ int main() {
     hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);
 
     // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(1), dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y), 0, 0,
+    hipLaunchKernelGGL(matrixTranspose, dim3(1), dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y), 0, 0,
                     gpuTransposeMatrix, gpuMatrix, WIDTH);
 
     // Memory transfer from device to host