From 52e320f3963939ecfafd92c8160aa516110f7786 Mon Sep 17 00:00:00 2001
From: Maneesh Gupta <maneesh.gupta@amd.com>
Date: Wed, 17 Oct 2018 12:01:44 +0530
Subject: [PATCH] Replace hipLaunchKernel -> hipLaunchKernelGGL

Change-Id: I4d99009e1199811d417becf1e1b934ec4d4e30be
---
 samples/0_Intro/bit_extract/bit_extract.cpp   |  4 +-
 samples/0_Intro/hcc_dialects/vadd_hip.cpp     |  4 +-
 samples/0_Intro/square/square.hipref.cpp      |  4 +-
 samples/1_Utils/hipCommander/hipCommander.cpp |  2 +-
 .../1_Utils/hipCommander/nullkernel.hip.cpp   |  2 +-
 samples/1_Utils/hipCommander/testcase.cpp     |  4 +-
 .../hipDispatchLatency/hipDispatchLatency.cpp | 14 ++---
 .../0_MatrixTranspose/MatrixTranspose.cpp     |  5 +-
 .../2_Cookbook/0_MatrixTranspose/Readme.md    | 15 ++---
 .../2_Cookbook/10_inline_asm/inline_asm.cpp   |  5 +-
 .../MatrixTranspose.cpp                       |  5 +-
 samples/2_Cookbook/1_hipEvent/Readme.md       |  2 +-
 samples/2_Cookbook/1_hipEvent/hipEvent.cpp    |  5 +-
 .../2_Cookbook/2_Profiler/MatrixTranspose.cpp |  5 +-
 .../3_shared_memory/sharedMemory.cpp          |  5 +-
 samples/2_Cookbook/4_shfl/shfl.cpp            |  5 +-
 samples/2_Cookbook/5_2dshfl/2dshfl.cpp        |  5 +-
 samples/2_Cookbook/6_dynamic_shared/Readme.md |  2 +-
 .../6_dynamic_shared/dynamic_shared.cpp       |  5 +-
 samples/2_Cookbook/7_streams/Readme.md        |  6 +-
 samples/2_Cookbook/7_streams/stream.cpp       |  8 +--
 samples/2_Cookbook/8_peer2peer/peer2peer.cpp  |  8 +--
 samples/2_Cookbook/9_unroll/unroll.cpp        |  5 +-
 .../device/hipFuncDeviceSynchronize.cpp       |  4 +-
 tests/src/deviceLib/hipDeviceMemcpy.cpp       |  8 +--
 .../hipDoublePrecisionIntrinsics.cpp          |  4 +-
 tests/src/deviceLib/hipFloatMath.cpp          |  4 +-
 tests/src/deviceLib/hipIntegerIntrinsics.cpp  |  4 +-
 tests/src/deviceLib/hipMathFunctions.cpp      |  8 +--
 .../hipSinglePrecisionIntrinsics.cpp          |  4 +-
 .../hipSinglePrecisionMathDevice.cpp          |  4 +-
 tests/src/deviceLib/hipStdComplex.cpp         |  4 +-
 tests/src/deviceLib/hipTestDeviceDouble.cpp   | 56 +++++++++----------
 tests/src/deviceLib/hipTestDeviceSymbol.cpp   |  8 +--
 tests/src/deviceLib/hipThreadFence.cpp        |  4 +-
 tests/src/deviceLib/hip_anyall.cpp            |  4 +-
 tests/src/deviceLib/hip_ballot.cpp            |  4 +-
 tests/src/deviceLib/hip_bitextract.cpp        |  5 +-
 tests/src/deviceLib/hip_bitinsert.cpp         |  4 +-
 tests/src/deviceLib/hip_brev.cpp              |  4 +-
 tests/src/deviceLib/hip_clz.cpp               |  4 +-
 tests/src/deviceLib/hip_ffs.cpp               |  4 +-
 tests/src/deviceLib/hip_mbcnt.cpp             |  4 +-
 tests/src/deviceLib/hip_popc.cpp              |  4 +-
 tests/src/deviceLib/hip_test_ldg.cpp          |  8 +--
 tests/src/deviceLib/hip_trig.cpp              |  4 +-
 tests/src/hipC.c                              |  4 +-
 tests/src/kernel/hipDynamicShared.cpp         |  4 +-
 tests/src/kernel/hipDynamicShared2.cpp        |  4 +-
 tests/src/kernel/hipEmptyKernel.cpp           |  4 +-
 tests/src/kernel/hipGridLaunch.cpp            |  4 +-
 tests/src/kernel/hipLanguageExtensions.cpp    | 14 ++---
 tests/src/kernel/hipLaunchParm.cpp            |  4 +-
 tests/src/kernel/hipPrintfKernel.cpp          |  4 +-
 tests/src/kernel/hipTestConstant.cpp          |  4 +-
 tests/src/kernel/hipTestMallocKernel.cpp      |  8 +--
 tests/src/kernel/hipTestMemKernel.cpp         | 40 ++++++-------
 tests/src/kernel/inline_asm_vadd.cpp          |  4 +-
 .../device/hipDeviceSynchronize.cpp           |  4 +-
 tests/src/runtimeApi/event/hipEventRecord.cpp |  2 +-
 tests/src/runtimeApi/memory/hipArray.cpp      |  6 +-
 .../src/runtimeApi/memory/hipHostGetFlags.cpp |  4 +-
 .../src/runtimeApi/memory/hipHostRegister.cpp |  4 +-
 tests/src/runtimeApi/memory/hipMemcpy.cpp     |  2 +-
 tests/src/runtimeApi/memory/hipMemcpyDtoD.cpp |  4 +-
 .../runtimeApi/memory/hipMemcpyDtoDAsync.cpp  |  4 +-
 tests/src/runtimeApi/memory/hipMemcpyPeer.cpp |  4 +-
 .../runtimeApi/memory/hipMemcpyPeerAsync.cpp  |  4 +-
 .../runtimeApi/memory/hipMemcpy_simple.cpp    |  2 +-
 .../memory/hipMemoryAllocateCoherent.cpp      |  4 +-
 .../multiThread/hipMultiThreadStreams1.cpp    |  4 +-
 .../multiThread/hipMultiThreadStreams2.cpp    |  8 +--
 .../runtimeApi/stream/hipAPIStreamDisable.cpp |  8 +--
 .../runtimeApi/stream/hipAPIStreamEnable.cpp  |  8 +--
 tests/src/runtimeApi/stream/hipNullStream.cpp |  8 +--
 tests/src/runtimeApi/stream/hipStream.h       |  2 +-
 tests/src/runtimeApi/stream/hipStreamL5.cpp   | 40 ++++++-------
 tests/src/test_common.h                       |  4 +-
 78 files changed, 246 insertions(+), 264 deletions(-)

diff --git a/samples/0_Intro/bit_extract/bit_extract.cpp b/samples/0_Intro/bit_extract/bit_extract.cpp
index be7c5b020f..ab7a4b35a6 100644
--- a/samples/0_Intro/bit_extract/bit_extract.cpp
+++ b/samples/0_Intro/bit_extract/bit_extract.cpp
@@ -38,7 +38,7 @@ THE SOFTWARE.
         }                                                                                          \
     }
 
-__global__ void bit_extract_kernel(hipLaunchParm lp, uint32_t* C_d, const uint32_t* A_d, size_t N) {
+__global__ void bit_extract_kernel(uint32_t* C_d, const uint32_t* A_d, size_t N) {
     size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
     size_t stride = hipBlockDim_x * hipGridDim_x;
 
@@ -85,7 +85,7 @@ int main(int argc, char* argv[]) {
     printf("info: launch 'bit_extract_kernel' \n");
     const unsigned blocks = 512;
     const unsigned threadsPerBlock = 256;
-    hipLaunchKernel(bit_extract_kernel, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);
+    hipLaunchKernelGGL(bit_extract_kernel, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);
 
     printf("info: copy Device2Host\n");
     CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
diff --git a/samples/0_Intro/hcc_dialects/vadd_hip.cpp b/samples/0_Intro/hcc_dialects/vadd_hip.cpp
index 5022ef823e..366ab17d99 100644
--- a/samples/0_Intro/hcc_dialects/vadd_hip.cpp
+++ b/samples/0_Intro/hcc_dialects/vadd_hip.cpp
@@ -22,7 +22,7 @@ THE SOFTWARE.
 
 #include "hip/hip_runtime.h"
 
-__global__ void vadd_hip(hipLaunchParm lp, const float* a, const float* b, float* c, int N) {
+__global__ void vadd_hip(const float* a, const float* b, float* c, int N) {
     int idx = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
 
     if (idx < N) {
@@ -60,7 +60,7 @@ int main(int argc, char* argv[]) {
     // Launch kernel onto default accelerator
     int blockSize = 256;                                      // pick arbitrary block size
     int blocks = (sizeElements + blockSize - 1) / blockSize;  // round up to launch enough blocks
-    hipLaunchKernel(vadd_hip, dim3(blocks), dim3(blockSize), 0, 0, A_d, B_d, C_d, sizeElements);
+    hipLaunchKernelGGL(vadd_hip, dim3(blocks), dim3(blockSize), 0, 0, A_d, B_d, C_d, sizeElements);
 
     // D2H Copy
     hipMemcpy(C_h, C_d, sizeBytes, hipMemcpyDeviceToHost);
diff --git a/samples/0_Intro/square/square.hipref.cpp b/samples/0_Intro/square/square.hipref.cpp
index cc8758bd5c..9bc41b59ab 100644
--- a/samples/0_Intro/square/square.hipref.cpp
+++ b/samples/0_Intro/square/square.hipref.cpp
@@ -37,7 +37,7 @@ THE SOFTWARE.
  * Square each element in the array A and write to array C.
  */
 template <typename T>
-__global__ void vector_square(hipLaunchParm lp, T* C_d, const T* A_d, size_t N) {
+__global__ void vector_square(T* C_d, const T* A_d, size_t N) {
     size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
     size_t stride = hipBlockDim_x * hipGridDim_x;
 
@@ -81,7 +81,7 @@ int main(int argc, char* argv[]) {
     const unsigned threadsPerBlock = 256;
 
     printf("info: launch 'vector_square' kernel\n");
-    hipLaunchKernel(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);
+    hipLaunchKernelGGL(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);
 
     printf("info: copy Device2Host\n");
     CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
diff --git a/samples/1_Utils/hipCommander/hipCommander.cpp b/samples/1_Utils/hipCommander/hipCommander.cpp
index 345c2d5006..457cdfa7d3 100644
--- a/samples/1_Utils/hipCommander/hipCommander.cpp
+++ b/samples/1_Utils/hipCommander/hipCommander.cpp
@@ -434,7 +434,7 @@ class KernelCommand : public Command {
 
         switch (_kind) {
             case Null:
-                hipLaunchKernel(NullKernel, dim3(gridX / groupX), dim3(gridX), 0, _stream, nullptr);
+                hipLaunchKernelGGL(NullKernel, dim3(gridX / groupX), dim3(gridX), 0, _stream, nullptr);
                 break;
             case VectorAdd:
                 assert(0);  // TODO
diff --git a/samples/1_Utils/hipCommander/nullkernel.hip.cpp b/samples/1_Utils/hipCommander/nullkernel.hip.cpp
index 410796c7e6..8016f109c7 100644
--- a/samples/1_Utils/hipCommander/nullkernel.hip.cpp
+++ b/samples/1_Utils/hipCommander/nullkernel.hip.cpp
@@ -1,6 +1,6 @@
 #include "hip/hip_runtime.h"
 
-extern "C" __global__ void NullKernel(hipLaunchParm lp, float* Ad) {
+extern "C" __global__ void NullKernel(float* Ad) {
     if (Ad) {
         Ad[0] = 42;
     }
diff --git a/samples/1_Utils/hipCommander/testcase.cpp b/samples/1_Utils/hipCommander/testcase.cpp
index 93ebcf40c1..9be1c0c644 100644
--- a/samples/1_Utils/hipCommander/testcase.cpp
+++ b/samples/1_Utils/hipCommander/testcase.cpp
@@ -3,7 +3,7 @@
 static const int BLOCKSIZEX = 32;
 static const int BLOCKSIZEY = 16;
 
-__global__ void fails(hipLaunchParm lp, float* pErrorI) {
+__global__ void fails(float* pErrorI) {
     if (pErrorI != 0) {
         pErrorI[0] = 1;
     }
@@ -14,5 +14,5 @@ int main() {
     dim3 threads(BLOCKSIZEX, BLOCKSIZEY);
     float error;
 
-    hipLaunchKernel(HIP_KERNEL_NAME(fails), blocks, threads, 0, 0, &error);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(fails), blocks, threads, 0, 0, &error);
 }
diff --git a/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp b/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp
index 7aa8fa4992..d2abd9023e 100644
--- a/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp
+++ b/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp
@@ -48,7 +48,7 @@ const unsigned p_tests = 0xfffffff;
 
 
 // HCC optimizes away fully NULL kernel calls, so run one that is nearly null:
-__global__ void NearlyNull(hipLaunchParm lp, float* Ad) {
+__global__ void NearlyNull(float* Ad) {
     if (Ad) {
         Ad[0] = 42;
     }
@@ -94,14 +94,14 @@ int main() {
 
     if (p_tests & 0x1) {
         hipEventRecord(start);
-        hipLaunchKernel(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
+        hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
         stopTest(start, stop, "FirstKernelLaunch", 1);
     }
 
 
     if (p_tests & 0x2) {
         hipEventRecord(start);
-        hipLaunchKernel(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
+        hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
         stopTest(start, stop, "SecondKernelLaunch", 1);
     }
 
@@ -110,7 +110,7 @@ int main() {
         for (int t = 0; t < TEST_ITERS; t++) {
             hipEventRecord(start);
             for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
-                hipLaunchKernel(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
+                hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
                 hipEventRecord(sync);
                 hipEventSynchronize(sync);
             }
@@ -123,7 +123,7 @@ int main() {
         for (int t = 0; t < TEST_ITERS; t++) {
             hipEventRecord(start);
             for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
-                hipLaunchKernel(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream, Ad);
+                hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream, Ad);
                 hipEventRecord(sync);
                 hipEventSynchronize(sync);
             }
@@ -137,7 +137,7 @@ int main() {
         for (int t = 0; t < TEST_ITERS; t++) {
             hipEventRecord(start);
             for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
-                hipLaunchKernel(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
+                hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
             }
             stopTest(start, stop, "NullStreamASyncDispatchNoWait", DISPATCHES_PER_TEST);
         }
@@ -147,7 +147,7 @@ int main() {
         for (int t = 0; t < TEST_ITERS; t++) {
             hipEventRecord(start);
             for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
-                hipLaunchKernel(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream, Ad);
+                hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream, Ad);
             }
             stopTest(start, stop, "StreamASyncDispatchNoWait", DISPATCHES_PER_TEST);
         }
diff --git a/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp b/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp
index bd56dbe992..79fd72bcf9 100644
--- a/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp
+++ b/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp
@@ -36,8 +36,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1
 
 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
     int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
     int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
 
@@ -86,7 +85,7 @@ int main() {
     hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);
 
     // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
                     gpuMatrix, WIDTH);
 
diff --git a/samples/2_Cookbook/0_MatrixTranspose/Readme.md b/samples/2_Cookbook/0_MatrixTranspose/Readme.md
index ab5dbdc958..4a52b862a0 100644
--- a/samples/2_Cookbook/0_MatrixTranspose/Readme.md
+++ b/samples/2_Cookbook/0_MatrixTranspose/Readme.md
@@ -21,8 +21,7 @@ In order to use the HIP framework, we need to add the "hip_runtime.h" header fil
 ## Device-side code
 We will work on device side code first, Here is simple example showing a snippet of HIP device side code:
 
-`__global__ void matrixTranspose(hipLaunchParm lp,            `
-`                                float *out,                  `
+`__global__ void matrixTranspose(float *out,                  `
 `                                float *in,                   `
 `                                const int width,             `
 `                                const int height)            `
@@ -41,11 +40,9 @@ other function-type qualifiers are:
 `__host__` can combine with `__device__`, in which case the function compiles for both the host and device. These functions cannot use the HIP grid coordinate functions (for example, "hipThreadIdx_x", will talk about it latter). A possible workaround is to pass the necessary coordinate info as an argument to the function.
 `__host__` cannot combine with `__global__`.
 
-`__global__` functions are often referred to as *kernels, and calling one is termed *launching the kernel*. 
+`__global__` functions are often referred to as *kernels*, and calling one is termed *launching the kernel*. 
 
-Next keyword is `void`. HIP `__global__` functions must have a `void` return type, and the first parameter to a HIP `__global__` function must have the type `hipLaunchParm`, which is for execution configuration. Global functions require the caller to specify an "execution configuration" that includes the grid and block dimensions. The execution configuration can also include other information for the launch, such as the amount of additional shared memory to allocate and the stream where the kernel should execute.
-
-After `hipLaunchParm`, Kernel arguments follows next(i.e.,  `float *out, float *in, const int width, const int height`).
+Next keyword is `void`. HIP `__global__` functions must have a `void` return type. Global functions require the caller to specify an "execution configuration" that includes the grid and block dimensions. The execution configuration can also include other information for the launch, such as the amount of additional shared memory to allocate and the stream where the kernel should execute.
 
 The kernel function begins with 
 `    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;`
@@ -63,15 +60,15 @@ We allocated memory to the Matrix on host side by using malloc and initiallized
 here the first parameter is the destination pointer, second is the source pointer, third is the size of memory copy and the last specify the direction on memory copy(which is in this case froom host to device). While in order to transfer memory from device to host, use `hipMemcpyDeviceToHost` and for device to device memory copy use `hipMemcpyDeviceToDevice`.
 
 Now, we'll see how to launch the kernel.
-`  hipLaunchKernel(matrixTranspose,                                                         `
+`  hipLaunchKernelGGL(matrixTranspose,                                                      `
 `                  dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),             `
 `                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),                          `
 `                  0, 0,                                                                    `
 `                  gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT);                          `
 
 HIP introduces a standard C++ calling convention to pass the execution configuration to the kernel (this convention replaces the `Cuda <<< >>>` syntax). In HIP,
-- Kernels launch with the `"hipLaunchKernel"` function
-- The first five parameters to hipLaunchKernel are the following:
+- Kernels launch with the `"hipLaunchKernelGGL"` function
+- The first five parameters to hipLaunchKernelGGL are the following:
    - **symbol kernelName**: the name of the kernel to launch.  To support template kernels which contains "," use the HIP_KERNEL_NAME macro. In current application it's "matrixTranspose".
    - **dim3 gridDim**: 3D-grid dimensions specifying the number of blocks to launch. In MatrixTranspose sample, it's "dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y)". 
    - **dim3 blockDim**: 3D-block dimensions specifying the number of threads in each block.In MatrixTranspose sample, it's "dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y)". 
diff --git a/samples/2_Cookbook/10_inline_asm/inline_asm.cpp b/samples/2_Cookbook/10_inline_asm/inline_asm.cpp
index d9aee9c0a8..c47486e91a 100644
--- a/samples/2_Cookbook/10_inline_asm/inline_asm.cpp
+++ b/samples/2_Cookbook/10_inline_asm/inline_asm.cpp
@@ -34,8 +34,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1
 
 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
     int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
     int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
 
@@ -103,7 +102,7 @@ int main() {
     hipEventRecord(start, NULL);
 
     // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
                     gpuMatrix, WIDTH);
 
diff --git a/samples/2_Cookbook/12_cmake_hip_add_executable/MatrixTranspose.cpp b/samples/2_Cookbook/12_cmake_hip_add_executable/MatrixTranspose.cpp
index bd56dbe992..79fd72bcf9 100644
--- a/samples/2_Cookbook/12_cmake_hip_add_executable/MatrixTranspose.cpp
+++ b/samples/2_Cookbook/12_cmake_hip_add_executable/MatrixTranspose.cpp
@@ -36,8 +36,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1
 
 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
     int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
     int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
 
@@ -86,7 +85,7 @@ int main() {
     hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);
 
     // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
                     gpuMatrix, WIDTH);
 
diff --git a/samples/2_Cookbook/1_hipEvent/Readme.md b/samples/2_Cookbook/1_hipEvent/Readme.md
index ea4f3a67e9..7f276f628f 100644
--- a/samples/2_Cookbook/1_hipEvent/Readme.md
+++ b/samples/2_Cookbook/1_hipEvent/Readme.md
@@ -41,7 +41,7 @@ Now, we'll have the operation for which we need to compute the time taken. For t
 `  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);`
 
 and for kernel execution time we'll use `hipKernelLaunch`:
-`  hipLaunchKernel(matrixTranspose,                                                    `
+`  hipLaunchKernelGGL(matrixTranspose,                                                    `
 `                  dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),        `
 `                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),                     `
 `                  0, 0,                                                               `
diff --git a/samples/2_Cookbook/1_hipEvent/hipEvent.cpp b/samples/2_Cookbook/1_hipEvent/hipEvent.cpp
index e9d4e4dd7a..6e778d3b2b 100644
--- a/samples/2_Cookbook/1_hipEvent/hipEvent.cpp
+++ b/samples/2_Cookbook/1_hipEvent/hipEvent.cpp
@@ -34,8 +34,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1
 
 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
     int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
     int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
 
@@ -103,7 +102,7 @@ int main() {
     hipEventRecord(start, NULL);
 
     // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
                     gpuMatrix, WIDTH);
 
diff --git a/samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp b/samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp
index edf4fe2f51..94174529c1 100644
--- a/samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp
+++ b/samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp
@@ -41,8 +41,7 @@ int startTriggerIteration = -1;
 int stopTriggerIteration = -1;
 
 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
     int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
     int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
 
@@ -98,7 +97,7 @@ void runGPU(float* Matrix, float* TransposeMatrix, float* gpuMatrix, float* gpuT
         hipEventRecord(start, NULL);
 
         // Lauching kernel from host
-        hipLaunchKernel(matrixTranspose,
+        hipLaunchKernelGGL(matrixTranspose,
                         dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                         dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
                         gpuMatrix, WIDTH);
diff --git a/samples/2_Cookbook/3_shared_memory/sharedMemory.cpp b/samples/2_Cookbook/3_shared_memory/sharedMemory.cpp
index d2b6758757..e5f2d1f62c 100644
--- a/samples/2_Cookbook/3_shared_memory/sharedMemory.cpp
+++ b/samples/2_Cookbook/3_shared_memory/sharedMemory.cpp
@@ -35,8 +35,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1
 
 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
     __shared__ float sharedMem[WIDTH * WIDTH];
 
     int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
@@ -91,7 +90,7 @@ int main() {
     hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);
 
     // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
                     gpuMatrix, WIDTH);
 
diff --git a/samples/2_Cookbook/4_shfl/shfl.cpp b/samples/2_Cookbook/4_shfl/shfl.cpp
index c48065ab27..c0223b8156 100644
--- a/samples/2_Cookbook/4_shfl/shfl.cpp
+++ b/samples/2_Cookbook/4_shfl/shfl.cpp
@@ -35,8 +35,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1
 
 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
     int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
 
     float val = in[x];
@@ -88,7 +87,7 @@ int main() {
     hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);
 
     // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(1), dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y), 0, 0,
+    hipLaunchKernelGGL(matrixTranspose, dim3(1), dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y), 0, 0,
                     gpuTransposeMatrix, gpuMatrix, WIDTH);
 
     // Memory transfer from device to host
diff --git a/samples/2_Cookbook/5_2dshfl/2dshfl.cpp b/samples/2_Cookbook/5_2dshfl/2dshfl.cpp
index 15201e9ace..9e267803ca 100644
--- a/samples/2_Cookbook/5_2dshfl/2dshfl.cpp
+++ b/samples/2_Cookbook/5_2dshfl/2dshfl.cpp
@@ -35,8 +35,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1
 
 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
     int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
     int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
     float val = in[y * width + x];
@@ -86,7 +85,7 @@ int main() {
     hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);
 
     // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(1), dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0,
+    hipLaunchKernelGGL(matrixTranspose, dim3(1), dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0,
                     gpuTransposeMatrix, gpuMatrix, WIDTH);
 
     // Memory transfer from device to host
diff --git a/samples/2_Cookbook/6_dynamic_shared/Readme.md b/samples/2_Cookbook/6_dynamic_shared/Readme.md
index 15ea299a9c..9622ac85fe 100644
--- a/samples/2_Cookbook/6_dynamic_shared/Readme.md
+++ b/samples/2_Cookbook/6_dynamic_shared/Readme.md
@@ -25,7 +25,7 @@ Shared memory is way more faster than that of global and constant memory and acc
 here the first parameter is the data type while the second one is the variable name.
 
 The other important change is:
-`  hipLaunchKernel(matrixTranspose,                                                   `
+`  hipLaunchKernelGGL(matrixTranspose,                                                   `
                   dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
                   dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
                   sizeof(float)*WIDTH*WIDTH, 0,
diff --git a/samples/2_Cookbook/6_dynamic_shared/dynamic_shared.cpp b/samples/2_Cookbook/6_dynamic_shared/dynamic_shared.cpp
index 5462f89dca..47df51f82b 100644
--- a/samples/2_Cookbook/6_dynamic_shared/dynamic_shared.cpp
+++ b/samples/2_Cookbook/6_dynamic_shared/dynamic_shared.cpp
@@ -33,8 +33,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1
 
 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
     // declare dynamic shared memory
     HIP_DYNAMIC_SHARED(float, sharedMem);
 
@@ -90,7 +89,7 @@ int main() {
     hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);
 
     // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), sizeof(float) * WIDTH * WIDTH,
                     0, gpuTransposeMatrix, gpuMatrix, WIDTH);
 
diff --git a/samples/2_Cookbook/7_streams/Readme.md b/samples/2_Cookbook/7_streams/Readme.md
index ca295d3f49..456deada1f 100644
--- a/samples/2_Cookbook/7_streams/Readme.md
+++ b/samples/2_Cookbook/7_streams/Readme.md
@@ -26,15 +26,15 @@ and create stream using `hipStreamCreate` as follows:
 `  for(int i=0;i<num_streams;i++)                                                `
 `  hipStreamCreate(&streams[i]);                                                 `
 
-and while kernel launch, we make the following changes in 5th parameter to hipLaunchKernel(having 0 as the default stream value):
+and while kernel launch, we make the following changes in 5th parameter to hipLaunchKernelGGL(having 0 as the default stream value):
 
-`  hipLaunchKernel(matrixTranspose_static_shared,                               `
+`  hipLaunchKernelGGL(matrixTranspose_static_shared,                             `
                     dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
                     0, streams[0],
                     gpuTransposeMatrix[0], data[0], width);
 
-`  hipLaunchKernel(matrixTranspose_dynamic_shared,                              `
+`  hipLaunchKernelGGL(matrixTranspose_dynamic_shared,                            `
                     dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
                     sizeof(float)*WIDTH*WIDTH, streams[1],
diff --git a/samples/2_Cookbook/7_streams/stream.cpp b/samples/2_Cookbook/7_streams/stream.cpp
index eb19aa9e58..69983630ef 100644
--- a/samples/2_Cookbook/7_streams/stream.cpp
+++ b/samples/2_Cookbook/7_streams/stream.cpp
@@ -30,7 +30,7 @@ THE SOFTWARE.
 
 using namespace std;
 
-__global__ void matrixTranspose_static_shared(hipLaunchParm lp, float* out, float* in,
+__global__ void matrixTranspose_static_shared(float* out, float* in,
                                               const int width) {
     __shared__ float sharedMem[WIDTH * WIDTH];
 
@@ -44,7 +44,7 @@ __global__ void matrixTranspose_static_shared(hipLaunchParm lp, float* out, floa
     out[y * width + x] = sharedMem[y * width + x];
 }
 
-__global__ void matrixTranspose_dynamic_shared(hipLaunchParm lp, float* out, float* in,
+__global__ void matrixTranspose_dynamic_shared(float* out, float* in,
                                                const int width) {
     // declare dynamic shared memory
     HIP_DYNAMIC_SHARED(float, sharedMem)
@@ -71,12 +71,12 @@ void MultipleStream(float** data, float* randArray, float** gpuTransposeMatrix,
         hipMemcpyAsync(data[i], randArray, NUM * sizeof(float), hipMemcpyHostToDevice, streams[i]);
     }
 
-    hipLaunchKernel(matrixTranspose_static_shared,
+    hipLaunchKernelGGL(matrixTranspose_static_shared,
                     dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, streams[0],
                     gpuTransposeMatrix[0], data[0], width);
 
-    hipLaunchKernel(matrixTranspose_dynamic_shared,
+    hipLaunchKernelGGL(matrixTranspose_dynamic_shared,
                     dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), sizeof(float) * WIDTH * WIDTH,
                     streams[1], gpuTransposeMatrix[1], data[1], width);
diff --git a/samples/2_Cookbook/8_peer2peer/peer2peer.cpp b/samples/2_Cookbook/8_peer2peer/peer2peer.cpp
index 898d308a9e..1870c016a0 100644
--- a/samples/2_Cookbook/8_peer2peer/peer2peer.cpp
+++ b/samples/2_Cookbook/8_peer2peer/peer2peer.cpp
@@ -105,7 +105,7 @@ void disablePeer2Peer(int currentGpu, int peerGpu) {
 }
 
 
-__global__ void matrixTranspose_static_shared(hipLaunchParm lp, float* out, float* in,
+__global__ void matrixTranspose_static_shared(float* out, float* in,
                                               const int width) {
     __shared__ float sharedMem[WIDTH * WIDTH];
 
@@ -119,7 +119,7 @@ __global__ void matrixTranspose_static_shared(hipLaunchParm lp, float* out, floa
     out[y * width + x] = sharedMem[y * width + x];
 }
 
-__global__ void matrixTranspose_dynamic_shared(hipLaunchParm lp, float* out, float* in,
+__global__ void matrixTranspose_dynamic_shared(float* out, float* in,
                                                const int width) {
     // declare dynamic shared memory
     HIP_DYNAMIC_SHARED(float, sharedMem)
@@ -170,7 +170,7 @@ int main() {
     hipMalloc((void**)&data[0], NUM * sizeof(float));
     hipMemcpy(data[0], randArray, NUM * sizeof(float), hipMemcpyHostToDevice);
 
-    hipLaunchKernel(matrixTranspose_static_shared,
+    hipLaunchKernelGGL(matrixTranspose_static_shared,
                     dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix[0],
                     data[0], width);
@@ -181,7 +181,7 @@ int main() {
     hipMalloc((void**)&data[1], NUM * sizeof(float));
     hipMemcpy(data[1], gpuTransposeMatrix[0], NUM * sizeof(float), hipMemcpyDeviceToDevice);
 
-    hipLaunchKernel(matrixTranspose_dynamic_shared,
+    hipLaunchKernelGGL(matrixTranspose_dynamic_shared,
                     dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), sizeof(float) * WIDTH * WIDTH,
                     0, gpuTransposeMatrix[1], data[1], width);
diff --git a/samples/2_Cookbook/9_unroll/unroll.cpp b/samples/2_Cookbook/9_unroll/unroll.cpp
index 512a6d3f22..cd32feff54 100644
--- a/samples/2_Cookbook/9_unroll/unroll.cpp
+++ b/samples/2_Cookbook/9_unroll/unroll.cpp
@@ -35,8 +35,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1
 
 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
     int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
     float val = in[x];
 
@@ -88,7 +87,7 @@ int main() {
     hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);
 
     // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(1), dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y), 0, 0,
+    hipLaunchKernelGGL(matrixTranspose, dim3(1), dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y), 0, 0,
                     gpuTransposeMatrix, gpuMatrix, WIDTH);
 
     // Memory transfer from device to host
diff --git a/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp b/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp
index 3de04a2213..30866da654 100644
--- a/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp
+++ b/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp
@@ -33,7 +33,7 @@ THE SOFTWARE.
 #define _SIZE sizeof(int) * 1024 * 1024
 #define NUM_STREAMS 2
 
-__global__ void Iter(hipLaunchParm lp, int* Ad, int num) {
+__global__ void Iter(int* Ad, int num) {
     int tx = threadIdx.x + blockIdx.x * blockDim.x;
     // Kernel loop designed to execute very slowly... ... ...   so we can test timing-related
     // behavior below
@@ -58,7 +58,7 @@ int main() {
         HIPCHECK(hipMemcpyAsync(Ad[i], A[i], _SIZE, hipMemcpyHostToDevice, stream[i]));
     }
     for (int i = 0; i < NUM_STREAMS; i++) {
-        hipLaunchKernel(HIP_KERNEL_NAME(Iter), dim3(1), dim3(1), 0, stream[i], Ad[i], 1 << 30);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(Iter), dim3(1), dim3(1), 0, stream[i], Ad[i], 1 << 30);
     }
     for (int i = 0; i < NUM_STREAMS; i++) {
         HIPCHECK(hipMemcpyAsync(A[i], Ad[i], _SIZE, hipMemcpyDeviceToHost, stream[i]));
diff --git a/tests/src/deviceLib/hipDeviceMemcpy.cpp b/tests/src/deviceLib/hipDeviceMemcpy.cpp
index 762f9615fd..6ebdebf6fc 100644
--- a/tests/src/deviceLib/hipDeviceMemcpy.cpp
+++ b/tests/src/deviceLib/hipDeviceMemcpy.cpp
@@ -14,12 +14,12 @@
  */
 
 
-__global__ void cpy(hipLaunchParm lp, uint32_t* Out, uint32_t* In) {
+__global__ void cpy(uint32_t* Out, uint32_t* In) {
     int tx = threadIdx.x;
     memcpy(Out + tx, In + tx, sizeof(uint32_t));
 }
 
-__global__ void set(hipLaunchParm lp, uint32_t* ptr, uint8_t val, size_t size) {
+__global__ void set(uint32_t* ptr, uint8_t val, size_t size) {
     int tx = threadIdx.x;
     memset(ptr + tx, val, sizeof(uint32_t));
 }
@@ -39,7 +39,7 @@ int main() {
     hipMalloc((void**)&Bd, SIZE);
     hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
 
-    hipLaunchKernel(cpy, dim3(1), dim3(LEN), 0, 0, Bd, Ad);
+    hipLaunchKernelGGL(cpy, dim3(1), dim3(LEN), 0, 0, Bd, Ad);
 
     hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
     for (int i = LEN - 16; i < LEN; i++) {
@@ -47,7 +47,7 @@ int main() {
             return 0;
         }
     }
-    hipLaunchKernel(set, dim3(1), dim3(LEN), 0, 0, Bd, 0x1, LEN);
+    hipLaunchKernelGGL(set, dim3(1), dim3(LEN), 0, 0, Bd, 0x1, LEN);
 
     hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
     for (int i = LEN - 16; i < LEN; i++) {
diff --git a/tests/src/deviceLib/hipDoublePrecisionIntrinsics.cpp b/tests/src/deviceLib/hipDoublePrecisionIntrinsics.cpp
index 76b7a5de7a..17cd82c9ab 100644
--- a/tests/src/deviceLib/hipDoublePrecisionIntrinsics.cpp
+++ b/tests/src/deviceLib/hipDoublePrecisionIntrinsics.cpp
@@ -64,11 +64,11 @@ __device__ void double_precision_intrinsics() {
     __fma_rz(1.0, 2.0, 3.0);
 }
 
-__global__ void compileDoublePrecisionIntrinsics(hipLaunchParm lp, int ignored) {
+__global__ void compileDoublePrecisionIntrinsics(int ignored) {
     double_precision_intrinsics();
 }
 
 int main() {
-    hipLaunchKernel(compileDoublePrecisionIntrinsics, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, 1);
+    hipLaunchKernelGGL(compileDoublePrecisionIntrinsics, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, 1);
     passed();
 }
diff --git a/tests/src/deviceLib/hipFloatMath.cpp b/tests/src/deviceLib/hipFloatMath.cpp
index d363efc271..ee83309f28 100644
--- a/tests/src/deviceLib/hipFloatMath.cpp
+++ b/tests/src/deviceLib/hipFloatMath.cpp
@@ -33,7 +33,7 @@ THE SOFTWARE.
 #define SIZE LEN << 2
 
 
-__global__ void floatMath(hipLaunchParm lp, float* In, float* Out) {
+__global__ void floatMath(float* In, float* Out) {
     int tid = threadIdx.x + blockIdx.x * blockDim.x;
     Out[tid] = __cosf(In[tid]);
     Out[tid] = __exp10f(Out[tid]);
@@ -57,6 +57,6 @@ int main() {
     float *Ind, *Outd;
     hipMalloc((void**)&Ind, SIZE);
     hipMalloc((void**)&Outd, SIZE);
-    hipLaunchKernel(floatMath, dim3(LEN, 1, 1), dim3(1, 1, 1), 0, 0, Ind, Outd);
+    hipLaunchKernelGGL(floatMath, dim3(LEN, 1, 1), dim3(1, 1, 1), 0, 0, Ind, Outd);
     passed();
 }
diff --git a/tests/src/deviceLib/hipIntegerIntrinsics.cpp b/tests/src/deviceLib/hipIntegerIntrinsics.cpp
index 1e6a645b1a..f75e406e98 100644
--- a/tests/src/deviceLib/hipIntegerIntrinsics.cpp
+++ b/tests/src/deviceLib/hipIntegerIntrinsics.cpp
@@ -62,9 +62,9 @@ __device__ void integer_intrinsics() {
     assert(1);
 }
 
-__global__ void compileIntegerIntrinsics(hipLaunchParm lp, int ignored) { integer_intrinsics(); }
+__global__ void compileIntegerIntrinsics(int ignored) { integer_intrinsics(); }
 
 int main() {
-    hipLaunchKernel(compileIntegerIntrinsics, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, 1);
+    hipLaunchKernelGGL(compileIntegerIntrinsics, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, 1);
     passed();
 }
diff --git a/tests/src/deviceLib/hipMathFunctions.cpp b/tests/src/deviceLib/hipMathFunctions.cpp
index f89bdae149..6a3d5501e0 100644
--- a/tests/src/deviceLib/hipMathFunctions.cpp
+++ b/tests/src/deviceLib/hipMathFunctions.cpp
@@ -31,12 +31,12 @@ THE SOFTWARE.
 
 #if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__ || __HIP_ARCH_GFX906__
 
-__global__ void kernel_abs_int64(hipLaunchParm lp, long long *input, long long *output) {
+__global__ void kernel_abs_int64(long long *input, long long *output) {
     int tx = threadIdx.x;
     output[tx] = abs(input[tx]);
 }
 
-__global__ void kernel_lgamma_double(hipLaunchParm lp, double *input, double *output) {
+__global__ void kernel_lgamma_double(double *input, double *output) {
     int tx = threadIdx.x;
     output[tx] = lgamma(input[tx]);
 }
@@ -79,7 +79,7 @@ void check_lgamma_double() {
   hipMemcpy(inputGPU, inputCPU, memsize, hipMemcpyHostToDevice);
 
   // launch kernel
-  hipLaunchKernel(kernel_lgamma_double, dim3(1), dim3(NUM_INPUTS), 0, 0, inputGPU, outputGPU);
+  hipLaunchKernelGGL(kernel_lgamma_double, dim3(1), dim3(NUM_INPUTS), 0, 0, inputGPU, outputGPU);
 
   // copy outputs from device
   hipMemcpy(outputCPU, outputGPU, memsize, hipMemcpyDeviceToHost);
@@ -127,7 +127,7 @@ void check_abs_int64() {
   hipMemcpy(inputGPU, inputCPU, memsize, hipMemcpyHostToDevice);
 
   // launch kernel
-  hipLaunchKernel(kernel_abs_int64, dim3(1), dim3(NUM_INPUTS), 0, 0, inputGPU, outputGPU);
+  hipLaunchKernelGGL(kernel_abs_int64, dim3(1), dim3(NUM_INPUTS), 0, 0, inputGPU, outputGPU);
 
   // copy outputs from device
   hipMemcpy(outputCPU, outputGPU, memsize, hipMemcpyDeviceToHost);
diff --git a/tests/src/deviceLib/hipSinglePrecisionIntrinsics.cpp b/tests/src/deviceLib/hipSinglePrecisionIntrinsics.cpp
index dab153ed5e..cad06fe4ec 100644
--- a/tests/src/deviceLib/hipSinglePrecisionIntrinsics.cpp
+++ b/tests/src/deviceLib/hipSinglePrecisionIntrinsics.cpp
@@ -79,12 +79,12 @@ __device__ void single_precision_intrinsics() {
 }
 
 
-__global__ void compileSinglePrecisionIntrinsics(hipLaunchParm lp, int ignored) {
+__global__ void compileSinglePrecisionIntrinsics(int ignored) {
     single_precision_intrinsics();
 }
 
 
 int main() {
-    hipLaunchKernel(compileSinglePrecisionIntrinsics, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, 1);
+    hipLaunchKernelGGL(compileSinglePrecisionIntrinsics, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, 1);
     passed();
 }
diff --git a/tests/src/deviceLib/hipSinglePrecisionMathDevice.cpp b/tests/src/deviceLib/hipSinglePrecisionMathDevice.cpp
index 493ff77323..15c0e6c1e3 100644
--- a/tests/src/deviceLib/hipSinglePrecisionMathDevice.cpp
+++ b/tests/src/deviceLib/hipSinglePrecisionMathDevice.cpp
@@ -129,11 +129,11 @@ __device__ void single_precision_math_functions() {
     ynf(1, 1.0f);
 }
 
-__global__ void compileSinglePrecisionMathOnDevice(hipLaunchParm lp, int ignored) {
+__global__ void compileSinglePrecisionMathOnDevice(int ignored) {
     single_precision_math_functions();
 }
 
 int main() {
-    hipLaunchKernel(compileSinglePrecisionMathOnDevice, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, 1);
+    hipLaunchKernelGGL(compileSinglePrecisionMathOnDevice, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, 1);
     passed();
 }
diff --git a/tests/src/deviceLib/hipStdComplex.cpp b/tests/src/deviceLib/hipStdComplex.cpp
index 594b0c6bdd..5d91c574ec 100644
--- a/tests/src/deviceLib/hipStdComplex.cpp
+++ b/tests/src/deviceLib/hipStdComplex.cpp
@@ -84,7 +84,7 @@ __device__ __host__ std::complex<FloatT> calc(std::complex<FloatT> A,
 }
 
 template<typename FloatT>
-__global__ void kernel(hipLaunchParm lp, std::complex<FloatT>* A,
+__global__ void kernel(std::complex<FloatT>* A,
                        std::complex<FloatT>* B, std::complex<FloatT>* C,
                        enum CalcKind CK) {
     int tx = threadIdx.x + blockIdx.x * blockDim.x;
@@ -114,7 +114,7 @@ void test() {
     // Run kernel for a calculation kind and verify by comparing with host
     // calculation result. Returns false if fails.
     auto test_fun = [&](enum CalcKind CK) {
-      hipLaunchKernel(kernel<FloatT>, dim3(1), dim3(LEN), 0, 0, Ad, Bd, Cd, CK);
+      hipLaunchKernelGGL(kernel<FloatT>, dim3(1), dim3(LEN), 0, 0, Ad, Bd, Cd, CK);
       hipMemcpy(C, Cd, sizeof(ComplexT)*LEN, hipMemcpyDeviceToHost);
       for (int i = 0; i < LEN; i++) {
         ComplexT Expected = calc(A[i], B[i], CK);
diff --git a/tests/src/deviceLib/hipTestDeviceDouble.cpp b/tests/src/deviceLib/hipTestDeviceDouble.cpp
index cff8d1f28d..b29a9d2c20 100644
--- a/tests/src/deviceLib/hipTestDeviceDouble.cpp
+++ b/tests/src/deviceLib/hipTestDeviceDouble.cpp
@@ -31,74 +31,74 @@ THE SOFTWARE.
 #define N 512
 #define SIZE N * sizeof(double)
 
-__global__ void test_sincos(hipLaunchParm lp, double* a, double* b, double* c) {
+__global__ void test_sincos(double* a, double* b, double* c) {
     int tid = threadIdx.x;
     sincos(a[tid], b + tid, c + tid);
 }
 
-__global__ void test_sincospi(hipLaunchParm lp, double* a, double* b, double* c) {
+__global__ void test_sincospi(double* a, double* b, double* c) {
     int tid = threadIdx.x;
     sincospi(a[tid], b + tid, c + tid);
 }
 
-__global__ void test_llrint(hipLaunchParm lp, double* a, long long int* b) {
+__global__ void test_llrint(double* a, long long int* b) {
     int tid = threadIdx.x;
     b[tid] = llrint(a[tid]);
 }
 
-__global__ void test_lrint(hipLaunchParm lp, double* a, long int* b) {
+__global__ void test_lrint(double* a, long int* b) {
     int tid = threadIdx.x;
     b[tid] = lrint(a[tid]);
 }
 
-__global__ void test_rint(hipLaunchParm lp, double* a, double* b) {
+__global__ void test_rint(double* a, double* b) {
     int tid = threadIdx.x;
     b[tid] = rint(a[tid]);
 }
 
-__global__ void test_llround(hipLaunchParm lp, double* a, long long int* b) {
+__global__ void test_llround(double* a, long long int* b) {
     int tid = threadIdx.x;
     b[tid] = llround(a[tid]);
 }
 
-__global__ void test_lround(hipLaunchParm lp, double* a, long int* b) {
+__global__ void test_lround(double* a, long int* b) {
     int tid = threadIdx.x;
     b[tid] = lround(a[tid]);
 }
 
-__global__ void test_rhypot(hipLaunchParm lp, double* a, double* b, double* c) {
+__global__ void test_rhypot(double* a, double* b, double* c) {
     int tid = threadIdx.x;
     c[tid] = rhypot(a[tid], b[tid]);
 }
 
-__global__ void test_norm3d(hipLaunchParm lp, double* a, double* b, double* c, double* d) {
+__global__ void test_norm3d(double* a, double* b, double* c, double* d) {
     int tid = threadIdx.x;
     d[tid] = norm3d(a[tid], b[tid], c[tid]);
 }
 
-__global__ void test_norm4d(hipLaunchParm lp, double* a, double* b, double* c, double* d,
+__global__ void test_norm4d(double* a, double* b, double* c, double* d,
                             double* e) {
     int tid = threadIdx.x;
     e[tid] = norm4d(a[tid], b[tid], c[tid], d[tid]);
 }
 
-__global__ void test_rnorm3d(hipLaunchParm lp, double* a, double* b, double* c, double* d) {
+__global__ void test_rnorm3d(double* a, double* b, double* c, double* d) {
     int tid = threadIdx.x;
     d[tid] = rnorm3d(a[tid], b[tid], c[tid]);
 }
 
-__global__ void test_rnorm4d(hipLaunchParm lp, double* a, double* b, double* c, double* d,
+__global__ void test_rnorm4d(double* a, double* b, double* c, double* d,
                              double* e) {
     int tid = threadIdx.x;
     e[tid] = rnorm4d(a[tid], b[tid], c[tid], d[tid]);
 }
 
-__global__ void test_rnorm(hipLaunchParm lp, double* a, double* b) {
+__global__ void test_rnorm(double* a, double* b) {
     int tid = threadIdx.x;
     b[tid] = rnorm(N, a);
 }
 
-__global__ void test_erfinv(hipLaunchParm lp, double* a, double* b) {
+__global__ void test_erfinv(double* a, double* b) {
     int tid = threadIdx.x;
     b[tid] = erf(erfinv(a[tid]));
 }
@@ -115,7 +115,7 @@ bool run_sincos() {
     hipMalloc((void**)&Bd, SIZE);
     hipMalloc((void**)&Cd, SIZE);
     hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_sincos, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd);
+    hipLaunchKernelGGL(test_sincos, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd);
     hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
     hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost);
     int passed = 0;
@@ -157,7 +157,7 @@ bool run_sincospi() {
     hipMalloc((void**)&Bd, SIZE);
     hipMalloc((void**)&Cd, SIZE);
     hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_sincospi, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd);
+    hipLaunchKernelGGL(test_sincospi, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd);
     hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
     hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost);
     int passed = 0;
@@ -199,7 +199,7 @@ bool run_llrint() {
     hipMalloc((void**)&Ad, SIZE);
     hipMalloc((void**)&Bd, N * sizeof(long long int));
     hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_llrint, dim3(1), dim3(N), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(test_llrint, dim3(1), dim3(N), 0, 0, Ad, Bd);
     hipMemcpy(B, Bd, N * sizeof(long long int), hipMemcpyDeviceToHost);
     int passed = 0;
     for (int i = 0; i < 512; i++) {
@@ -233,7 +233,7 @@ bool run_lrint() {
     hipMalloc((void**)&Ad, SIZE);
     hipMalloc((void**)&Bd, N * sizeof(long int));
     hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_lrint, dim3(1), dim3(N), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(test_lrint, dim3(1), dim3(N), 0, 0, Ad, Bd);
     hipMemcpy(B, Bd, N * sizeof(long int), hipMemcpyDeviceToHost);
     int passed = 0;
     for (int i = 0; i < 512; i++) {
@@ -266,7 +266,7 @@ bool run_rint() {
     hipMalloc((void**)&Ad, SIZE);
     hipMalloc((void**)&Bd, SIZE);
     hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_rint, dim3(1), dim3(N), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(test_rint, dim3(1), dim3(N), 0, 0, Ad, Bd);
     hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
     int passed = 0;
     for (int i = 0; i < 512; i++) {
@@ -300,7 +300,7 @@ bool run_llround() {
     hipMalloc((void**)&Ad, SIZE);
     hipMalloc((void**)&Bd, N * sizeof(long long int));
     hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_llround, dim3(1), dim3(N), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(test_llround, dim3(1), dim3(N), 0, 0, Ad, Bd);
     hipMemcpy(B, Bd, N * sizeof(long long int), hipMemcpyDeviceToHost);
     int passed = 0;
     for (int i = 0; i < 512; i++) {
@@ -333,7 +333,7 @@ bool run_lround() {
     hipMalloc((void**)&Ad, SIZE);
     hipMalloc((void**)&Bd, N * sizeof(long int));
     hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_lround, dim3(1), dim3(N), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(test_lround, dim3(1), dim3(N), 0, 0, Ad, Bd);
     hipMemcpy(B, Bd, N * sizeof(long int), hipMemcpyDeviceToHost);
     int passed = 0;
     for (int i = 0; i < 512; i++) {
@@ -376,7 +376,7 @@ bool run_norm3d() {
     hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
     hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
     hipMemcpy(Cd, C, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_norm3d, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd);
+    hipLaunchKernelGGL(test_norm3d, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd);
     hipMemcpy(D, Dd, SIZE, hipMemcpyDeviceToHost);
     int passed = 0;
     for (int i = 0; i < 512; i++) {
@@ -425,7 +425,7 @@ bool run_norm4d() {
     hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
     hipMemcpy(Cd, C, SIZE, hipMemcpyHostToDevice);
     hipMemcpy(Dd, D, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_norm4d, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd, Ed);
+    hipLaunchKernelGGL(test_norm4d, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd, Ed);
     hipMemcpy(E, Ed, SIZE, hipMemcpyDeviceToHost);
     int passed = 0;
     for (int i = 0; i < 512; i++) {
@@ -469,7 +469,7 @@ bool run_rhypot() {
     hipMalloc((void**)&Cd, SIZE);
     hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
     hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_rhypot, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd);
+    hipLaunchKernelGGL(test_rhypot, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd);
     hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost);
     int passed = 0;
     for (int i = 0; i < 512; i++) {
@@ -512,7 +512,7 @@ bool run_rnorm3d() {
     hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
     hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
     hipMemcpy(Cd, C, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_rnorm3d, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd);
+    hipLaunchKernelGGL(test_rnorm3d, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd);
     hipMemcpy(D, Dd, SIZE, hipMemcpyDeviceToHost);
     int passed = 0;
     for (int i = 0; i < 512; i++) {
@@ -561,7 +561,7 @@ bool run_rnorm4d() {
     hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
     hipMemcpy(Cd, C, SIZE, hipMemcpyHostToDevice);
     hipMemcpy(Dd, D, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_rnorm4d, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd, Ed);
+    hipLaunchKernelGGL(test_rnorm4d, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd, Ed);
     hipMemcpy(E, Ed, SIZE, hipMemcpyDeviceToHost);
     int passed = 0;
     for (int i = 0; i < 512; i++) {
@@ -602,7 +602,7 @@ bool run_rnorm() {
     hipMalloc((void**)&Ad, SIZE);
     hipMalloc((void**)&Bd, SIZE);
     hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_rnorm, dim3(1), dim3(N), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(test_rnorm, dim3(1), dim3(N), 0, 0, Ad, Bd);
     hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
     int passed = 0;
     for (int i = 0; i < 512; i++) {
@@ -634,7 +634,7 @@ bool run_erfinv() {
     hipMalloc((void**)&Ad, SIZE);
     hipMalloc((void**)&Bd, SIZE);
     hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_erfinv, dim3(1), dim3(N), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(test_erfinv, dim3(1), dim3(N), 0, 0, Ad, Bd);
     hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
     int passed = 0;
     for (int i = 0; i < 512; i++) {
diff --git a/tests/src/deviceLib/hipTestDeviceSymbol.cpp b/tests/src/deviceLib/hipTestDeviceSymbol.cpp
index 2c94da67be..4bac9a902b 100644
--- a/tests/src/deviceLib/hipTestDeviceSymbol.cpp
+++ b/tests/src/deviceLib/hipTestDeviceSymbol.cpp
@@ -34,7 +34,7 @@ THE SOFTWARE.
 __device__ int globalIn[NUM];
 __device__ int globalOut[NUM];
 
-__global__ void Assign(hipLaunchParm lp, int* Out) {
+__global__ void Assign(int* Out) {
     int tid = threadIdx.x + blockIdx.x * blockDim.x;
     Out[tid] = globalIn[tid];
     globalOut[tid] = globalIn[tid];
@@ -63,7 +63,7 @@ int main() {
     hipStreamCreate(&stream);
     hipMemcpyToSymbolAsync(HIP_SYMBOL(globalIn), Am, SIZE, 0, hipMemcpyHostToDevice, stream);
     hipStreamSynchronize(stream);
-    hipLaunchKernel(Assign, dim3(1, 1, 1), dim3(NUM, 1, 1), 0, 0, Ad);
+    hipLaunchKernelGGL(Assign, dim3(1, 1, 1), dim3(NUM, 1, 1), 0, 0, Ad);
     hipMemcpy(B, Ad, SIZE, hipMemcpyDeviceToHost);
     hipMemcpyFromSymbolAsync(Cm, HIP_SYMBOL(globalOut), SIZE, 0, hipMemcpyDeviceToHost, stream);
     hipStreamSynchronize(stream);
@@ -78,7 +78,7 @@ int main() {
     }
 
     hipMemcpyToSymbol(HIP_SYMBOL(globalIn), A, SIZE, 0, hipMemcpyHostToDevice);
-    hipLaunchKernel(Assign, dim3(1, 1, 1), dim3(NUM, 1, 1), 0, 0, Ad);
+    hipLaunchKernelGGL(Assign, dim3(1, 1, 1), dim3(NUM, 1, 1), 0, 0, Ad);
     hipMemcpy(B, Ad, SIZE, hipMemcpyDeviceToHost);
     hipMemcpyFromSymbol(C, HIP_SYMBOL(globalOut), SIZE, 0, hipMemcpyDeviceToHost);
     for (int i = 0; i < NUM; i++) {
@@ -93,7 +93,7 @@ int main() {
 
     hipMemcpyToSymbolAsync(HIP_SYMBOL(globalIn), A, SIZE, 0, hipMemcpyHostToDevice, stream);
     hipStreamSynchronize(stream);
-    hipLaunchKernel(Assign, dim3(1, 1, 1), dim3(NUM, 1, 1), 0, 0, Ad);
+    hipLaunchKernelGGL(Assign, dim3(1, 1, 1), dim3(NUM, 1, 1), 0, 0, Ad);
     hipMemcpy(B, Ad, SIZE, hipMemcpyDeviceToHost);
     hipMemcpyFromSymbolAsync(C, HIP_SYMBOL(globalOut), SIZE, 0, hipMemcpyDeviceToHost, stream);
     hipStreamSynchronize(stream);
diff --git a/tests/src/deviceLib/hipThreadFence.cpp b/tests/src/deviceLib/hipThreadFence.cpp
index b81d84a5d4..296d1519aa 100644
--- a/tests/src/deviceLib/hipThreadFence.cpp
+++ b/tests/src/deviceLib/hipThreadFence.cpp
@@ -31,7 +31,7 @@ THE SOFTWARE.
 #define NUM 1024
 #define SIZE NUM * sizeof(float)
 
-__global__ void vAdd(hipLaunchParm lp, float* In1, float* In2, float* In3, float* In4, float* Out) {
+__global__ void vAdd(float* In1, float* In2, float* In3, float* In4, float* Out) {
     int tid = threadIdx.x + blockIdx.x * blockDim.x;
     In4[tid] = In1[tid] + In2[tid];
     __threadfence();
@@ -66,7 +66,7 @@ int main() {
     hipMemcpy(In3d, In3, SIZE, hipMemcpyHostToDevice);
     hipMemcpy(In4d, In4, SIZE, hipMemcpyHostToDevice);
 
-    hipLaunchKernel(vAdd, dim3(32, 1, 1), dim3(32, 1, 1), 0, 0, In1d, In2d, In3d, In4d, Outd);
+    hipLaunchKernelGGL(vAdd, dim3(32, 1, 1), dim3(32, 1, 1), 0, 0, In1d, In2d, In3d, In4d, Outd);
     hipMemcpy(Out, Outd, SIZE, hipMemcpyDeviceToHost);
     assert(Out[10] == 2 * In1[10] + 2 * In2[10] + In3[10]);
     passed();
diff --git a/tests/src/deviceLib/hip_anyall.cpp b/tests/src/deviceLib/hip_anyall.cpp
index 934c4dbcfd..6815ce8cbf 100644
--- a/tests/src/deviceLib/hip_anyall.cpp
+++ b/tests/src/deviceLib/hip_anyall.cpp
@@ -33,7 +33,7 @@ THE SOFTWARE.
 #include <hip/device_functions.h>
 #define HIP_ASSERT(x) (assert((x) == hipSuccess))
 
-__global__ void warpvote(hipLaunchParm lp, int* device_any, int* device_all,
+__global__ void warpvote(int* device_any, int* device_all,
                          int Num_Warps_per_Block, int pshift) {
     int tid = threadIdx.x + blockIdx.x * blockDim.x;
     device_any[threadIdx.x >> pshift] = __any(tid - 77);
@@ -73,7 +73,7 @@ int main(int argc, char* argv[]) {
     HIP_ASSERT(hipMemcpy(device_any, host_any, sizeof(int), hipMemcpyHostToDevice));
     HIP_ASSERT(hipMemcpy(device_all, host_all, sizeof(int), hipMemcpyHostToDevice));
 
-    hipLaunchKernel(warpvote, dim3(Num_Blocks_per_Grid), dim3(Num_Threads_per_Block), 0, 0,
+    hipLaunchKernelGGL(warpvote, dim3(Num_Blocks_per_Grid), dim3(Num_Threads_per_Block), 0, 0,
                     device_any, device_all, Num_Warps_per_Block, pshift);
 
 
diff --git a/tests/src/deviceLib/hip_ballot.cpp b/tests/src/deviceLib/hip_ballot.cpp
index 603614b22f..92cfabbd36 100644
--- a/tests/src/deviceLib/hip_ballot.cpp
+++ b/tests/src/deviceLib/hip_ballot.cpp
@@ -30,7 +30,7 @@ THE SOFTWARE.
 
 #define HIP_ASSERT(x) (assert((x) == hipSuccess))
 
-__global__ void gpu_ballot(hipLaunchParm lp, unsigned int* device_ballot, int Num_Warps_per_Block,
+__global__ void gpu_ballot(unsigned int* device_ballot, int Num_Warps_per_Block,
                            int pshift) {
     int tid = threadIdx.x + blockIdx.x * blockDim.x;
     const unsigned int warp_num = threadIdx.x >> pshift;
@@ -69,7 +69,7 @@ int main(int argc, char* argv[]) {
     HIP_ASSERT(hipMemcpy(device_ballot, host_ballot, Num_Warps_per_Grid * sizeof(unsigned int),
                          hipMemcpyHostToDevice));
 
-    hipLaunchKernel(gpu_ballot, dim3(Num_Blocks_per_Grid), dim3(Num_Threads_per_Block), 0, 0,
+    hipLaunchKernelGGL(gpu_ballot, dim3(Num_Blocks_per_Grid), dim3(Num_Threads_per_Block), 0, 0,
                     device_ballot, Num_Warps_per_Block, pshift);
 
 
diff --git a/tests/src/deviceLib/hip_bitextract.cpp b/tests/src/deviceLib/hip_bitextract.cpp
index 34e0be2b57..e791d66b34 100644
--- a/tests/src/deviceLib/hip_bitextract.cpp
+++ b/tests/src/deviceLib/hip_bitextract.cpp
@@ -53,8 +53,7 @@ T bit_extract(T src0, unsigned int src1, unsigned int src2) {
   }
 }
 
-__global__ void HIP_kernel(hipLaunchParm lp,
-                           unsigned int* out32, unsigned int* in32_0,
+__global__ void HIP_kernel(unsigned int* out32, unsigned int* in32_0,
                            unsigned int* in32_1, unsigned int* in32_2,
                            unsigned long long int* out64, unsigned long long int* in64_0,
                            unsigned int* in64_1, unsigned int* in64_2) {
@@ -150,7 +149,7 @@ int main() {
     HIP_ASSERT(hipMemcpy(deviceSrc264, hostSrc264, NUM * sizeof(unsigned int), hipMemcpyHostToDevice));
 
 
-    hipLaunchKernel(HIP_kernel, dim3(num_blocks), dim3(num_threads_per_block),
+    hipLaunchKernelGGL(HIP_kernel, dim3(num_blocks), dim3(num_threads_per_block),
                        0, 0,
                        deviceOut32, deviceSrc032, deviceSrc132, deviceSrc232,
                        deviceOut64, deviceSrc064, deviceSrc164, deviceSrc264);
diff --git a/tests/src/deviceLib/hip_bitinsert.cpp b/tests/src/deviceLib/hip_bitinsert.cpp
index 063281768c..bf00f4143a 100644
--- a/tests/src/deviceLib/hip_bitinsert.cpp
+++ b/tests/src/deviceLib/hip_bitinsert.cpp
@@ -50,7 +50,7 @@ T bit_insert(T src0, T src1, unsigned int src2, unsigned int src3) {
   return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
 }
 
-__global__ void HIP_kernel(hipLaunchParm lp, unsigned int* out32,
+__global__ void HIP_kernel(unsigned int* out32,
                            unsigned int* in32_0, unsigned int* in32_1,
                            unsigned int* in32_2, unsigned int* in32_3,
                            unsigned long long int* out64, unsigned long long int* in64_0,
@@ -161,7 +161,7 @@ int main() {
     HIP_ASSERT(hipMemcpy(deviceSrc364, hostSrc364, NUM * sizeof(unsigned int), hipMemcpyHostToDevice));
 
 
-    hipLaunchKernel(HIP_kernel, dim3(num_blocks), dim3(num_threads_per_block),
+    hipLaunchKernelGGL(HIP_kernel, dim3(num_blocks), dim3(num_threads_per_block),
                        0, 0,
                        deviceOut32, deviceSrc032, deviceSrc132, deviceSrc232, deviceSrc332,
                        deviceOut64, deviceSrc064, deviceSrc164, deviceSrc264, deviceSrc364);
diff --git a/tests/src/deviceLib/hip_brev.cpp b/tests/src/deviceLib/hip_brev.cpp
index af64884f9f..cc48cca31f 100644
--- a/tests/src/deviceLib/hip_brev.cpp
+++ b/tests/src/deviceLib/hip_brev.cpp
@@ -64,7 +64,7 @@ T bitreverse(T num) {
     return reverse_num;
 }
 
-__global__ void HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned int* b,
+__global__ void HIP_kernel(unsigned int* a, unsigned int* b,
                            unsigned long long int* c, unsigned long long int* d, int width,
                            int height) {
     int x = blockDim.x * blockIdx.x + threadIdx.x;
@@ -124,7 +124,7 @@ int main() {
         hipMemcpy(deviceD, hostD, NUM * sizeof(unsigned long long int), hipMemcpyHostToDevice));
 
 
-    hipLaunchKernel(HIP_kernel, dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(HIP_kernel, dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA, deviceB, deviceC,
                     deviceD, WIDTH, HEIGHT);
 
diff --git a/tests/src/deviceLib/hip_clz.cpp b/tests/src/deviceLib/hip_clz.cpp
index 604b62c7e2..77ba980fe3 100644
--- a/tests/src/deviceLib/hip_clz.cpp
+++ b/tests/src/deviceLib/hip_clz.cpp
@@ -82,7 +82,7 @@ __device__ void test_ambiguity() {
   __clzll(ui);
 }
 
-__global__ void HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned int* b, unsigned int* c,
+__global__ void HIP_kernel(unsigned int* a, unsigned int* b, unsigned int* c,
                            unsigned long long int* d, int width, int height) {
     int x = blockDim.x * blockIdx.x + threadIdx.x;
     int y = blockDim.y * blockIdx.y + threadIdx.y;
@@ -138,7 +138,7 @@ int main() {
     HIP_ASSERT(
         hipMemcpy(deviceD, hostD, NUM * sizeof(unsigned long long int), hipMemcpyHostToDevice));
 
-    hipLaunchKernel(HIP_kernel, dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(HIP_kernel, dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA, deviceB, deviceC,
                     deviceD, WIDTH, HEIGHT);
 
diff --git a/tests/src/deviceLib/hip_ffs.cpp b/tests/src/deviceLib/hip_ffs.cpp
index 04cf38caa8..a877b113f1 100644
--- a/tests/src/deviceLib/hip_ffs.cpp
+++ b/tests/src/deviceLib/hip_ffs.cpp
@@ -59,7 +59,7 @@ int lastbit(T a) {
 }
 
 
-__global__ void HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned int* b, unsigned int* c,
+__global__ void HIP_kernel(unsigned int* a, unsigned int* b, unsigned int* c,
                            unsigned long long int* d, int width, int height) {
     int x = blockDim.x * blockIdx.x + threadIdx.x;
     int y = blockDim.y * blockIdx.y + threadIdx.y;
@@ -117,7 +117,7 @@ int main() {
     HIP_ASSERT(
         hipMemcpy(deviceD, hostD, NUM * sizeof(unsigned long long int), hipMemcpyHostToDevice));
 
-    hipLaunchKernel(HIP_kernel, dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(HIP_kernel, dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA, deviceB, deviceC,
                     deviceD, WIDTH, HEIGHT);
 
diff --git a/tests/src/deviceLib/hip_mbcnt.cpp b/tests/src/deviceLib/hip_mbcnt.cpp
index 9fdf36a1d3..440e54c222 100644
--- a/tests/src/deviceLib/hip_mbcnt.cpp
+++ b/tests/src/deviceLib/hip_mbcnt.cpp
@@ -36,7 +36,7 @@ THE SOFTWARE.
 
 #define HIP_ASSERT(x) (assert((x) == hipSuccess))
 
-__global__ void HIP_kernel(hipLaunchParm lp, unsigned int* mbcnt_lo, unsigned int* mbcnt_hi, unsigned int* lane_id) {
+__global__ void HIP_kernel(unsigned int* mbcnt_lo, unsigned int* mbcnt_hi, unsigned int* lane_id) {
     int x = blockDim.x * blockIdx.x + threadIdx.x;
     mbcnt_lo[x] = __mbcnt_lo(0xFFFFFFFF, 0);
     mbcnt_hi[x] = __mbcnt_hi(0xFFFFFFFF, 0);
@@ -70,7 +70,7 @@ int main() {
     HIP_ASSERT(hipMalloc((void**)&device_mbcnt_hi, buffer_size));
     HIP_ASSERT(hipMalloc((void**)&device_lane_id, buffer_size));
 
-    hipLaunchKernel(HIP_kernel, dim3(num_blocks),
+    hipLaunchKernelGGL(HIP_kernel, dim3(num_blocks),
                     dim3(num_threads_per_block), 0, 0, device_mbcnt_lo, device_mbcnt_hi, device_lane_id);
 
     unsigned int* host_mbcnt_lo = (unsigned int*) malloc(buffer_size);
diff --git a/tests/src/deviceLib/hip_popc.cpp b/tests/src/deviceLib/hip_popc.cpp
index b083c225ad..643b1ca783 100644
--- a/tests/src/deviceLib/hip_popc.cpp
+++ b/tests/src/deviceLib/hip_popc.cpp
@@ -58,7 +58,7 @@ unsigned int popcountCPU(T value) {
     return ret;
 }
 
-__global__ void HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned int* b, unsigned int* c,
+__global__ void HIP_kernel(unsigned int* a, unsigned int* b, unsigned int* c,
                            unsigned long long int* d, int width, int height) {
     int x = blockDim.x * blockIdx.x + threadIdx.x;
     int y = blockDim.y * blockIdx.y + threadIdx.y;
@@ -117,7 +117,7 @@ int main() {
         hipMemcpy(deviceD, hostD, NUM * sizeof(unsigned long long int), hipMemcpyHostToDevice));
 
 
-    hipLaunchKernel(HIP_kernel, dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(HIP_kernel, dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA, deviceB, deviceC,
                     deviceD, WIDTH, HEIGHT);
 
diff --git a/tests/src/deviceLib/hip_test_ldg.cpp b/tests/src/deviceLib/hip_test_ldg.cpp
index 7274baa92c..3d623f9cc3 100644
--- a/tests/src/deviceLib/hip_test_ldg.cpp
+++ b/tests/src/deviceLib/hip_test_ldg.cpp
@@ -52,7 +52,7 @@ THE SOFTWARE.
 using namespace std;
 
 template <typename T>
-__global__ void vectoradd_float(hipLaunchParm lp, T* a, const T* bm, int width, int height)
+__global__ void vectoradd_float(T* a, const T* bm, int width, int height)
 
 {
     int x = blockDim.x * blockIdx.x + threadIdx.x;
@@ -120,7 +120,7 @@ bool dataTypesRun() {
     HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM * sizeof(T), hipMemcpyHostToDevice));
 
 
-    hipLaunchKernel(vectoradd_float,
+    hipLaunchKernelGGL(vectoradd_float,
                     dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA,
                     static_cast<const T*>(deviceB), WIDTH, HEIGHT);
@@ -178,7 +178,7 @@ bool dataTypesRun2() {
     HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T)));
 
     HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM * sizeof(T), hipMemcpyHostToDevice));
-    hipLaunchKernel(vectoradd_float,
+    hipLaunchKernelGGL(vectoradd_float,
                     dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA,
                     static_cast<const T*>(deviceB), WIDTH, HEIGHT);
@@ -236,7 +236,7 @@ bool dataTypesRun4() {
     HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM * sizeof(T), hipMemcpyHostToDevice));
 
 
-    hipLaunchKernel(vectoradd_float,
+    hipLaunchKernelGGL(vectoradd_float,
                     dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
                     dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA,
                     static_cast<const T*>(deviceB), WIDTH, HEIGHT);
diff --git a/tests/src/deviceLib/hip_trig.cpp b/tests/src/deviceLib/hip_trig.cpp
index 7b076065fa..61876c33dc 100644
--- a/tests/src/deviceLib/hip_trig.cpp
+++ b/tests/src/deviceLib/hip_trig.cpp
@@ -39,7 +39,7 @@ THE SOFTWARE.
 
 #define TEST_DEBUG (0)
 
-__global__ void kernel_trig(hipLaunchParm lp, float* In, float* sin_d, float* cos_d, float* tan_d,
+__global__ void kernel_trig(float* In, float* sin_d, float* cos_d, float* tan_d,
                             float* sin_pd, float* cos_pd) {
     int tid = threadIdx.x + blockIdx.x * blockDim.x;
     sin_d[tid] = sinf(In[tid]);
@@ -74,7 +74,7 @@ int main() {
     HIP_ASSERT(hipMalloc((void**)&cos_pd, SIZE));
 
     hipMemcpy(In_d, In, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(kernel_trig, dim3(LEN, 1, 1), dim3(1, 1, 1), 0, 0,
+    hipLaunchKernelGGL(kernel_trig, dim3(LEN, 1, 1), dim3(1, 1, 1), 0, 0,
                     In_d, sin_d, cos_d, tan_d,
                     sin_pd, cos_pd);
     hipMemcpy(sin_h, sin_d, SIZE, hipMemcpyDeviceToHost);
diff --git a/tests/src/hipC.c b/tests/src/hipC.c
index efa03bb909..1484adf379 100644
--- a/tests/src/hipC.c
+++ b/tests/src/hipC.c
@@ -33,7 +33,7 @@ THE SOFTWARE.
 #define ITER 1<<20
 #define SIZE 1024*1024*sizeof(int)
 
-__global__ void Iter(hipLaunchParm lp, int *Ad){
+__global__ void Iter(int *Ad){
     int tx = threadIdx.x + blockIdx.x * blockDim.x;
     if(tx == 0){
         for(int i=0;i<ITER;i++){
@@ -49,7 +49,7 @@ int main(){
     dim3 dimGrid, dimBlock;
     dimGrid.x = 1, dimGrid.y =1, dimGrid.z = 1;
     dimBlock.x = 1, dimBlock.y = 1, dimGrid.z = 1;
-    hipLaunchKernel(HIP_KERNEL_NAME(Iter), dimGrid, dimBlock, 0, 0, Ad);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Iter), dimGrid, dimBlock, 0, 0, Ad);
     hipMemcpy(&A, Ad, SIZE, hipMemcpyDeviceToHost);
     passed();
 }
diff --git a/tests/src/kernel/hipDynamicShared.cpp b/tests/src/kernel/hipDynamicShared.cpp
index c803279977..8edd94b48c 100644
--- a/tests/src/kernel/hipDynamicShared.cpp
+++ b/tests/src/kernel/hipDynamicShared.cpp
@@ -30,7 +30,7 @@ THE SOFTWARE.
 #include "test_common.h"
 
 template <typename T>
-__global__ void testExternSharedKernel(hipLaunchParm lp, const T* A_d, const T* B_d, T* C_d,
+__global__ void testExternSharedKernel(const T* A_d, const T* B_d, T* C_d,
                                        size_t numElements, size_t groupElements) {
     // declare dynamic shared memory
 #if defined(__HIP_PLATFORM_HCC__)
@@ -114,7 +114,7 @@ void testExternShared(size_t N, size_t groupElements) {
     size_t groupMemBytes = groupElements * sizeof(T);
 
     // launch kernel with dynamic shared memory
-    hipLaunchKernel(HIP_KERNEL_NAME(testExternSharedKernel<T>), dim3(blocks), dim3(threadsPerBlock),
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(testExternSharedKernel<T>), dim3(blocks), dim3(threadsPerBlock),
                     groupMemBytes, 0, A_d, B_d, C_d, N, groupElements);
 
     HIPCHECK(hipDeviceSynchronize());
diff --git a/tests/src/kernel/hipDynamicShared2.cpp b/tests/src/kernel/hipDynamicShared2.cpp
index 8ea097f5bd..0e85c1a8c0 100644
--- a/tests/src/kernel/hipDynamicShared2.cpp
+++ b/tests/src/kernel/hipDynamicShared2.cpp
@@ -32,7 +32,7 @@ THE SOFTWARE.
 #define LEN 16 * 1024
 #define SIZE LEN * 4
 
-__global__ void vectorAdd(hipLaunchParm lp, float* Ad, float* Bd) {
+__global__ void vectorAdd(float* Ad, float* Bd) {
     HIP_DYNAMIC_SHARED(float, sBd);
     int tx = threadIdx.x;
     for (int i = 0; i < LEN / 64; i++) {
@@ -53,7 +53,7 @@ int main() {
     hipMalloc(&Bd, SIZE);
     hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
     hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(vectorAdd, dim3(1, 1, 1), dim3(64, 1, 1), SIZE, 0, Ad, Bd);
+    hipLaunchKernelGGL(vectorAdd, dim3(1, 1, 1), dim3(64, 1, 1), SIZE, 0, Ad, Bd);
     hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
     for (int i = 0; i < LEN; i++) {
         assert(B[i] > 1.0f && B[i] < 3.0f);
diff --git a/tests/src/kernel/hipEmptyKernel.cpp b/tests/src/kernel/hipEmptyKernel.cpp
index 6fac01013e..3a42b33072 100644
--- a/tests/src/kernel/hipEmptyKernel.cpp
+++ b/tests/src/kernel/hipEmptyKernel.cpp
@@ -25,10 +25,10 @@ THE SOFTWARE.
 
 #include "test_common.h"
 
-__global__ void Empty(hipLaunchParm lp, int param) {}
+__global__ void Empty(int param) {}
 
 int main() {
-    hipLaunchKernel(HIP_KERNEL_NAME(Empty), dim3(1), dim3(1), 0, 0, 0);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Empty), dim3(1), dim3(1), 0, 0, 0);
     hipDeviceSynchronize();
     passed();
 }
diff --git a/tests/src/kernel/hipGridLaunch.cpp b/tests/src/kernel/hipGridLaunch.cpp
index e7c6ef4718..8505a0758f 100644
--- a/tests/src/kernel/hipGridLaunch.cpp
+++ b/tests/src/kernel/hipGridLaunch.cpp
@@ -37,7 +37,7 @@ __device__ int foo(int i) { return i + 1; }
 //---
 // Syntax we would like to support with GRID_LAUNCH enabled:
 template <typename T>
-__global__ void vectorADD2(hipLaunchParm lp, T* A_d, T* B_d, T* C_d, size_t N) {
+__global__ void vectorADD2(T* A_d, T* B_d, T* C_d, size_t N) {
     size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
     size_t stride = blockDim.x * gridDim.x;
 
@@ -63,7 +63,7 @@ int test_gl2(size_t N) {
     HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
     HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
 
-    hipLaunchKernel(vectorADD2, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N);
+    hipLaunchKernelGGL(vectorADD2, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N);
 
     HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
 
diff --git a/tests/src/kernel/hipLanguageExtensions.cpp b/tests/src/kernel/hipLanguageExtensions.cpp
index 36b79965ca..63655e5092 100644
--- a/tests/src/kernel/hipLanguageExtensions.cpp
+++ b/tests/src/kernel/hipLanguageExtensions.cpp
@@ -31,7 +31,7 @@ THE SOFTWARE.
 #include <test_common.h>
 
 #ifdef __HCC__
-#include <amp.h>
+#include <hc.hpp>
 #endif
 
 // cudaA
@@ -53,7 +53,7 @@ __device__ __forceinline__ int sum1_forceinline(int a) { return a + 1; };
 
 __device__ __host__ float PlusOne(float x) { return x + 1.0; }
 
-__global__ void MyKernel(const hipLaunchParm lp, const float* a, const float* b, float* c,
+__global__ void MyKernel(const float* a, const float* b, float* c,
                          unsigned N) {
     // KERNELBEGIN;
 
@@ -71,12 +71,12 @@ void callMyKernel() {
     const unsigned blockSize = 256;
     unsigned N = blockSize;
 
-    hipLaunchKernel(MyKernel, dim3(N / blockSize), dim3(blockSize), 0, 0, a, b, c, N);
+    hipLaunchKernelGGL(MyKernel, dim3(N / blockSize), dim3(blockSize), 0, 0, a, b, c, N);
 }
 
 
 template <typename T>
-__global__ void vectorADD(const hipLaunchParm lp, T __restrict__* A_d, T* B_d, T* C_d, size_t N) {
+__global__ void vectorADD(T __restrict__* A_d, T* B_d, T* C_d, size_t N) {
 //    KERNELBEGIN;
 #ifdef NOT_YET
     int a = __shfl_up(x, 1);
@@ -93,11 +93,7 @@ __global__ void vectorADD(const hipLaunchParm lp, T __restrict__* A_d, T* B_d, T
     int b = threadIdx.x;
     int c;
 
-    // TODO - move to HIP atomics when ready.
-    concurrency ::atomic_fetch_add(&c, b);
-    // Concurrency::atomic_add_unsigned (&x, a);
-
-    // concurrency ::atomic_add_ (x, a);
+    atomicAdd(&c, b);
 #endif
 
     __syncthreads();
diff --git a/tests/src/kernel/hipLaunchParm.cpp b/tests/src/kernel/hipLaunchParm.cpp
index 18c882c86c..aa9ccdf360 100644
--- a/tests/src/kernel/hipLaunchParm.cpp
+++ b/tests/src/kernel/hipLaunchParm.cpp
@@ -916,7 +916,7 @@ int main() {
     hipLaunchKernelGGL(HIP_KERNEL_NAME(vAdd), dim3(1024), 1, 0, 0, Ad);
     hipLaunchKernelGGL(HIP_KERNEL_NAME(vAdd), dim3(1024), dim3(1), 0, 0, Ad);
 
-    // Test: Passing hipLaunchKernel inside another macro:
+    // Test: Passing hipLaunchKernelGGL inside another macro:
     float e0;
     GPU_PRINT_TIME(hipLaunchKernelGGL(vAdd, dim3(1024),
                    dim3(1), 0, 0, Ad), e0, j);
@@ -924,7 +924,7 @@ int main() {
                    dim3(1), 0, 0, Ad)), e0, j);
 
 #ifdef EXTRA_PARENS_1
-    // Don't wrap hipLaunchKernel in extra set of parens:
+    // Don't wrap hipLaunchKernelGGL in extra set of parens:
     GPU_PRINT_TIME((hipLaunchKernelGGL(vAdd, dim3(1024),
                     dim3(1), 0, 0, Ad)), e0, j);
 #endif
diff --git a/tests/src/kernel/hipPrintfKernel.cpp b/tests/src/kernel/hipPrintfKernel.cpp
index fd9265c891..9b798de895 100644
--- a/tests/src/kernel/hipPrintfKernel.cpp
+++ b/tests/src/kernel/hipPrintfKernel.cpp
@@ -27,10 +27,10 @@ THE SOFTWARE.
 
 #include "test_common.h"
 
-__global__ void run_printf(hipLaunchParm lp) { printf("Hello World\n"); }
+__global__ void run_printf() { printf("Hello World\n"); }
 
 int main() {
-    hipLaunchKernel(HIP_KERNEL_NAME(run_printf), dim3(1), dim3(1), 0, 0);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(run_printf), dim3(1), dim3(1), 0, 0);
     hipDeviceSynchronize();
     passed();
 }
diff --git a/tests/src/kernel/hipTestConstant.cpp b/tests/src/kernel/hipTestConstant.cpp
index cf8b5260f6..b114f8105f 100644
--- a/tests/src/kernel/hipTestConstant.cpp
+++ b/tests/src/kernel/hipTestConstant.cpp
@@ -35,7 +35,7 @@ THE SOFTWARE.
 
 __constant__ int Value[LEN];
 
-__global__ void Get(hipLaunchParm lp, int* Ad) {
+__global__ void Get(int* Ad) {
     int tid = threadIdx.x + blockIdx.x * blockDim.x;
     Ad[tid] = Value[tid];
 }
@@ -52,7 +52,7 @@ int main() {
     HIP_ASSERT(hipMalloc((void**)&Ad, SIZE));
 
     HIP_ASSERT(hipMemcpyToSymbol(HIP_SYMBOL(Value), A, SIZE, 0, hipMemcpyHostToDevice));
-    hipLaunchKernel(Get, dim3(1, 1, 1), dim3(LEN, 1, 1), 0, 0, Ad);
+    hipLaunchKernelGGL(Get, dim3(1, 1, 1), dim3(LEN, 1, 1), 0, 0, Ad);
     HIP_ASSERT(hipMemcpy(B, Ad, SIZE, hipMemcpyDeviceToHost));
 
     for (unsigned i = 0; i < LEN; i++) {
diff --git a/tests/src/kernel/hipTestMallocKernel.cpp b/tests/src/kernel/hipTestMallocKernel.cpp
index 037f2f271f..d8084f8883 100644
--- a/tests/src/kernel/hipTestMallocKernel.cpp
+++ b/tests/src/kernel/hipTestMallocKernel.cpp
@@ -32,12 +32,12 @@ THE SOFTWARE.
 #define NUM 1024
 #define SIZE NUM * 8
 
-__global__ void Alloc(hipLaunchParm lp, uint64_t* Ptr) {
+__global__ void Alloc(uint64_t* Ptr) {
     int tid = threadIdx.x + blockIdx.x * blockDim.x;
     Ptr[tid] = (uint64_t)malloc(128);
 }
 
-__global__ void Free(hipLaunchParm lp, uint64_t* Ptr) {
+__global__ void Free(uint64_t* Ptr) {
     int tid = threadIdx.x + blockIdx.x * blockDim.x;
     free((void*)Ptr[tid]);
 }
@@ -54,10 +54,10 @@ int main() {
         HIP_ASSERT(hipSetDevice(i));
         HIP_ASSERT(hipMalloc((void**)&dPtr, SIZE));
         HIP_ASSERT(hipMemcpy(dPtr, hPtr, SIZE, hipMemcpyHostToDevice));
-        hipLaunchKernel(Alloc, dim3(1, 1, 1), dim3(NUM, 1, 1), 0, 0, dPtr);
+        hipLaunchKernelGGL(Alloc, dim3(1, 1, 1), dim3(NUM, 1, 1), 0, 0, dPtr);
         HIP_ASSERT(hipMemcpy(hPtr, dPtr, SIZE, hipMemcpyDeviceToHost));
         assert(hPtr[0] != 0);
-        hipLaunchKernel(Free, dim3(1, 1, 1), dim3(NUM, 1, 1), 0, 0, dPtr);
+        hipLaunchKernelGGL(Free, dim3(1, 1, 1), dim3(NUM, 1, 1), 0, 0, dPtr);
         HIP_ASSERT(hipFree(dPtr));
         for (uint32_t i = 1; i < NUM; i++) {
             assert(hPtr[i] == hPtr[i - 1] + 4096);
diff --git a/tests/src/kernel/hipTestMemKernel.cpp b/tests/src/kernel/hipTestMemKernel.cpp
index c3b9c4a1e4..8d928d3974 100644
--- a/tests/src/kernel/hipTestMemKernel.cpp
+++ b/tests/src/kernel/hipTestMemKernel.cpp
@@ -34,52 +34,52 @@ THE SOFTWARE.
 #define LEN11 11 * 4
 #define LEN12 12 * 4
 
-__global__ void MemCpy8(hipLaunchParm lp, uint8_t* In, uint8_t* Out) {
+__global__ void MemCpy8(uint8_t* In, uint8_t* Out) {
     int tid = threadIdx.x + blockIdx.x * blockDim.x;
     memcpy(Out + tid * 8, In + tid * 8, 8);
 }
 
-__global__ void MemCpy9(hipLaunchParm lp, uint8_t* In, uint8_t* Out) {
+__global__ void MemCpy9(uint8_t* In, uint8_t* Out) {
     int tid = threadIdx.x + blockIdx.x * blockDim.x;
     memcpy(Out + tid * 9, In + tid * 9, 9);
 }
 
-__global__ void MemCpy10(hipLaunchParm lp, uint8_t* In, uint8_t* Out) {
+__global__ void MemCpy10(uint8_t* In, uint8_t* Out) {
     int tid = threadIdx.x + blockIdx.x * blockDim.x;
     memcpy(Out + tid * 10, In + tid * 10, 10);
 }
 
-__global__ void MemCpy11(hipLaunchParm lp, uint8_t* In, uint8_t* Out) {
+__global__ void MemCpy11(uint8_t* In, uint8_t* Out) {
     int tid = threadIdx.x + blockIdx.x * blockDim.x;
     memcpy(Out + tid * 11, In + tid * 11, 11);
 }
 
-__global__ void MemCpy12(hipLaunchParm lp, uint8_t* In, uint8_t* Out) {
+__global__ void MemCpy12(uint8_t* In, uint8_t* Out) {
     int tid = threadIdx.x + blockIdx.x * blockDim.x;
     memcpy(Out + tid * 12, In + tid * 12, 12);
 }
 
-__global__ void MemSet8(hipLaunchParm lp, uint8_t* In) {
+__global__ void MemSet8(uint8_t* In) {
     int tid = threadIdx.x + blockIdx.x * blockDim.x;
     memset(In + tid * 8, 1, 8);
 }
 
-__global__ void MemSet9(hipLaunchParm lp, uint8_t* In) {
+__global__ void MemSet9(uint8_t* In) {
     int tid = threadIdx.x + blockIdx.x * blockDim.x;
     memset(In + tid * 9, 1, 9);
 }
 
-__global__ void MemSet10(hipLaunchParm lp, uint8_t* In) {
+__global__ void MemSet10(uint8_t* In) {
     int tid = threadIdx.x + blockIdx.x * blockDim.x;
     memset(In + tid * 10, 1, 10);
 }
 
-__global__ void MemSet11(hipLaunchParm lp, uint8_t* In) {
+__global__ void MemSet11(uint8_t* In) {
     int tid = threadIdx.x + blockIdx.x * blockDim.x;
     memset(In + tid * 11, 1, 11);
 }
 
-__global__ void MemSet12(hipLaunchParm lp, uint8_t* In) {
+__global__ void MemSet12(uint8_t* In) {
     int tid = threadIdx.x + blockIdx.x * blockDim.x;
     memset(In + tid * 12, 1, 12);
 }
@@ -98,8 +98,8 @@ int main() {
     hipMalloc((void**)&Bd, LEN8);
     hipMalloc((void**)&Cd, LEN8);
     hipMemcpy(Ad, A, LEN8, hipMemcpyHostToDevice);
-    hipLaunchKernel(MemCpy8, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Ad, Bd);
-    hipLaunchKernel(MemSet8, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Cd);
+    hipLaunchKernelGGL(MemCpy8, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(MemSet8, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Cd);
     hipMemcpy(B, Bd, LEN8, hipMemcpyDeviceToHost);
     hipMemcpy(C, Cd, LEN8, hipMemcpyDeviceToHost);
     for (uint32_t i = 0; i < LEN8; i++) {
@@ -126,8 +126,8 @@ int main() {
     hipMalloc((void**)&Bd, LEN9);
     hipMalloc((void**)&Cd, LEN9);
     hipMemcpy(Ad, A, LEN9, hipMemcpyHostToDevice);
-    hipLaunchKernel(MemCpy9, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Ad, Bd);
-    hipLaunchKernel(MemSet9, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Cd);
+    hipLaunchKernelGGL(MemCpy9, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(MemSet9, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Cd);
     hipMemcpy(B, Bd, LEN9, hipMemcpyDeviceToHost);
     hipMemcpy(C, Cd, LEN9, hipMemcpyDeviceToHost);
     for (uint32_t i = 0; i < LEN9; i++) {
@@ -154,8 +154,8 @@ int main() {
     hipMalloc((void**)&Bd, LEN10);
     hipMalloc((void**)&Cd, LEN10);
     hipMemcpy(Ad, A, LEN10, hipMemcpyHostToDevice);
-    hipLaunchKernel(MemCpy10, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Ad, Bd);
-    hipLaunchKernel(MemSet10, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Cd);
+    hipLaunchKernelGGL(MemCpy10, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(MemSet10, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Cd);
     hipMemcpy(B, Bd, LEN10, hipMemcpyDeviceToHost);
     hipMemcpy(C, Cd, LEN10, hipMemcpyDeviceToHost);
     for (uint32_t i = 0; i < LEN10; i++) {
@@ -182,8 +182,8 @@ int main() {
     hipMalloc((void**)&Bd, LEN11);
     hipMalloc((void**)&Cd, LEN11);
     hipMemcpy(Ad, A, LEN11, hipMemcpyHostToDevice);
-    hipLaunchKernel(MemCpy11, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Ad, Bd);
-    hipLaunchKernel(MemSet11, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Cd);
+    hipLaunchKernelGGL(MemCpy11, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(MemSet11, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Cd);
     hipMemcpy(B, Bd, LEN11, hipMemcpyDeviceToHost);
     hipMemcpy(C, Cd, LEN11, hipMemcpyDeviceToHost);
     for (uint32_t i = 0; i < LEN11; i++) {
@@ -210,8 +210,8 @@ int main() {
     hipMalloc((void**)&Bd, LEN12);
     hipMalloc((void**)&Cd, LEN12);
     hipMemcpy(Ad, A, LEN12, hipMemcpyHostToDevice);
-    hipLaunchKernel(MemCpy12, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Ad, Bd);
-    hipLaunchKernel(MemSet12, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Cd);
+    hipLaunchKernelGGL(MemCpy12, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(MemSet12, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Cd);
     hipMemcpy(B, Bd, LEN12, hipMemcpyDeviceToHost);
     hipMemcpy(C, Cd, LEN12, hipMemcpyDeviceToHost);
     for (uint32_t i = 0; i < LEN12; i++) {
diff --git a/tests/src/kernel/inline_asm_vadd.cpp b/tests/src/kernel/inline_asm_vadd.cpp
index e16560acc9..1c5a77537b 100644
--- a/tests/src/kernel/inline_asm_vadd.cpp
+++ b/tests/src/kernel/inline_asm_vadd.cpp
@@ -33,7 +33,7 @@ OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWA
 
 // Device (Kernel) function, it must be void
 // hipLaunchParm provides the execution configuration
-__global__ void vadd_asm(hipLaunchParm lp, float* out, float* in) {
+__global__ void vadd_asm(float* out, float* in) {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
 
 #ifdef __HIP_PLATFORM_NVCC__
@@ -82,7 +82,7 @@ int main() {
     hipMemcpy(gpuResultVector, VectorB, NUM * sizeof(float), hipMemcpyHostToDevice);
 
     // Lauching kernel from host
-    hipLaunchKernel(vadd_asm, dim3(NUM / THREADS_PER_BLOCK_X), dim3(THREADS_PER_BLOCK_X), 0, 0,
+    hipLaunchKernelGGL(vadd_asm, dim3(NUM / THREADS_PER_BLOCK_X), dim3(THREADS_PER_BLOCK_X), 0, 0,
                     gpuResultVector, gpuVector);
 
     // Memory transfer from device to host
diff --git a/tests/src/runtimeApi/device/hipDeviceSynchronize.cpp b/tests/src/runtimeApi/device/hipDeviceSynchronize.cpp
index 3de04a2213..30866da654 100644
--- a/tests/src/runtimeApi/device/hipDeviceSynchronize.cpp
+++ b/tests/src/runtimeApi/device/hipDeviceSynchronize.cpp
@@ -33,7 +33,7 @@ THE SOFTWARE.
 #define _SIZE sizeof(int) * 1024 * 1024
 #define NUM_STREAMS 2
 
-__global__ void Iter(hipLaunchParm lp, int* Ad, int num) {
+__global__ void Iter(int* Ad, int num) {
     int tx = threadIdx.x + blockIdx.x * blockDim.x;
     // Kernel loop designed to execute very slowly... ... ...   so we can test timing-related
     // behavior below
@@ -58,7 +58,7 @@ int main() {
         HIPCHECK(hipMemcpyAsync(Ad[i], A[i], _SIZE, hipMemcpyHostToDevice, stream[i]));
     }
     for (int i = 0; i < NUM_STREAMS; i++) {
-        hipLaunchKernel(HIP_KERNEL_NAME(Iter), dim3(1), dim3(1), 0, stream[i], Ad[i], 1 << 30);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(Iter), dim3(1), dim3(1), 0, stream[i], Ad[i], 1 << 30);
     }
     for (int i = 0; i < NUM_STREAMS; i++) {
         HIPCHECK(hipMemcpyAsync(A[i], Ad[i], _SIZE, hipMemcpyDeviceToHost, stream[i]));
diff --git a/tests/src/runtimeApi/event/hipEventRecord.cpp b/tests/src/runtimeApi/event/hipEventRecord.cpp
index a9e1ef504b..bceb770fe0 100644
--- a/tests/src/runtimeApi/event/hipEventRecord.cpp
+++ b/tests/src/runtimeApi/event/hipEventRecord.cpp
@@ -66,7 +66,7 @@ int main(int argc, char* argv[]) {
         // Record the start event
         HIPCHECK(hipEventRecord(start, NULL));
 
-        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                         static_cast<const float*>(A_d), static_cast<const float*>(B_d), C_d, N);
 
 
diff --git a/tests/src/runtimeApi/memory/hipArray.cpp b/tests/src/runtimeApi/memory/hipArray.cpp
index d781ccc0e1..0c211eecfc 100644
--- a/tests/src/runtimeApi/memory/hipArray.cpp
+++ b/tests/src/runtimeApi/memory/hipArray.cpp
@@ -67,7 +67,7 @@ void memcpy2Dtest(size_t numW, size_t numH, bool usePinnedHost) {
     HIPCHECK(hipMemcpy2D(A_d, pitch_A, A_h, width, width, numH, hipMemcpyHostToDevice));
     HIPCHECK(hipMemcpy2D(B_d, pitch_B, B_h, width, width, numH, hipMemcpyHostToDevice));
 
-    hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d,
+    hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d,
                     (pitch_C / sizeof(T)) * numH);
 
     HIPCHECK(hipMemcpy2D(C_h, width, C_d, pitch_C, width, numH, hipMemcpyDeviceToHost));
@@ -117,7 +117,7 @@ void memcpyArraytest(size_t numW, size_t numH, bool usePinnedHost, bool usePitch
         HIPCHECK(hipMemcpyToArray(A_d, 0, 0, (void*)A_h, width, hipMemcpyHostToDevice));
         HIPCHECK(hipMemcpyToArray(B_d, 0, 0, (void*)B_h, width, hipMemcpyHostToDevice));
 
-        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                         (T*)A_d->data, (T*)B_d->data, (T*)C_d->data, numW);
 
         HIPCHECK(hipMemcpy(C_h, C_d->data, width, hipMemcpyDeviceToHost));
@@ -156,7 +156,7 @@ void memcpyArraytest(size_t numW, size_t numH, bool usePinnedHost, bool usePitch
                                         hipMemcpyHostToDevice));
         }
 
-        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                         (T*)A_d->data, (T*)B_d->data, (T*)C_d->data, numW * numH);
 
         HIPCHECK(hipMemcpy2D((void*)C_h, width, (void*)C_d->data, width, width, numH,
diff --git a/tests/src/runtimeApi/memory/hipHostGetFlags.cpp b/tests/src/runtimeApi/memory/hipHostGetFlags.cpp
index 056ba80ebc..cc93273f8a 100644
--- a/tests/src/runtimeApi/memory/hipHostGetFlags.cpp
+++ b/tests/src/runtimeApi/memory/hipHostGetFlags.cpp
@@ -32,7 +32,7 @@ THE SOFTWARE.
 #define LEN 1024 * 1024
 #define SIZE LEN * sizeof(float)
 
-__global__ void Add(hipLaunchParm lp, float* Ad, float* Bd, float* Cd) {
+__global__ void Add(float* Ad, float* Bd, float* Cd) {
     int tx = threadIdx.x + blockIdx.x * blockDim.x;
     Cd[tx] = Ad[tx] + Bd[tx];
 }
@@ -74,7 +74,7 @@ int main() {
     dim3 dimGrid(LEN / 512, 1, 1);
     dim3 dimBlock(512, 1, 1);
 
-    hipLaunchKernel(HIP_KERNEL_NAME(Add), dimGrid, dimBlock, 0, 0, Ad, Bd, Cd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Add), dimGrid, dimBlock, 0, 0, Ad, Bd, Cd);
 
     HIPCHECK(
         hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost));  // Note this really HostToHost not
diff --git a/tests/src/runtimeApi/memory/hipHostRegister.cpp b/tests/src/runtimeApi/memory/hipHostRegister.cpp
index 2b832a8bc1..ff6ea963e9 100644
--- a/tests/src/runtimeApi/memory/hipHostRegister.cpp
+++ b/tests/src/runtimeApi/memory/hipHostRegister.cpp
@@ -28,7 +28,7 @@ THE SOFTWARE.
 #include "test_common.h"
 #include <malloc.h>
 
-__global__ void Inc(hipLaunchParm lp, float* Ad) {
+__global__ void Inc(float* Ad) {
     int tx = threadIdx.x + blockIdx.x * blockDim.x;
     Ad[tx] = Ad[tx] + float(1);
 }
@@ -99,7 +99,7 @@ int main(int argc, char* argv[]) {
         // Reference the registered device pointer Ad from inside the kernel:
         for (int i = 0; i < num_devices; i++) {
             HIPCHECK(hipSetDevice(i));
-            hipLaunchKernel(Inc, dim3(N / 512), dim3(512), 0, 0, Ad[i]);
+            hipLaunchKernelGGL(Inc, dim3(N / 512), dim3(512), 0, 0, Ad[i]);
 
             HIPCHECK(hipDeviceSynchronize());
         }
diff --git a/tests/src/runtimeApi/memory/hipMemcpy.cpp b/tests/src/runtimeApi/memory/hipMemcpy.cpp
index 6e7a604d9a..d7acc9c177 100644
--- a/tests/src/runtimeApi/memory/hipMemcpy.cpp
+++ b/tests/src/runtimeApi/memory/hipMemcpy.cpp
@@ -230,7 +230,7 @@ void memcpytest2(DeviceMemory<T>* dmem, HostMemory<T>* hmem, size_t numElements,
                            useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
     }
 
-    hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+    hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                     static_cast<const T*>(dmem->A_d()), static_cast<const T*>(dmem->B_d()),
                     dmem->C_d(), numElements);
 
diff --git a/tests/src/runtimeApi/memory/hipMemcpyDtoD.cpp b/tests/src/runtimeApi/memory/hipMemcpyDtoD.cpp
index 3124402047..f546ae3ffd 100644
--- a/tests/src/runtimeApi/memory/hipMemcpyDtoD.cpp
+++ b/tests/src/runtimeApi/memory/hipMemcpyDtoD.cpp
@@ -51,7 +51,7 @@ int main() {
         HIPCHECK(hipSetDevice(0));
         HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
         HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
-        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                         static_cast<const int*>(A_d), static_cast<const int*>(B_d), C_d, N);
         HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
         HIPCHECK(hipDeviceSynchronize());
@@ -62,7 +62,7 @@ int main() {
         HIPCHECK(hipMemcpyDtoD((hipDeviceptr_t)X_d, (hipDeviceptr_t)A_d, Nbytes));
         HIPCHECK(hipMemcpyDtoD((hipDeviceptr_t)Y_d, (hipDeviceptr_t)B_d, Nbytes));
 
-        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                         static_cast<const int*>(X_d), static_cast<const int*>(Y_d), Z_d, N);
         HIPCHECK(hipMemcpyDtoH(C_h, (hipDeviceptr_t)Z_d, Nbytes));
         HIPCHECK(hipDeviceSynchronize());
diff --git a/tests/src/runtimeApi/memory/hipMemcpyDtoDAsync.cpp b/tests/src/runtimeApi/memory/hipMemcpyDtoDAsync.cpp
index 741f773617..2d1a591e73 100644
--- a/tests/src/runtimeApi/memory/hipMemcpyDtoDAsync.cpp
+++ b/tests/src/runtimeApi/memory/hipMemcpyDtoDAsync.cpp
@@ -52,7 +52,7 @@ int main() {
         HIPCHECK(hipSetDevice(0));
         HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
         HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
-        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                         static_cast<const int*>(A_d), static_cast<const int*>(B_d), C_d, N);
         HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
         HIPCHECK(hipDeviceSynchronize());
@@ -63,7 +63,7 @@ int main() {
         HIPCHECK(hipMemcpyDtoDAsync((hipDeviceptr_t)X_d, (hipDeviceptr_t)A_d, Nbytes, s));
         HIPCHECK(hipMemcpyDtoDAsync((hipDeviceptr_t)Y_d, (hipDeviceptr_t)B_d, Nbytes, s));
 
-        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                         static_cast<const int*>(X_d), static_cast<const int*>(Y_d), Z_d, N);
         HIPCHECK(hipMemcpyDtoHAsync(C_h, (hipDeviceptr_t)Z_d, Nbytes, s));
         HIPCHECK(hipStreamSynchronize(s));
diff --git a/tests/src/runtimeApi/memory/hipMemcpyPeer.cpp b/tests/src/runtimeApi/memory/hipMemcpyPeer.cpp
index fc1349a459..ec7f0754d7 100644
--- a/tests/src/runtimeApi/memory/hipMemcpyPeer.cpp
+++ b/tests/src/runtimeApi/memory/hipMemcpyPeer.cpp
@@ -50,7 +50,7 @@ int main() {
         HIPCHECK(hipSetDevice(0));
         HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
         HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
-        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                         static_cast<const int*>(A_d), static_cast<const int*>(B_d), C_d, N);
         HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
         HIPCHECK(hipDeviceSynchronize());
@@ -62,7 +62,7 @@ int main() {
             Nbytes);  // this call is eqv to hipMemcpy(hipMemcpyD2D) which goes via stg bufs.
         hipMemcpyPeer(Y_d, 1, B_d, 0, Nbytes);
 
-        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                         static_cast<const int*>(X_d), static_cast<const int*>(Y_d), Z_d, N);
         HIPCHECK(hipMemcpy(C_h, Z_d, Nbytes, hipMemcpyDeviceToHost));
         HIPCHECK(hipDeviceSynchronize());
diff --git a/tests/src/runtimeApi/memory/hipMemcpyPeerAsync.cpp b/tests/src/runtimeApi/memory/hipMemcpyPeerAsync.cpp
index 850c3332c2..57ec2038f5 100644
--- a/tests/src/runtimeApi/memory/hipMemcpyPeerAsync.cpp
+++ b/tests/src/runtimeApi/memory/hipMemcpyPeerAsync.cpp
@@ -54,7 +54,7 @@ int main() {
         HIPCHECK(hipSetDevice(0));
         HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
         HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
-        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                         static_cast<const int*>(A_d), static_cast<const int*>(B_d), C_d, N);
         HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
         HIPCHECK(hipDeviceSynchronize());
@@ -65,7 +65,7 @@ int main() {
         HIPCHECK(hipMemcpyPeerAsync(X_d, 1, A_d, 0, Nbytes, s));
         HIPCHECK(hipMemcpyPeerAsync(Y_d, 1, B_d, 0, Nbytes, s));
 
-        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                         static_cast<const int*>(X_d), static_cast<const int*>(Y_d), Z_d, N);
         HIPCHECK(hipMemcpy(C_h, Z_d, Nbytes, hipMemcpyDeviceToHost));
         HIPCHECK(hipDeviceSynchronize());
diff --git a/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp b/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp
index 4b5fc33608..265a5ced22 100644
--- a/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp
+++ b/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp
@@ -61,7 +61,7 @@ void simpleTest1() {
     HIPCHECK(memcopy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
     HIPCHECK(memcopy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
 
-    hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+    hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                     static_cast<const int*>(A_d), static_cast<const int*>(B_d), C_d, N);
 
     HIPCHECK(memcopy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
diff --git a/tests/src/runtimeApi/memory/hipMemoryAllocateCoherent.cpp b/tests/src/runtimeApi/memory/hipMemoryAllocateCoherent.cpp
index 2aea01cd2a..6f9c583091 100644
--- a/tests/src/runtimeApi/memory/hipMemoryAllocateCoherent.cpp
+++ b/tests/src/runtimeApi/memory/hipMemoryAllocateCoherent.cpp
@@ -29,7 +29,7 @@ THE SOFTWARE.
 #include <cstdio>
 #include "hip/hip_runtime.h"
 
-__global__ void Kernel(hipLaunchParm lp, volatile float* hostRes) {
+__global__ void Kernel(volatile float* hostRes) {
     int tid = threadIdx.x + blockIdx.x * blockDim.x;
     hostRes[tid] = tid + 1;
     __threadfence_system();
@@ -45,7 +45,7 @@ int main() {
     hipHostMalloc((void**)&hostRes, blocks * sizeof(float), hipHostMallocMapped);
     hostRes[0] = 0;
     hostRes[1] = 0;
-    hipLaunchKernel(HIP_KERNEL_NAME(Kernel), dim3(1), dim3(blocks), 0, 0, hostRes);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Kernel), dim3(1), dim3(blocks), 0, 0, hostRes);
     int eleCounter = 0;
     while (eleCounter < blocks) {
         // blocks until the value changes
diff --git a/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp b/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp
index b6a15da1a5..418a411613 100644
--- a/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp
+++ b/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp
@@ -82,9 +82,9 @@ void simpleVectorAdd(size_t numElements, int iters, hipStream_t stream) {
         // HIPCHECK(hipStreamSynchronize(stream));
 
         // This is the null stream?
-        // hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d,
+        // hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d,
         // C_d, numElements);
-        hipLaunchKernel(HipTest::vectorADDReverse, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADDReverse, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                         static_cast<const T*>(A_d), static_cast<const T*>(B_d), C_d, numElements);
 
         MemTraits<C>::Copy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, stream);
diff --git a/tests/src/runtimeApi/multiThread/hipMultiThreadStreams2.cpp b/tests/src/runtimeApi/multiThread/hipMultiThreadStreams2.cpp
index bcd4bb78ff..7fadc60a05 100644
--- a/tests/src/runtimeApi/multiThread/hipMultiThreadStreams2.cpp
+++ b/tests/src/runtimeApi/multiThread/hipMultiThreadStreams2.cpp
@@ -33,7 +33,7 @@ THE SOFTWARE.
 
 
 template <typename T>
-__global__ void Inc(hipLaunchParm lp, T* Array) {
+__global__ void Inc(T* Array) {
     int tx = threadIdx.x + blockIdx.x * blockDim.x;
     Array[tx] = Array[tx] + T(1);
 }
@@ -53,7 +53,7 @@ void run1(size_t size, hipStream_t stream) {
 
     HIPCHECK(hipMemcpyAsync(Bh, Ah, size, hipMemcpyHostToHost, stream));
     HIPCHECK(hipMemcpyAsync(Cd, Bh, size, hipMemcpyHostToDevice, stream));
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 500), dim3(500), 0, stream, Cd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 500), dim3(500), 0, stream, Cd);
     HIPCHECK(hipMemcpyAsync(Dd, Cd, size, hipMemcpyDeviceToDevice, stream));
     HIPCHECK(hipMemcpyAsync(Eh, Dd, size, hipMemcpyDeviceToHost, stream));
     HIPCHECK(hipDeviceSynchronize());
@@ -80,8 +80,8 @@ void run(size_t size, hipStream_t stream1, hipStream_t stream2) {
     HIPCHECK(hipMemcpyAsync(Bhh, Ahh, size, hipMemcpyHostToHost, stream2));
     HIPCHECK(hipMemcpyAsync(Cd, Bh, size, hipMemcpyHostToDevice, stream1));
     HIPCHECK(hipMemcpyAsync(Cdd, Bhh, size, hipMemcpyHostToDevice, stream2));
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 500), dim3(500), 0, stream1, Cd);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 500), dim3(500), 0, stream2, Cdd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 500), dim3(500), 0, stream1, Cd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 500), dim3(500), 0, stream2, Cdd);
     HIPCHECK(hipMemcpyAsync(Dd, Cd, size, hipMemcpyDeviceToDevice, stream1));
     HIPCHECK(hipMemcpyAsync(Ddd, Cdd, size, hipMemcpyDeviceToDevice, stream2));
     HIPCHECK(hipMemcpyAsync(Eh, Dd, size, hipMemcpyDeviceToHost, stream1));
diff --git a/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp b/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp
index 729a596338..5d3a933a21 100644
--- a/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp
+++ b/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp
@@ -28,7 +28,7 @@ THE SOFTWARE.
 
 const int NN = 1 << 21;
 
-__global__ void kernel(hipLaunchParm lp, float* x, float* y, int n) {
+__global__ void kernel(float* x, float* y, int n) {
     int tid = threadIdx.x;
     if (tid < 1) {
         for (int i = 0; i < n; i++) {
@@ -38,7 +38,7 @@ __global__ void kernel(hipLaunchParm lp, float* x, float* y, int n) {
     }
 }
 
-__global__ void nKernel(hipLaunchParm lp, float* y) {
+__global__ void nKernel(float* y) {
     int tid = threadIdx.x;
     y[tid] = y[tid] + 1.0f;
 }
@@ -55,8 +55,8 @@ int main() {
     for (int i = 0; i < num_streams; i++) {
         HIPCHECK(hipStreamCreate(&streams[i]));
         HIPCHECK(hipMalloc(&data[i], NN * sizeof(float)));
-        hipLaunchKernel(HIP_KERNEL_NAME(kernel), dim3(1), dim3(1), 0, streams[i], data[i], xd, N);
-        hipLaunchKernel(HIP_KERNEL_NAME(nKernel), dim3(1), dim3(1), 0, 0, yd);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(1), dim3(1), 0, streams[i], data[i], xd, N);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(nKernel), dim3(1), dim3(1), 0, 0, yd);
     }
 
     HIPCHECK(hipMemcpy(&x, xd, sizeof(float), hipMemcpyDeviceToHost));
diff --git a/tests/src/runtimeApi/stream/hipAPIStreamEnable.cpp b/tests/src/runtimeApi/stream/hipAPIStreamEnable.cpp
index a06542e13f..b20d95ee3f 100644
--- a/tests/src/runtimeApi/stream/hipAPIStreamEnable.cpp
+++ b/tests/src/runtimeApi/stream/hipAPIStreamEnable.cpp
@@ -30,7 +30,7 @@ THE SOFTWARE.
 
 const int NN = 1 << 21;
 
-__global__ void kernel(hipLaunchParm lp, float* x, float* y, int n) {
+__global__ void kernel(float* x, float* y, int n) {
     int tid = threadIdx.x;
     if (tid < 1) {
         for (int i = 0; i < n; i++) {
@@ -40,7 +40,7 @@ __global__ void kernel(hipLaunchParm lp, float* x, float* y, int n) {
     }
 }
 
-__global__ void nKernel(hipLaunchParm lp, float* y) {
+__global__ void nKernel(float* y) {
     int tid = threadIdx.x;
     y[tid] = y[tid] + 1.0f;
 }
@@ -57,8 +57,8 @@ int main() {
     for (int i = 0; i < num_streams; i++) {
         HIPCHECK(hipStreamCreate(&streams[i]));
         HIPCHECK(hipMalloc(&data[i], NN * sizeof(float)));
-        hipLaunchKernel(HIP_KERNEL_NAME(kernel), dim3(1), dim3(1), 0, streams[i], data[i], xd, N);
-        hipLaunchKernel(HIP_KERNEL_NAME(nKernel), dim3(1), dim3(1), 0, 0, yd);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(1), dim3(1), 0, streams[i], data[i], xd, N);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(nKernel), dim3(1), dim3(1), 0, 0, yd);
     }
 
     HIPCHECK(hipMemcpy(&x, xd, sizeof(float), hipMemcpyDeviceToHost));
diff --git a/tests/src/runtimeApi/stream/hipNullStream.cpp b/tests/src/runtimeApi/stream/hipNullStream.cpp
index 9e6b7d3ff2..ad16455615 100644
--- a/tests/src/runtimeApi/stream/hipNullStream.cpp
+++ b/tests/src/runtimeApi/stream/hipNullStream.cpp
@@ -33,7 +33,7 @@ int p_db = 0;
 
 
 template <typename T>
-__global__ void vectorADDRepeat(hipLaunchParm lp, const T* A_d, const T* B_d, T* C_d, size_t NELEM,
+__global__ void vectorADDRepeat(const T* A_d, const T* B_d, T* C_d, size_t NELEM,
                                 int repeat) {
     size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
     size_t stride = blockDim.x * gridDim.x;
@@ -113,7 +113,7 @@ void Streamer<T>::enqueAsync() {
     printf("testing: %s  numElements=%zu size=%6.2fMB\n", __func__, _numElements,
            _numElements * sizeof(T) / 1024.0 / 1024.0);
     unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements);
-    hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, _stream,
+    hipLaunchKernelGGL(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, _stream,
                     static_cast<const T*>(_A_d), static_cast<const T*>(_B_d), _C_d, _numElements,
                     p_repeat);
 }
@@ -206,7 +206,7 @@ int main(int argc, char* argv[]) {
 
             // Dispatch to NULL stream, should wait for prior async activity to complete before
             // beginning:
-            hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0,
+            hipLaunchKernelGGL(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0,
                             0 /*nullstream*/, static_cast<const int*>(lastStreamer->_C_d),
                             static_cast<const int*>(lastStreamer->_C_d), nullStreamer->_C_d,
                             numElements, 1 /*repeat*/);
@@ -242,7 +242,7 @@ int main(int argc, char* argv[]) {
 
             // Dispatch to NULL stream, should wait for prior async activity to complete before
             // beginning:
-            hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0,
+            hipLaunchKernelGGL(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0,
                             0 /*nullstream*/, static_cast<const int*>(lastStreamer->_C_d),
                             static_cast<const int*>(lastStreamer->_C_d), nullStreamer->_C_d,
                             numElements, 1 /*repeat*/);
diff --git a/tests/src/runtimeApi/stream/hipStream.h b/tests/src/runtimeApi/stream/hipStream.h
index 23cbf40d7b..a4fd913891 100644
--- a/tests/src/runtimeApi/stream/hipStream.h
+++ b/tests/src/runtimeApi/stream/hipStream.h
@@ -72,7 +72,7 @@ void D2H(T* Dst, T* Src, size_t size) {
 }
 
 template <typename T>
-__global__ void Inc(hipLaunchParm lp, T* In) {
+__global__ void Inc(T* In) {
     int tx = threadIdx.x + blockIdx.x * blockDim.x;
     In[tx] = In[tx] + 1;
 }
diff --git a/tests/src/runtimeApi/stream/hipStreamL5.cpp b/tests/src/runtimeApi/stream/hipStreamL5.cpp
index f373461079..31f8d128c0 100644
--- a/tests/src/runtimeApi/stream/hipStreamL5.cpp
+++ b/tests/src/runtimeApi/stream/hipStreamL5.cpp
@@ -76,7 +76,7 @@ void test12345() {
 
     H2HAsync(Bh, Ah, size, stream);
     H2DAsync(Ad, Bh, size, stream);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Ad);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Ad);
     D2DAsync(Bd, Ad, size, stream);
     D2HAsync(Ch, Bd, size, stream);
     HIPCHECK(hipDeviceSynchronize());
@@ -111,7 +111,7 @@ void test13452() {
     H2D(Ad, Dh, size);
 
     H2HAsync(Bh, Ah, size, stream);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Ad);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Ad);
     D2DAsync(Bd, Ad, size, stream);
     D2HAsync(Ch, Bd, size, stream);
     H2DAsync(Cd, Ch, size, stream);
@@ -152,7 +152,7 @@ void test14523() {
     D2DAsync(Bd, Ad, size, stream);
     D2HAsync(Ch, Bd, size, stream);
     H2DAsync(Cd, Ch, size, stream);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Cd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Cd);
 
     HIPCHECK(hipDeviceSynchronize());
 
@@ -190,7 +190,7 @@ void test15234() {
     H2HAsync(Bh, Ah, size, stream);
     D2HAsync(Ch, Ad, size, stream);
     H2DAsync(Bd, Ch, size, stream);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Bd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Bd);
     D2DAsync(Cd, Bd, size, stream);
 
     D2H(Eh, Cd, size);
@@ -217,7 +217,7 @@ void test23451() {
     setArray(Ah, N, T(1));
 
     H2DAsync(Ad, Ah, size, stream);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Ad);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Ad);
     D2DAsync(Bd, Ad, size, stream);
     D2HAsync(Bh, Bd, size, stream);
     H2HAsync(Ch, Bh, size, stream);
@@ -254,7 +254,7 @@ void test24513() {
     D2DAsync(Bd, Ad, size, stream);
     D2HAsync(Bh, Bd, size, stream);
     H2HAsync(Ch, Bh, size, stream);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Cd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Cd);
     HIPCHECK(hipDeviceSynchronize());
 
     D2H(Eh, Cd, size);
@@ -291,7 +291,7 @@ void test25134() {
     H2DAsync(Ad, Ah, size, stream);
     D2HAsync(Bh, Ad, size, stream);
     H2HAsync(Ch, Bh, size, stream);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Bd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Bd);
     D2DAsync(Cd, Bd, size, stream);
 
     D2H(Eh, Cd, size);
@@ -324,7 +324,7 @@ void test21345() {
 
     H2DAsync(Ad, Ah, size, stream);
     H2HAsync(Ch, Bh, size, stream);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Ad);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Ad);
     D2DAsync(Bd, Ad, size, stream);
     D2HAsync(Dh, Bd, size, stream);
 
@@ -358,7 +358,7 @@ void test34512() {
 
     H2D(Ad, Ah, size);
 
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Ad);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Ad);
     D2DAsync(Bd, Ad, size, stream);
     D2HAsync(Bh, Bd, size, stream);
     H2HAsync(Ch, Bh, size, stream);
@@ -393,7 +393,7 @@ void test35124() {
 
     H2D(Ad, Dh, size);
 
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Ad);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Ad);
     D2HAsync(Ah, Ad, size, stream);
     H2HAsync(Bh, Ah, size, stream);
     H2DAsync(Bd, Bh, size, stream);
@@ -430,7 +430,7 @@ void test31245() {
 
     H2D(Ad, Dh, size);
 
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Ad);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Ad);
     H2HAsync(Bh, Ah, size, stream);
     H2DAsync(Bd, Bh, size, stream);
     D2DAsync(Cd, Bd, size, stream);
@@ -469,7 +469,7 @@ void test32451() {
     setArray(Eh, N, T(2));
 
     H2D(Ad, Eh, size);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Ad);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Ad);
     H2DAsync(Bd, Ah, size, stream);
     D2DAsync(Cd, Bd, size, stream);
     D2HAsync(Bh, Cd, size, stream);
@@ -507,7 +507,7 @@ void test45123() {
     D2HAsync(Ah, Bd, size, stream);
     H2HAsync(Bh, Ah, size, stream);
     H2DAsync(Cd, Bh, size, stream);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Cd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Cd);
     D2H(Ch, Cd, size);
     HIPCHECK(hipDeviceSynchronize());
 
@@ -539,7 +539,7 @@ void test41235() {
     D2DAsync(Bd, Ad, size, stream);
     D2HAsync(Ah, Bd, size, stream);
     H2DAsync(Cd, Ah, size, stream);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Cd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Cd);
     D2HAsync(Bh, Cd, size, stream);
 
     HIPCHECK(hipDeviceSynchronize());
@@ -574,7 +574,7 @@ void test42351() {
 
     D2DAsync(Bd, Ad, size, stream);
     H2DAsync(Cd, Ah, size, stream);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Cd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Cd);
     D2HAsync(Bh, Cd, size, stream);
     H2HAsync(Ch, Bh, size, stream);
 
@@ -609,7 +609,7 @@ void test43512() {
     H2D(Ad, Dh, size);
 
     D2DAsync(Bd, Ad, size, stream);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Bd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Bd);
     D2HAsync(Ah, Bd, size, stream);
     H2HAsync(Bh, Ah, size, stream);
     H2DAsync(Cd, Bh, size, stream);
@@ -645,7 +645,7 @@ void test51234() {
     D2HAsync(Ah, Ad, size, stream);
     H2HAsync(Bh, Ah, size, stream);
     H2DAsync(Bd, Bh, size, stream);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Bd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Bd);
     D2DAsync(Cd, Bd, size, stream);
 
     D2H(Ch, Cd, size);
@@ -681,7 +681,7 @@ void test52341() {
 
     D2HAsync(Ah, Ad, size, stream);
     H2DAsync(Bd, Ah, size, stream);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Bd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Bd);
     D2DAsync(Cd, Bd, size, stream);
     H2HAsync(Ch, Bh, size, stream);
 
@@ -723,7 +723,7 @@ void test53412() {
     H2D(Bd, Eh, size);
 
     D2HAsync(Ah, Ad, size, stream);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Bd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Bd);
     D2DAsync(Cd, Bd, size, stream);
     H2HAsync(Ch, Bh, size, stream);
     H2DAsync(Dd, Ch, size, stream);
@@ -770,7 +770,7 @@ void test54123() {
     D2DAsync(Cd, Bd, size, stream);
     H2HAsync(Ch, Bh, size, stream);
     H2DAsync(Dd, Ch, size, stream);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Dd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Dd);
 
     D2H(Fh, Cd, size);
     D2H(Gh, Dd, size);
diff --git a/tests/src/test_common.h b/tests/src/test_common.h
index b7b8a3c94e..40f6fffd8e 100644
--- a/tests/src/test_common.h
+++ b/tests/src/test_common.h
@@ -121,7 +121,7 @@ unsigned setNumBlocks(unsigned blocksPerCU, unsigned threadsPerBlock, size_t N);
 
 
 template <typename T>
-__global__ void vectorADD(hipLaunchParm lp, const T* A_d, const T* B_d, T* C_d, size_t NELEM) {
+__global__ void vectorADD(const T* A_d, const T* B_d, T* C_d, size_t NELEM) {
     size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
     size_t stride = blockDim.x * gridDim.x;
 
@@ -132,7 +132,7 @@ __global__ void vectorADD(hipLaunchParm lp, const T* A_d, const T* B_d, T* C_d,
 
 
 template <typename T>
-__global__ void vectorADDReverse(hipLaunchParm lp, const T* A_d, const T* B_d, T* C_d,
+__global__ void vectorADDReverse(const T* A_d, const T* B_d, T* C_d,
                                  size_t NELEM) {
     size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
     size_t stride = blockDim.x * gridDim.x;