Replace hipLaunchKernel -> hipLaunchKernelGGL

Change-Id: I4d99009e1199811d417becf1e1b934ec4d4e30be
2018-10-17 12:01:44 +05:30
commit e1fe095471
@@ -38,7 +38,7 @@ THE SOFTWARE.
        }                                                                                          \
    }

-__global__ void bit_extract_kernel(hipLaunchParm lp, uint32_t* C_d, const uint32_t* A_d, size_t N) {
+__global__ void bit_extract_kernel(uint32_t* C_d, const uint32_t* A_d, size_t N) {
    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
    size_t stride = hipBlockDim_x * hipGridDim_x;

@@ -85,7 +85,7 @@ int main(int argc, char* argv[]) {
    printf("info: launch 'bit_extract_kernel' \n");
    const unsigned blocks = 512;
    const unsigned threadsPerBlock = 256;
-    hipLaunchKernel(bit_extract_kernel, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);
+    hipLaunchKernelGGL(bit_extract_kernel, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);

    printf("info: copy Device2Host\n");
    CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
@@ -22,7 +22,7 @@ THE SOFTWARE.

 #include "hip/hip_runtime.h"

-__global__ void vadd_hip(hipLaunchParm lp, const float* a, const float* b, float* c, int N) {
+__global__ void vadd_hip(const float* a, const float* b, float* c, int N) {
    int idx = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);

    if (idx < N) {
@@ -60,7 +60,7 @@ int main(int argc, char* argv[]) {
    // Launch kernel onto default accelerator
    int blockSize = 256;                                      // pick arbitrary block size
    int blocks = (sizeElements + blockSize - 1) / blockSize;  // round up to launch enough blocks
-    hipLaunchKernel(vadd_hip, dim3(blocks), dim3(blockSize), 0, 0, A_d, B_d, C_d, sizeElements);
+    hipLaunchKernelGGL(vadd_hip, dim3(blocks), dim3(blockSize), 0, 0, A_d, B_d, C_d, sizeElements);

    // D2H Copy
    hipMemcpy(C_h, C_d, sizeBytes, hipMemcpyDeviceToHost);
@@ -37,7 +37,7 @@ THE SOFTWARE.
 * Square each element in the array A and write to array C.
 */
 template <typename T>
-__global__ void vector_square(hipLaunchParm lp, T* C_d, const T* A_d, size_t N) {
+__global__ void vector_square(T* C_d, const T* A_d, size_t N) {
    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
    size_t stride = hipBlockDim_x * hipGridDim_x;

@@ -81,7 +81,7 @@ int main(int argc, char* argv[]) {
    const unsigned threadsPerBlock = 256;

    printf("info: launch 'vector_square' kernel\n");
-    hipLaunchKernel(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);
+    hipLaunchKernelGGL(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);

    printf("info: copy Device2Host\n");
    CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
@@ -434,7 +434,7 @@ class KernelCommand : public Command {

        switch (_kind) {
            case Null:
-                hipLaunchKernel(NullKernel, dim3(gridX / groupX), dim3(gridX), 0, _stream, nullptr);
+                hipLaunchKernelGGL(NullKernel, dim3(gridX / groupX), dim3(gridX), 0, _stream, nullptr);
                break;
            case VectorAdd:
                assert(0);  // TODO
@@ -1,6 +1,6 @@
 #include "hip/hip_runtime.h"

-extern "C" __global__ void NullKernel(hipLaunchParm lp, float* Ad) {
+extern "C" __global__ void NullKernel(float* Ad) {
    if (Ad) {
        Ad[0] = 42;
    }
@@ -3,7 +3,7 @@
 static const int BLOCKSIZEX = 32;
 static const int BLOCKSIZEY = 16;

-__global__ void fails(hipLaunchParm lp, float* pErrorI) {
+__global__ void fails(float* pErrorI) {
    if (pErrorI != 0) {
        pErrorI[0] = 1;
    }
@@ -14,5 +14,5 @@ int main() {
    dim3 threads(BLOCKSIZEX, BLOCKSIZEY);
    float error;

-    hipLaunchKernel(HIP_KERNEL_NAME(fails), blocks, threads, 0, 0, &error);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(fails), blocks, threads, 0, 0, &error);
 }
@@ -48,7 +48,7 @@ const unsigned p_tests = 0xfffffff;


 // HCC optimizes away fully NULL kernel calls, so run one that is nearly null:
-__global__ void NearlyNull(hipLaunchParm lp, float* Ad) {
+__global__ void NearlyNull(float* Ad) {
    if (Ad) {
        Ad[0] = 42;
    }
@@ -94,14 +94,14 @@ int main() {

    if (p_tests & 0x1) {
        hipEventRecord(start);
-        hipLaunchKernel(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
+        hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
        stopTest(start, stop, "FirstKernelLaunch", 1);
    }


    if (p_tests & 0x2) {
        hipEventRecord(start);
-        hipLaunchKernel(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
+        hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
        stopTest(start, stop, "SecondKernelLaunch", 1);
    }

@@ -110,7 +110,7 @@ int main() {
        for (int t = 0; t < TEST_ITERS; t++) {
            hipEventRecord(start);
            for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
-                hipLaunchKernel(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
+                hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
                hipEventRecord(sync);
                hipEventSynchronize(sync);
            }
@@ -123,7 +123,7 @@ int main() {
        for (int t = 0; t < TEST_ITERS; t++) {
            hipEventRecord(start);
            for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
-                hipLaunchKernel(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream, Ad);
+                hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream, Ad);
                hipEventRecord(sync);
                hipEventSynchronize(sync);
            }
@@ -137,7 +137,7 @@ int main() {
        for (int t = 0; t < TEST_ITERS; t++) {
            hipEventRecord(start);
            for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
-                hipLaunchKernel(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
+                hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream0, Ad);
            }
            stopTest(start, stop, "NullStreamASyncDispatchNoWait", DISPATCHES_PER_TEST);
        }
@@ -147,7 +147,7 @@ int main() {
        for (int t = 0; t < TEST_ITERS; t++) {
            hipEventRecord(start);
            for (int i = 0; i < DISPATCHES_PER_TEST; i++) {
-                hipLaunchKernel(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream, Ad);
+                hipLaunchKernelGGL(NearlyNull, dim3(NUM_GROUPS), dim3(GROUP_SIZE), 0, stream, Ad);
            }
            stopTest(start, stop, "StreamASyncDispatchNoWait", DISPATCHES_PER_TEST);
        }
@@ -36,8 +36,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

@@ -86,7 +85,7 @@ int main() {
    hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);

    // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
                    gpuMatrix, WIDTH);

@@ -21,8 +21,7 @@ In order to use the HIP framework, we need to add the "hip_runtime.h" header fil
 ## Device-side code
 We will work on device side code first, Here is simple example showing a snippet of HIP device side code:

-`__global__ void matrixTranspose(hipLaunchParm lp,            `
-`                                float *out,                  `
+`__global__ void matrixTranspose(float *out,                  `
 `                                float *in,                   `
 `                                const int width,             `
 `                                const int height)            `
@@ -41,11 +40,9 @@ other function-type qualifiers are:
 `__host__` can combine with `__device__`, in which case the function compiles for both the host and device. These functions cannot use the HIP grid coordinate functions (for example, "hipThreadIdx_x", will talk about it latter). A possible workaround is to pass the necessary coordinate info as an argument to the function.
 `__host__` cannot combine with `__global__`.

-`__global__` functions are often referred to as *kernels, and calling one is termed *launching the kernel*. 
+`__global__` functions are often referred to as *kernels*, and calling one is termed *launching the kernel*. 

-Next keyword is `void`. HIP `__global__` functions must have a `void` return type, and the first parameter to a HIP `__global__` function must have the type `hipLaunchParm`, which is for execution configuration. Global functions require the caller to specify an "execution configuration" that includes the grid and block dimensions. The execution configuration can also include other information for the launch, such as the amount of additional shared memory to allocate and the stream where the kernel should execute.
-
-After `hipLaunchParm`, Kernel arguments follows next(i.e.,  `float *out, float *in, const int width, const int height`).
+Next keyword is `void`. HIP `__global__` functions must have a `void` return type. Global functions require the caller to specify an "execution configuration" that includes the grid and block dimensions. The execution configuration can also include other information for the launch, such as the amount of additional shared memory to allocate and the stream where the kernel should execute.

 The kernel function begins with 
 `    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;`
@@ -63,15 +60,15 @@ We allocated memory to the Matrix on host side by using malloc and initiallized
 here the first parameter is the destination pointer, second is the source pointer, third is the size of memory copy and the last specify the direction on memory copy(which is in this case froom host to device). While in order to transfer memory from device to host, use `hipMemcpyDeviceToHost` and for device to device memory copy use `hipMemcpyDeviceToDevice`.

 Now, we'll see how to launch the kernel.
-`  hipLaunchKernel(matrixTranspose,                                                         `
+`  hipLaunchKernelGGL(matrixTranspose,                                                      `
 `                  dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),             `
 `                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),                          `
 `                  0, 0,                                                                    `
 `                  gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT);                          `

 HIP introduces a standard C++ calling convention to pass the execution configuration to the kernel (this convention replaces the `Cuda <<< >>>` syntax). In HIP,
- Kernels launch with the `"hipLaunchKernel"` function
- The first five parameters to hipLaunchKernel are the following:
+- Kernels launch with the `"hipLaunchKernelGGL"` function
+- The first five parameters to hipLaunchKernelGGL are the following:
   - **symbol kernelName**: the name of the kernel to launch.  To support template kernels which contains "," use the HIP_KERNEL_NAME macro. In current application it's "matrixTranspose".
   - **dim3 gridDim**: 3D-grid dimensions specifying the number of blocks to launch. In MatrixTranspose sample, it's "dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y)". 
   - **dim3 blockDim**: 3D-block dimensions specifying the number of threads in each block.In MatrixTranspose sample, it's "dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y)". 
@@ -34,8 +34,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

@@ -103,7 +102,7 @@ int main() {
    hipEventRecord(start, NULL);

    // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
                    gpuMatrix, WIDTH);

@@ -36,8 +36,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

@@ -86,7 +85,7 @@ int main() {
    hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);

    // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
                    gpuMatrix, WIDTH);

@@ -41,7 +41,7 @@ Now, we'll have the operation for which we need to compute the time taken. For t
 `  hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);`

 and for kernel execution time we'll use `hipKernelLaunch`:
-`  hipLaunchKernel(matrixTranspose,                                                    `
+`  hipLaunchKernelGGL(matrixTranspose,                                                    `
 `                  dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),        `
 `                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),                     `
 `                  0, 0,                                                               `
@@ -34,8 +34,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

@@ -103,7 +102,7 @@ int main() {
    hipEventRecord(start, NULL);

    // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
                    gpuMatrix, WIDTH);

@@ -41,8 +41,7 @@ int startTriggerIteration = -1;
 int stopTriggerIteration = -1;

 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

@@ -98,7 +97,7 @@ void runGPU(float* Matrix, float* TransposeMatrix, float* gpuMatrix, float* gpuT
        hipEventRecord(start, NULL);

        // Lauching kernel from host
-        hipLaunchKernel(matrixTranspose,
+        hipLaunchKernelGGL(matrixTranspose,
                        dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                        dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
                        gpuMatrix, WIDTH);
@@ -35,8 +35,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
    __shared__ float sharedMem[WIDTH * WIDTH];

    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
@@ -91,7 +90,7 @@ int main() {
    hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);

    // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix,
                    gpuMatrix, WIDTH);

@@ -35,8 +35,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;

    float val = in[x];
@@ -88,7 +87,7 @@ int main() {
    hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);

    // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(1), dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y), 0, 0,
+    hipLaunchKernelGGL(matrixTranspose, dim3(1), dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y), 0, 0,
                    gpuTransposeMatrix, gpuMatrix, WIDTH);

    // Memory transfer from device to host
@@ -35,8 +35,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
    float val = in[y * width + x];
@@ -86,7 +85,7 @@ int main() {
    hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);

    // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(1), dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0,
+    hipLaunchKernelGGL(matrixTranspose, dim3(1), dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0,
                    gpuTransposeMatrix, gpuMatrix, WIDTH);

    // Memory transfer from device to host
@@ -25,7 +25,7 @@ Shared memory is way more faster than that of global and constant memory and acc
 here the first parameter is the data type while the second one is the variable name.

 The other important change is:
-`  hipLaunchKernel(matrixTranspose,                                                   `
+`  hipLaunchKernelGGL(matrixTranspose,                                                   `
                  dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
                  dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
                  sizeof(float)*WIDTH*WIDTH, 0,
@@ -33,8 +33,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
    // declare dynamic shared memory
    HIP_DYNAMIC_SHARED(float, sharedMem);

@@ -90,7 +89,7 @@ int main() {
    hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);

    // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), sizeof(float) * WIDTH * WIDTH,
                    0, gpuTransposeMatrix, gpuMatrix, WIDTH);

@@ -26,15 +26,15 @@ and create stream using `hipStreamCreate` as follows:
 `  for(int i=0;i<num_streams;i++)                                                `
 `  hipStreamCreate(&streams[i]);                                                 `

-and while kernel launch, we make the following changes in 5th parameter to hipLaunchKernel(having 0 as the default stream value):
+and while kernel launch, we make the following changes in 5th parameter to hipLaunchKernelGGL(having 0 as the default stream value):

-`  hipLaunchKernel(matrixTranspose_static_shared,                               `
+`  hipLaunchKernelGGL(matrixTranspose_static_shared,                             `
                    dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
                    0, streams[0],
                    gpuTransposeMatrix[0], data[0], width);

-`  hipLaunchKernel(matrixTranspose_dynamic_shared,                              `
+`  hipLaunchKernelGGL(matrixTranspose_dynamic_shared,                            `
                    dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
                    sizeof(float)*WIDTH*WIDTH, streams[1],
@@ -30,7 +30,7 @@ THE SOFTWARE.

 using namespace std;

-__global__ void matrixTranspose_static_shared(hipLaunchParm lp, float* out, float* in,
+__global__ void matrixTranspose_static_shared(float* out, float* in,
                                              const int width) {
    __shared__ float sharedMem[WIDTH * WIDTH];

@@ -44,7 +44,7 @@ __global__ void matrixTranspose_static_shared(hipLaunchParm lp, float* out, floa
    out[y * width + x] = sharedMem[y * width + x];
 }

-__global__ void matrixTranspose_dynamic_shared(hipLaunchParm lp, float* out, float* in,
+__global__ void matrixTranspose_dynamic_shared(float* out, float* in,
                                               const int width) {
    // declare dynamic shared memory
    HIP_DYNAMIC_SHARED(float, sharedMem)
@@ -71,12 +71,12 @@ void MultipleStream(float** data, float* randArray, float** gpuTransposeMatrix,
        hipMemcpyAsync(data[i], randArray, NUM * sizeof(float), hipMemcpyHostToDevice, streams[i]);
    }

-    hipLaunchKernel(matrixTranspose_static_shared,
+    hipLaunchKernelGGL(matrixTranspose_static_shared,
                    dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, streams[0],
                    gpuTransposeMatrix[0], data[0], width);

-    hipLaunchKernel(matrixTranspose_dynamic_shared,
+    hipLaunchKernelGGL(matrixTranspose_dynamic_shared,
                    dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), sizeof(float) * WIDTH * WIDTH,
                    streams[1], gpuTransposeMatrix[1], data[1], width);
@@ -105,7 +105,7 @@ void disablePeer2Peer(int currentGpu, int peerGpu) {
 }


-__global__ void matrixTranspose_static_shared(hipLaunchParm lp, float* out, float* in,
+__global__ void matrixTranspose_static_shared(float* out, float* in,
                                              const int width) {
    __shared__ float sharedMem[WIDTH * WIDTH];

@@ -119,7 +119,7 @@ __global__ void matrixTranspose_static_shared(hipLaunchParm lp, float* out, floa
    out[y * width + x] = sharedMem[y * width + x];
 }

-__global__ void matrixTranspose_dynamic_shared(hipLaunchParm lp, float* out, float* in,
+__global__ void matrixTranspose_dynamic_shared(float* out, float* in,
                                               const int width) {
    // declare dynamic shared memory
    HIP_DYNAMIC_SHARED(float, sharedMem)
@@ -170,7 +170,7 @@ int main() {
    hipMalloc((void**)&data[0], NUM * sizeof(float));
    hipMemcpy(data[0], randArray, NUM * sizeof(float), hipMemcpyHostToDevice);

-    hipLaunchKernel(matrixTranspose_static_shared,
+    hipLaunchKernelGGL(matrixTranspose_static_shared,
                    dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix[0],
                    data[0], width);
@@ -181,7 +181,7 @@ int main() {
    hipMalloc((void**)&data[1], NUM * sizeof(float));
    hipMemcpy(data[1], gpuTransposeMatrix[0], NUM * sizeof(float), hipMemcpyDeviceToDevice);

-    hipLaunchKernel(matrixTranspose_dynamic_shared,
+    hipLaunchKernelGGL(matrixTranspose_dynamic_shared,
                    dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), sizeof(float) * WIDTH * WIDTH,
                    0, gpuTransposeMatrix[1], data[1], width);
@@ -35,8 +35,7 @@ THE SOFTWARE.
 #define THREADS_PER_BLOCK_Z 1

 // Device (Kernel) function, it must be void
-// hipLaunchParm provides the execution configuration
-__global__ void matrixTranspose(hipLaunchParm lp, float* out, float* in, const int width) {
+__global__ void matrixTranspose(float* out, float* in, const int width) {
    int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    float val = in[x];

@@ -88,7 +87,7 @@ int main() {
    hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice);

    // Lauching kernel from host
-    hipLaunchKernel(matrixTranspose, dim3(1), dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y), 0, 0,
+    hipLaunchKernelGGL(matrixTranspose, dim3(1), dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y), 0, 0,
                    gpuTransposeMatrix, gpuMatrix, WIDTH);

    // Memory transfer from device to host
@@ -33,7 +33,7 @@ THE SOFTWARE.
 #define _SIZE sizeof(int) * 1024 * 1024
 #define NUM_STREAMS 2

-__global__ void Iter(hipLaunchParm lp, int* Ad, int num) {
+__global__ void Iter(int* Ad, int num) {
    int tx = threadIdx.x + blockIdx.x * blockDim.x;
    // Kernel loop designed to execute very slowly... ... ...   so we can test timing-related
    // behavior below
@@ -58,7 +58,7 @@ int main() {
        HIPCHECK(hipMemcpyAsync(Ad[i], A[i], _SIZE, hipMemcpyHostToDevice, stream[i]));
    }
    for (int i = 0; i < NUM_STREAMS; i++) {
-        hipLaunchKernel(HIP_KERNEL_NAME(Iter), dim3(1), dim3(1), 0, stream[i], Ad[i], 1 << 30);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(Iter), dim3(1), dim3(1), 0, stream[i], Ad[i], 1 << 30);
    }
    for (int i = 0; i < NUM_STREAMS; i++) {
        HIPCHECK(hipMemcpyAsync(A[i], Ad[i], _SIZE, hipMemcpyDeviceToHost, stream[i]));
@@ -14,12 +14,12 @@
 */


-__global__ void cpy(hipLaunchParm lp, uint32_t* Out, uint32_t* In) {
+__global__ void cpy(uint32_t* Out, uint32_t* In) {
    int tx = threadIdx.x;
    memcpy(Out + tx, In + tx, sizeof(uint32_t));
 }

-__global__ void set(hipLaunchParm lp, uint32_t* ptr, uint8_t val, size_t size) {
+__global__ void set(uint32_t* ptr, uint8_t val, size_t size) {
    int tx = threadIdx.x;
    memset(ptr + tx, val, sizeof(uint32_t));
 }
@@ -39,7 +39,7 @@ int main() {
    hipMalloc((void**)&Bd, SIZE);
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);

-    hipLaunchKernel(cpy, dim3(1), dim3(LEN), 0, 0, Bd, Ad);
+    hipLaunchKernelGGL(cpy, dim3(1), dim3(LEN), 0, 0, Bd, Ad);

    hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
    for (int i = LEN - 16; i < LEN; i++) {
@@ -47,7 +47,7 @@ int main() {
            return 0;
        }
    }
-    hipLaunchKernel(set, dim3(1), dim3(LEN), 0, 0, Bd, 0x1, LEN);
+    hipLaunchKernelGGL(set, dim3(1), dim3(LEN), 0, 0, Bd, 0x1, LEN);

    hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
    for (int i = LEN - 16; i < LEN; i++) {
@@ -64,11 +64,11 @@ __device__ void double_precision_intrinsics() {
    __fma_rz(1.0, 2.0, 3.0);
 }

-__global__ void compileDoublePrecisionIntrinsics(hipLaunchParm lp, int ignored) {
+__global__ void compileDoublePrecisionIntrinsics(int ignored) {
    double_precision_intrinsics();
 }

 int main() {
-    hipLaunchKernel(compileDoublePrecisionIntrinsics, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, 1);
+    hipLaunchKernelGGL(compileDoublePrecisionIntrinsics, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, 1);
    passed();
 }
@@ -33,7 +33,7 @@ THE SOFTWARE.
 #define SIZE LEN << 2


-__global__ void floatMath(hipLaunchParm lp, float* In, float* Out) {
+__global__ void floatMath(float* In, float* Out) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    Out[tid] = __cosf(In[tid]);
    Out[tid] = __exp10f(Out[tid]);
@@ -57,6 +57,6 @@ int main() {
    float *Ind, *Outd;
    hipMalloc((void**)&Ind, SIZE);
    hipMalloc((void**)&Outd, SIZE);
-    hipLaunchKernel(floatMath, dim3(LEN, 1, 1), dim3(1, 1, 1), 0, 0, Ind, Outd);
+    hipLaunchKernelGGL(floatMath, dim3(LEN, 1, 1), dim3(1, 1, 1), 0, 0, Ind, Outd);
    passed();
 }
@@ -62,9 +62,9 @@ __device__ void integer_intrinsics() {
    assert(1);
 }

-__global__ void compileIntegerIntrinsics(hipLaunchParm lp, int ignored) { integer_intrinsics(); }
+__global__ void compileIntegerIntrinsics(int ignored) { integer_intrinsics(); }

 int main() {
-    hipLaunchKernel(compileIntegerIntrinsics, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, 1);
+    hipLaunchKernelGGL(compileIntegerIntrinsics, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, 1);
    passed();
 }
@@ -31,12 +31,12 @@ THE SOFTWARE.

 #if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__ || __HIP_ARCH_GFX906__

-__global__ void kernel_abs_int64(hipLaunchParm lp, long long *input, long long *output) {
+__global__ void kernel_abs_int64(long long *input, long long *output) {
    int tx = threadIdx.x;
    output[tx] = abs(input[tx]);
 }

-__global__ void kernel_lgamma_double(hipLaunchParm lp, double *input, double *output) {
+__global__ void kernel_lgamma_double(double *input, double *output) {
    int tx = threadIdx.x;
    output[tx] = lgamma(input[tx]);
 }
@@ -79,7 +79,7 @@ void check_lgamma_double() {
  hipMemcpy(inputGPU, inputCPU, memsize, hipMemcpyHostToDevice);

  // launch kernel
-  hipLaunchKernel(kernel_lgamma_double, dim3(1), dim3(NUM_INPUTS), 0, 0, inputGPU, outputGPU);
+  hipLaunchKernelGGL(kernel_lgamma_double, dim3(1), dim3(NUM_INPUTS), 0, 0, inputGPU, outputGPU);

  // copy outputs from device
  hipMemcpy(outputCPU, outputGPU, memsize, hipMemcpyDeviceToHost);
@@ -127,7 +127,7 @@ void check_abs_int64() {
  hipMemcpy(inputGPU, inputCPU, memsize, hipMemcpyHostToDevice);

  // launch kernel
-  hipLaunchKernel(kernel_abs_int64, dim3(1), dim3(NUM_INPUTS), 0, 0, inputGPU, outputGPU);
+  hipLaunchKernelGGL(kernel_abs_int64, dim3(1), dim3(NUM_INPUTS), 0, 0, inputGPU, outputGPU);

  // copy outputs from device
  hipMemcpy(outputCPU, outputGPU, memsize, hipMemcpyDeviceToHost);
@@ -79,12 +79,12 @@ __device__ void single_precision_intrinsics() {
 }


-__global__ void compileSinglePrecisionIntrinsics(hipLaunchParm lp, int ignored) {
+__global__ void compileSinglePrecisionIntrinsics(int ignored) {
    single_precision_intrinsics();
 }


 int main() {
-    hipLaunchKernel(compileSinglePrecisionIntrinsics, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, 1);
+    hipLaunchKernelGGL(compileSinglePrecisionIntrinsics, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, 1);
    passed();
 }
@@ -129,11 +129,11 @@ __device__ void single_precision_math_functions() {
    ynf(1, 1.0f);
 }

-__global__ void compileSinglePrecisionMathOnDevice(hipLaunchParm lp, int ignored) {
+__global__ void compileSinglePrecisionMathOnDevice(int ignored) {
    single_precision_math_functions();
 }

 int main() {
-    hipLaunchKernel(compileSinglePrecisionMathOnDevice, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, 1);
+    hipLaunchKernelGGL(compileSinglePrecisionMathOnDevice, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, 1);
    passed();
 }
@@ -84,7 +84,7 @@ __device__ __host__ std::complex<FloatT> calc(std::complex<FloatT> A,
 }

 template<typename FloatT>
-__global__ void kernel(hipLaunchParm lp, std::complex<FloatT>* A,
+__global__ void kernel(std::complex<FloatT>* A,
                       std::complex<FloatT>* B, std::complex<FloatT>* C,
                       enum CalcKind CK) {
    int tx = threadIdx.x + blockIdx.x * blockDim.x;
@@ -114,7 +114,7 @@ void test() {
    // Run kernel for a calculation kind and verify by comparing with host
    // calculation result. Returns false if fails.
    auto test_fun = [&](enum CalcKind CK) {
-      hipLaunchKernel(kernel<FloatT>, dim3(1), dim3(LEN), 0, 0, Ad, Bd, Cd, CK);
+      hipLaunchKernelGGL(kernel<FloatT>, dim3(1), dim3(LEN), 0, 0, Ad, Bd, Cd, CK);
      hipMemcpy(C, Cd, sizeof(ComplexT)*LEN, hipMemcpyDeviceToHost);
      for (int i = 0; i < LEN; i++) {
        ComplexT Expected = calc(A[i], B[i], CK);
@@ -31,74 +31,74 @@ THE SOFTWARE.
 #define N 512
 #define SIZE N * sizeof(double)

-__global__ void test_sincos(hipLaunchParm lp, double* a, double* b, double* c) {
+__global__ void test_sincos(double* a, double* b, double* c) {
    int tid = threadIdx.x;
    sincos(a[tid], b + tid, c + tid);
 }

-__global__ void test_sincospi(hipLaunchParm lp, double* a, double* b, double* c) {
+__global__ void test_sincospi(double* a, double* b, double* c) {
    int tid = threadIdx.x;
    sincospi(a[tid], b + tid, c + tid);
 }

-__global__ void test_llrint(hipLaunchParm lp, double* a, long long int* b) {
+__global__ void test_llrint(double* a, long long int* b) {
    int tid = threadIdx.x;
    b[tid] = llrint(a[tid]);
 }

-__global__ void test_lrint(hipLaunchParm lp, double* a, long int* b) {
+__global__ void test_lrint(double* a, long int* b) {
    int tid = threadIdx.x;
    b[tid] = lrint(a[tid]);
 }

-__global__ void test_rint(hipLaunchParm lp, double* a, double* b) {
+__global__ void test_rint(double* a, double* b) {
    int tid = threadIdx.x;
    b[tid] = rint(a[tid]);
 }

-__global__ void test_llround(hipLaunchParm lp, double* a, long long int* b) {
+__global__ void test_llround(double* a, long long int* b) {
    int tid = threadIdx.x;
    b[tid] = llround(a[tid]);
 }

-__global__ void test_lround(hipLaunchParm lp, double* a, long int* b) {
+__global__ void test_lround(double* a, long int* b) {
    int tid = threadIdx.x;
    b[tid] = lround(a[tid]);
 }

-__global__ void test_rhypot(hipLaunchParm lp, double* a, double* b, double* c) {
+__global__ void test_rhypot(double* a, double* b, double* c) {
    int tid = threadIdx.x;
    c[tid] = rhypot(a[tid], b[tid]);
 }

-__global__ void test_norm3d(hipLaunchParm lp, double* a, double* b, double* c, double* d) {
+__global__ void test_norm3d(double* a, double* b, double* c, double* d) {
    int tid = threadIdx.x;
    d[tid] = norm3d(a[tid], b[tid], c[tid]);
 }

-__global__ void test_norm4d(hipLaunchParm lp, double* a, double* b, double* c, double* d,
+__global__ void test_norm4d(double* a, double* b, double* c, double* d,
                            double* e) {
    int tid = threadIdx.x;
    e[tid] = norm4d(a[tid], b[tid], c[tid], d[tid]);
 }

-__global__ void test_rnorm3d(hipLaunchParm lp, double* a, double* b, double* c, double* d) {
+__global__ void test_rnorm3d(double* a, double* b, double* c, double* d) {
    int tid = threadIdx.x;
    d[tid] = rnorm3d(a[tid], b[tid], c[tid]);
 }

-__global__ void test_rnorm4d(hipLaunchParm lp, double* a, double* b, double* c, double* d,
+__global__ void test_rnorm4d(double* a, double* b, double* c, double* d,
                             double* e) {
    int tid = threadIdx.x;
    e[tid] = rnorm4d(a[tid], b[tid], c[tid], d[tid]);
 }

-__global__ void test_rnorm(hipLaunchParm lp, double* a, double* b) {
+__global__ void test_rnorm(double* a, double* b) {
    int tid = threadIdx.x;
    b[tid] = rnorm(N, a);
 }

-__global__ void test_erfinv(hipLaunchParm lp, double* a, double* b) {
+__global__ void test_erfinv(double* a, double* b) {
    int tid = threadIdx.x;
    b[tid] = erf(erfinv(a[tid]));
 }
@@ -115,7 +115,7 @@ bool run_sincos() {
    hipMalloc((void**)&Bd, SIZE);
    hipMalloc((void**)&Cd, SIZE);
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_sincos, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd);
+    hipLaunchKernelGGL(test_sincos, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd);
    hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
    hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost);
    int passed = 0;
@@ -157,7 +157,7 @@ bool run_sincospi() {
    hipMalloc((void**)&Bd, SIZE);
    hipMalloc((void**)&Cd, SIZE);
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_sincospi, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd);
+    hipLaunchKernelGGL(test_sincospi, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd);
    hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
    hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost);
    int passed = 0;
@@ -199,7 +199,7 @@ bool run_llrint() {
    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, N * sizeof(long long int));
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_llrint, dim3(1), dim3(N), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(test_llrint, dim3(1), dim3(N), 0, 0, Ad, Bd);
    hipMemcpy(B, Bd, N * sizeof(long long int), hipMemcpyDeviceToHost);
    int passed = 0;
    for (int i = 0; i < 512; i++) {
@@ -233,7 +233,7 @@ bool run_lrint() {
    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, N * sizeof(long int));
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_lrint, dim3(1), dim3(N), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(test_lrint, dim3(1), dim3(N), 0, 0, Ad, Bd);
    hipMemcpy(B, Bd, N * sizeof(long int), hipMemcpyDeviceToHost);
    int passed = 0;
    for (int i = 0; i < 512; i++) {
@@ -266,7 +266,7 @@ bool run_rint() {
    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, SIZE);
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_rint, dim3(1), dim3(N), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(test_rint, dim3(1), dim3(N), 0, 0, Ad, Bd);
    hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
    int passed = 0;
    for (int i = 0; i < 512; i++) {
@@ -300,7 +300,7 @@ bool run_llround() {
    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, N * sizeof(long long int));
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_llround, dim3(1), dim3(N), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(test_llround, dim3(1), dim3(N), 0, 0, Ad, Bd);
    hipMemcpy(B, Bd, N * sizeof(long long int), hipMemcpyDeviceToHost);
    int passed = 0;
    for (int i = 0; i < 512; i++) {
@@ -333,7 +333,7 @@ bool run_lround() {
    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, N * sizeof(long int));
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_lround, dim3(1), dim3(N), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(test_lround, dim3(1), dim3(N), 0, 0, Ad, Bd);
    hipMemcpy(B, Bd, N * sizeof(long int), hipMemcpyDeviceToHost);
    int passed = 0;
    for (int i = 0; i < 512; i++) {
@@ -376,7 +376,7 @@ bool run_norm3d() {
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Cd, C, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_norm3d, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd);
+    hipLaunchKernelGGL(test_norm3d, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd);
    hipMemcpy(D, Dd, SIZE, hipMemcpyDeviceToHost);
    int passed = 0;
    for (int i = 0; i < 512; i++) {
@@ -425,7 +425,7 @@ bool run_norm4d() {
    hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Cd, C, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Dd, D, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_norm4d, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd, Ed);
+    hipLaunchKernelGGL(test_norm4d, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd, Ed);
    hipMemcpy(E, Ed, SIZE, hipMemcpyDeviceToHost);
    int passed = 0;
    for (int i = 0; i < 512; i++) {
@@ -469,7 +469,7 @@ bool run_rhypot() {
    hipMalloc((void**)&Cd, SIZE);
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_rhypot, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd);
+    hipLaunchKernelGGL(test_rhypot, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd);
    hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost);
    int passed = 0;
    for (int i = 0; i < 512; i++) {
@@ -512,7 +512,7 @@ bool run_rnorm3d() {
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Cd, C, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_rnorm3d, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd);
+    hipLaunchKernelGGL(test_rnorm3d, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd);
    hipMemcpy(D, Dd, SIZE, hipMemcpyDeviceToHost);
    int passed = 0;
    for (int i = 0; i < 512; i++) {
@@ -561,7 +561,7 @@ bool run_rnorm4d() {
    hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Cd, C, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Dd, D, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_rnorm4d, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd, Ed);
+    hipLaunchKernelGGL(test_rnorm4d, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd, Ed);
    hipMemcpy(E, Ed, SIZE, hipMemcpyDeviceToHost);
    int passed = 0;
    for (int i = 0; i < 512; i++) {
@@ -602,7 +602,7 @@ bool run_rnorm() {
    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, SIZE);
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_rnorm, dim3(1), dim3(N), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(test_rnorm, dim3(1), dim3(N), 0, 0, Ad, Bd);
    hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
    int passed = 0;
    for (int i = 0; i < 512; i++) {
@@ -634,7 +634,7 @@ bool run_erfinv() {
    hipMalloc((void**)&Ad, SIZE);
    hipMalloc((void**)&Bd, SIZE);
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(test_erfinv, dim3(1), dim3(N), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(test_erfinv, dim3(1), dim3(N), 0, 0, Ad, Bd);
    hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
    int passed = 0;
    for (int i = 0; i < 512; i++) {
@@ -34,7 +34,7 @@ THE SOFTWARE.
 __device__ int globalIn[NUM];
 __device__ int globalOut[NUM];

-__global__ void Assign(hipLaunchParm lp, int* Out) {
+__global__ void Assign(int* Out) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    Out[tid] = globalIn[tid];
    globalOut[tid] = globalIn[tid];
@@ -63,7 +63,7 @@ int main() {
    hipStreamCreate(&stream);
    hipMemcpyToSymbolAsync(HIP_SYMBOL(globalIn), Am, SIZE, 0, hipMemcpyHostToDevice, stream);
    hipStreamSynchronize(stream);
-    hipLaunchKernel(Assign, dim3(1, 1, 1), dim3(NUM, 1, 1), 0, 0, Ad);
+    hipLaunchKernelGGL(Assign, dim3(1, 1, 1), dim3(NUM, 1, 1), 0, 0, Ad);
    hipMemcpy(B, Ad, SIZE, hipMemcpyDeviceToHost);
    hipMemcpyFromSymbolAsync(Cm, HIP_SYMBOL(globalOut), SIZE, 0, hipMemcpyDeviceToHost, stream);
    hipStreamSynchronize(stream);
@@ -78,7 +78,7 @@ int main() {
    }

    hipMemcpyToSymbol(HIP_SYMBOL(globalIn), A, SIZE, 0, hipMemcpyHostToDevice);
-    hipLaunchKernel(Assign, dim3(1, 1, 1), dim3(NUM, 1, 1), 0, 0, Ad);
+    hipLaunchKernelGGL(Assign, dim3(1, 1, 1), dim3(NUM, 1, 1), 0, 0, Ad);
    hipMemcpy(B, Ad, SIZE, hipMemcpyDeviceToHost);
    hipMemcpyFromSymbol(C, HIP_SYMBOL(globalOut), SIZE, 0, hipMemcpyDeviceToHost);
    for (int i = 0; i < NUM; i++) {
@@ -93,7 +93,7 @@ int main() {

    hipMemcpyToSymbolAsync(HIP_SYMBOL(globalIn), A, SIZE, 0, hipMemcpyHostToDevice, stream);
    hipStreamSynchronize(stream);
-    hipLaunchKernel(Assign, dim3(1, 1, 1), dim3(NUM, 1, 1), 0, 0, Ad);
+    hipLaunchKernelGGL(Assign, dim3(1, 1, 1), dim3(NUM, 1, 1), 0, 0, Ad);
    hipMemcpy(B, Ad, SIZE, hipMemcpyDeviceToHost);
    hipMemcpyFromSymbolAsync(C, HIP_SYMBOL(globalOut), SIZE, 0, hipMemcpyDeviceToHost, stream);
    hipStreamSynchronize(stream);
@@ -31,7 +31,7 @@ THE SOFTWARE.
 #define NUM 1024
 #define SIZE NUM * sizeof(float)

-__global__ void vAdd(hipLaunchParm lp, float* In1, float* In2, float* In3, float* In4, float* Out) {
+__global__ void vAdd(float* In1, float* In2, float* In3, float* In4, float* Out) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    In4[tid] = In1[tid] + In2[tid];
    __threadfence();
@@ -66,7 +66,7 @@ int main() {
    hipMemcpy(In3d, In3, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(In4d, In4, SIZE, hipMemcpyHostToDevice);

-    hipLaunchKernel(vAdd, dim3(32, 1, 1), dim3(32, 1, 1), 0, 0, In1d, In2d, In3d, In4d, Outd);
+    hipLaunchKernelGGL(vAdd, dim3(32, 1, 1), dim3(32, 1, 1), 0, 0, In1d, In2d, In3d, In4d, Outd);
    hipMemcpy(Out, Outd, SIZE, hipMemcpyDeviceToHost);
    assert(Out[10] == 2 * In1[10] + 2 * In2[10] + In3[10]);
    passed();
@@ -33,7 +33,7 @@ THE SOFTWARE.
 #include <hip/device_functions.h>
 #define HIP_ASSERT(x) (assert((x) == hipSuccess))

-__global__ void warpvote(hipLaunchParm lp, int* device_any, int* device_all,
+__global__ void warpvote(int* device_any, int* device_all,
                         int Num_Warps_per_Block, int pshift) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    device_any[threadIdx.x >> pshift] = __any(tid - 77);
@@ -73,7 +73,7 @@ int main(int argc, char* argv[]) {
    HIP_ASSERT(hipMemcpy(device_any, host_any, sizeof(int), hipMemcpyHostToDevice));
    HIP_ASSERT(hipMemcpy(device_all, host_all, sizeof(int), hipMemcpyHostToDevice));

-    hipLaunchKernel(warpvote, dim3(Num_Blocks_per_Grid), dim3(Num_Threads_per_Block), 0, 0,
+    hipLaunchKernelGGL(warpvote, dim3(Num_Blocks_per_Grid), dim3(Num_Threads_per_Block), 0, 0,
                    device_any, device_all, Num_Warps_per_Block, pshift);


@@ -30,7 +30,7 @@ THE SOFTWARE.

 #define HIP_ASSERT(x) (assert((x) == hipSuccess))

-__global__ void gpu_ballot(hipLaunchParm lp, unsigned int* device_ballot, int Num_Warps_per_Block,
+__global__ void gpu_ballot(unsigned int* device_ballot, int Num_Warps_per_Block,
                           int pshift) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int warp_num = threadIdx.x >> pshift;
@@ -69,7 +69,7 @@ int main(int argc, char* argv[]) {
    HIP_ASSERT(hipMemcpy(device_ballot, host_ballot, Num_Warps_per_Grid * sizeof(unsigned int),
                         hipMemcpyHostToDevice));

-    hipLaunchKernel(gpu_ballot, dim3(Num_Blocks_per_Grid), dim3(Num_Threads_per_Block), 0, 0,
+    hipLaunchKernelGGL(gpu_ballot, dim3(Num_Blocks_per_Grid), dim3(Num_Threads_per_Block), 0, 0,
                    device_ballot, Num_Warps_per_Block, pshift);


@@ -53,8 +53,7 @@ T bit_extract(T src0, unsigned int src1, unsigned int src2) {
  }
 }

-__global__ void HIP_kernel(hipLaunchParm lp,
-                           unsigned int* out32, unsigned int* in32_0,
+__global__ void HIP_kernel(unsigned int* out32, unsigned int* in32_0,
                           unsigned int* in32_1, unsigned int* in32_2,
                           unsigned long long int* out64, unsigned long long int* in64_0,
                           unsigned int* in64_1, unsigned int* in64_2) {
@@ -150,7 +149,7 @@ int main() {
    HIP_ASSERT(hipMemcpy(deviceSrc264, hostSrc264, NUM * sizeof(unsigned int), hipMemcpyHostToDevice));


-    hipLaunchKernel(HIP_kernel, dim3(num_blocks), dim3(num_threads_per_block),
+    hipLaunchKernelGGL(HIP_kernel, dim3(num_blocks), dim3(num_threads_per_block),
                       0, 0,
                       deviceOut32, deviceSrc032, deviceSrc132, deviceSrc232,
                       deviceOut64, deviceSrc064, deviceSrc164, deviceSrc264);
@@ -50,7 +50,7 @@ T bit_insert(T src0, T src1, unsigned int src2, unsigned int src3) {
  return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
 }

-__global__ void HIP_kernel(hipLaunchParm lp, unsigned int* out32,
+__global__ void HIP_kernel(unsigned int* out32,
                           unsigned int* in32_0, unsigned int* in32_1,
                           unsigned int* in32_2, unsigned int* in32_3,
                           unsigned long long int* out64, unsigned long long int* in64_0,
@@ -161,7 +161,7 @@ int main() {
    HIP_ASSERT(hipMemcpy(deviceSrc364, hostSrc364, NUM * sizeof(unsigned int), hipMemcpyHostToDevice));


-    hipLaunchKernel(HIP_kernel, dim3(num_blocks), dim3(num_threads_per_block),
+    hipLaunchKernelGGL(HIP_kernel, dim3(num_blocks), dim3(num_threads_per_block),
                       0, 0,
                       deviceOut32, deviceSrc032, deviceSrc132, deviceSrc232, deviceSrc332,
                       deviceOut64, deviceSrc064, deviceSrc164, deviceSrc264, deviceSrc364);
@@ -64,7 +64,7 @@ T bitreverse(T num) {
    return reverse_num;
 }

-__global__ void HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned int* b,
+__global__ void HIP_kernel(unsigned int* a, unsigned int* b,
                           unsigned long long int* c, unsigned long long int* d, int width,
                           int height) {
    int x = blockDim.x * blockIdx.x + threadIdx.x;
@@ -124,7 +124,7 @@ int main() {
        hipMemcpy(deviceD, hostD, NUM * sizeof(unsigned long long int), hipMemcpyHostToDevice));


-    hipLaunchKernel(HIP_kernel, dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(HIP_kernel, dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA, deviceB, deviceC,
                    deviceD, WIDTH, HEIGHT);

@@ -82,7 +82,7 @@ __device__ void test_ambiguity() {
  __clzll(ui);
 }

-__global__ void HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned int* b, unsigned int* c,
+__global__ void HIP_kernel(unsigned int* a, unsigned int* b, unsigned int* c,
                           unsigned long long int* d, int width, int height) {
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;
@@ -138,7 +138,7 @@ int main() {
    HIP_ASSERT(
        hipMemcpy(deviceD, hostD, NUM * sizeof(unsigned long long int), hipMemcpyHostToDevice));

-    hipLaunchKernel(HIP_kernel, dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(HIP_kernel, dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA, deviceB, deviceC,
                    deviceD, WIDTH, HEIGHT);

@@ -59,7 +59,7 @@ int lastbit(T a) {
 }


-__global__ void HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned int* b, unsigned int* c,
+__global__ void HIP_kernel(unsigned int* a, unsigned int* b, unsigned int* c,
                           unsigned long long int* d, int width, int height) {
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;
@@ -117,7 +117,7 @@ int main() {
    HIP_ASSERT(
        hipMemcpy(deviceD, hostD, NUM * sizeof(unsigned long long int), hipMemcpyHostToDevice));

-    hipLaunchKernel(HIP_kernel, dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(HIP_kernel, dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA, deviceB, deviceC,
                    deviceD, WIDTH, HEIGHT);

@@ -36,7 +36,7 @@ THE SOFTWARE.

 #define HIP_ASSERT(x) (assert((x) == hipSuccess))

-__global__ void HIP_kernel(hipLaunchParm lp, unsigned int* mbcnt_lo, unsigned int* mbcnt_hi, unsigned int* lane_id) {
+__global__ void HIP_kernel(unsigned int* mbcnt_lo, unsigned int* mbcnt_hi, unsigned int* lane_id) {
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    mbcnt_lo[x] = __mbcnt_lo(0xFFFFFFFF, 0);
    mbcnt_hi[x] = __mbcnt_hi(0xFFFFFFFF, 0);
@@ -70,7 +70,7 @@ int main() {
    HIP_ASSERT(hipMalloc((void**)&device_mbcnt_hi, buffer_size));
    HIP_ASSERT(hipMalloc((void**)&device_lane_id, buffer_size));

-    hipLaunchKernel(HIP_kernel, dim3(num_blocks),
+    hipLaunchKernelGGL(HIP_kernel, dim3(num_blocks),
                    dim3(num_threads_per_block), 0, 0, device_mbcnt_lo, device_mbcnt_hi, device_lane_id);

    unsigned int* host_mbcnt_lo = (unsigned int*) malloc(buffer_size);
@@ -58,7 +58,7 @@ unsigned int popcountCPU(T value) {
    return ret;
 }

-__global__ void HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned int* b, unsigned int* c,
+__global__ void HIP_kernel(unsigned int* a, unsigned int* b, unsigned int* c,
                           unsigned long long int* d, int width, int height) {
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;
@@ -117,7 +117,7 @@ int main() {
        hipMemcpy(deviceD, hostD, NUM * sizeof(unsigned long long int), hipMemcpyHostToDevice));


-    hipLaunchKernel(HIP_kernel, dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
+    hipLaunchKernelGGL(HIP_kernel, dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA, deviceB, deviceC,
                    deviceD, WIDTH, HEIGHT);

@@ -52,7 +52,7 @@ THE SOFTWARE.
 using namespace std;

 template <typename T>
-__global__ void vectoradd_float(hipLaunchParm lp, T* a, const T* bm, int width, int height)
+__global__ void vectoradd_float(T* a, const T* bm, int width, int height)

 {
    int x = blockDim.x * blockIdx.x + threadIdx.x;
@@ -120,7 +120,7 @@ bool dataTypesRun() {
    HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM * sizeof(T), hipMemcpyHostToDevice));


-    hipLaunchKernel(vectoradd_float,
+    hipLaunchKernelGGL(vectoradd_float,
                    dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA,
                    static_cast<const T*>(deviceB), WIDTH, HEIGHT);
@@ -178,7 +178,7 @@ bool dataTypesRun2() {
    HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T)));

    HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM * sizeof(T), hipMemcpyHostToDevice));
-    hipLaunchKernel(vectoradd_float,
+    hipLaunchKernelGGL(vectoradd_float,
                    dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA,
                    static_cast<const T*>(deviceB), WIDTH, HEIGHT);
@@ -236,7 +236,7 @@ bool dataTypesRun4() {
    HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM * sizeof(T), hipMemcpyHostToDevice));


-    hipLaunchKernel(vectoradd_float,
+    hipLaunchKernelGGL(vectoradd_float,
                    dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
                    dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA,
                    static_cast<const T*>(deviceB), WIDTH, HEIGHT);
@@ -39,7 +39,7 @@ THE SOFTWARE.

 #define TEST_DEBUG (0)

-__global__ void kernel_trig(hipLaunchParm lp, float* In, float* sin_d, float* cos_d, float* tan_d,
+__global__ void kernel_trig(float* In, float* sin_d, float* cos_d, float* tan_d,
                            float* sin_pd, float* cos_pd) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    sin_d[tid] = sinf(In[tid]);
@@ -74,7 +74,7 @@ int main() {
    HIP_ASSERT(hipMalloc((void**)&cos_pd, SIZE));

    hipMemcpy(In_d, In, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(kernel_trig, dim3(LEN, 1, 1), dim3(1, 1, 1), 0, 0,
+    hipLaunchKernelGGL(kernel_trig, dim3(LEN, 1, 1), dim3(1, 1, 1), 0, 0,
                    In_d, sin_d, cos_d, tan_d,
                    sin_pd, cos_pd);
    hipMemcpy(sin_h, sin_d, SIZE, hipMemcpyDeviceToHost);
@@ -33,7 +33,7 @@ THE SOFTWARE.
 #define ITER 1<<20
 #define SIZE 1024*1024*sizeof(int)

-__global__ void Iter(hipLaunchParm lp, int *Ad){
+__global__ void Iter(int *Ad){
    int tx = threadIdx.x + blockIdx.x * blockDim.x;
    if(tx == 0){
        for(int i=0;i<ITER;i++){
@@ -49,7 +49,7 @@ int main(){
    dim3 dimGrid, dimBlock;
    dimGrid.x = 1, dimGrid.y =1, dimGrid.z = 1;
    dimBlock.x = 1, dimBlock.y = 1, dimGrid.z = 1;
-    hipLaunchKernel(HIP_KERNEL_NAME(Iter), dimGrid, dimBlock, 0, 0, Ad);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Iter), dimGrid, dimBlock, 0, 0, Ad);
    hipMemcpy(&A, Ad, SIZE, hipMemcpyDeviceToHost);
    passed();
 }
@@ -30,7 +30,7 @@ THE SOFTWARE.
 #include "test_common.h"

 template <typename T>
-__global__ void testExternSharedKernel(hipLaunchParm lp, const T* A_d, const T* B_d, T* C_d,
+__global__ void testExternSharedKernel(const T* A_d, const T* B_d, T* C_d,
                                       size_t numElements, size_t groupElements) {
    // declare dynamic shared memory
 #if defined(__HIP_PLATFORM_HCC__)
@@ -114,7 +114,7 @@ void testExternShared(size_t N, size_t groupElements) {
    size_t groupMemBytes = groupElements * sizeof(T);

    // launch kernel with dynamic shared memory
-    hipLaunchKernel(HIP_KERNEL_NAME(testExternSharedKernel<T>), dim3(blocks), dim3(threadsPerBlock),
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(testExternSharedKernel<T>), dim3(blocks), dim3(threadsPerBlock),
                    groupMemBytes, 0, A_d, B_d, C_d, N, groupElements);

    HIPCHECK(hipDeviceSynchronize());
@@ -32,7 +32,7 @@ THE SOFTWARE.
 #define LEN 16 * 1024
 #define SIZE LEN * 4

-__global__ void vectorAdd(hipLaunchParm lp, float* Ad, float* Bd) {
+__global__ void vectorAdd(float* Ad, float* Bd) {
    HIP_DYNAMIC_SHARED(float, sBd);
    int tx = threadIdx.x;
    for (int i = 0; i < LEN / 64; i++) {
@@ -53,7 +53,7 @@ int main() {
    hipMalloc(&Bd, SIZE);
    hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
    hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice);
-    hipLaunchKernel(vectorAdd, dim3(1, 1, 1), dim3(64, 1, 1), SIZE, 0, Ad, Bd);
+    hipLaunchKernelGGL(vectorAdd, dim3(1, 1, 1), dim3(64, 1, 1), SIZE, 0, Ad, Bd);
    hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost);
    for (int i = 0; i < LEN; i++) {
        assert(B[i] > 1.0f && B[i] < 3.0f);
@@ -25,10 +25,10 @@ THE SOFTWARE.

 #include "test_common.h"

-__global__ void Empty(hipLaunchParm lp, int param) {}
+__global__ void Empty(int param) {}

 int main() {
-    hipLaunchKernel(HIP_KERNEL_NAME(Empty), dim3(1), dim3(1), 0, 0, 0);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Empty), dim3(1), dim3(1), 0, 0, 0);
    hipDeviceSynchronize();
    passed();
 }
@@ -37,7 +37,7 @@ __device__ int foo(int i) { return i + 1; }
 //---
 // Syntax we would like to support with GRID_LAUNCH enabled:
 template <typename T>
-__global__ void vectorADD2(hipLaunchParm lp, T* A_d, T* B_d, T* C_d, size_t N) {
+__global__ void vectorADD2(T* A_d, T* B_d, T* C_d, size_t N) {
    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
    size_t stride = blockDim.x * gridDim.x;

@@ -63,7 +63,7 @@ int test_gl2(size_t N) {
    HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
    HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));

-    hipLaunchKernel(vectorADD2, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N);
+    hipLaunchKernelGGL(vectorADD2, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N);

    HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));

@@ -31,7 +31,7 @@ THE SOFTWARE.
 #include <test_common.h>

 #ifdef __HCC__
-#include <amp.h>
+#include <hc.hpp>
 #endif

 // cudaA
@@ -53,7 +53,7 @@ __device__ __forceinline__ int sum1_forceinline(int a) { return a + 1; };

 __device__ __host__ float PlusOne(float x) { return x + 1.0; }

-__global__ void MyKernel(const hipLaunchParm lp, const float* a, const float* b, float* c,
+__global__ void MyKernel(const float* a, const float* b, float* c,
                         unsigned N) {
    // KERNELBEGIN;

@@ -71,12 +71,12 @@ void callMyKernel() {
    const unsigned blockSize = 256;
    unsigned N = blockSize;

-    hipLaunchKernel(MyKernel, dim3(N / blockSize), dim3(blockSize), 0, 0, a, b, c, N);
+    hipLaunchKernelGGL(MyKernel, dim3(N / blockSize), dim3(blockSize), 0, 0, a, b, c, N);
 }


 template <typename T>
-__global__ void vectorADD(const hipLaunchParm lp, T __restrict__* A_d, T* B_d, T* C_d, size_t N) {
+__global__ void vectorADD(T __restrict__* A_d, T* B_d, T* C_d, size_t N) {
 //    KERNELBEGIN;
 #ifdef NOT_YET
    int a = __shfl_up(x, 1);
@@ -93,11 +93,7 @@ __global__ void vectorADD(const hipLaunchParm lp, T __restrict__* A_d, T* B_d, T
    int b = threadIdx.x;
    int c;

-    // TODO - move to HIP atomics when ready.
-    concurrency ::atomic_fetch_add(&c, b);
-    // Concurrency::atomic_add_unsigned (&x, a);
-
-    // concurrency ::atomic_add_ (x, a);
+    atomicAdd(&c, b);
 #endif

    __syncthreads();
@@ -916,7 +916,7 @@ int main() {
    hipLaunchKernelGGL(HIP_KERNEL_NAME(vAdd), dim3(1024), 1, 0, 0, Ad);
    hipLaunchKernelGGL(HIP_KERNEL_NAME(vAdd), dim3(1024), dim3(1), 0, 0, Ad);

-    // Test: Passing hipLaunchKernel inside another macro:
+    // Test: Passing hipLaunchKernelGGL inside another macro:
    float e0;
    GPU_PRINT_TIME(hipLaunchKernelGGL(vAdd, dim3(1024),
                   dim3(1), 0, 0, Ad), e0, j);
@@ -924,7 +924,7 @@ int main() {
                   dim3(1), 0, 0, Ad)), e0, j);

 #ifdef EXTRA_PARENS_1
-    // Don't wrap hipLaunchKernel in extra set of parens:
+    // Don't wrap hipLaunchKernelGGL in extra set of parens:
    GPU_PRINT_TIME((hipLaunchKernelGGL(vAdd, dim3(1024),
                    dim3(1), 0, 0, Ad)), e0, j);
 #endif
@@ -27,10 +27,10 @@ THE SOFTWARE.

 #include "test_common.h"

-__global__ void run_printf(hipLaunchParm lp) { printf("Hello World\n"); }
+__global__ void run_printf() { printf("Hello World\n"); }

 int main() {
-    hipLaunchKernel(HIP_KERNEL_NAME(run_printf), dim3(1), dim3(1), 0, 0);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(run_printf), dim3(1), dim3(1), 0, 0);
    hipDeviceSynchronize();
    passed();
 }
@@ -35,7 +35,7 @@ THE SOFTWARE.

 __constant__ int Value[LEN];

-__global__ void Get(hipLaunchParm lp, int* Ad) {
+__global__ void Get(int* Ad) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    Ad[tid] = Value[tid];
 }
@@ -52,7 +52,7 @@ int main() {
    HIP_ASSERT(hipMalloc((void**)&Ad, SIZE));

    HIP_ASSERT(hipMemcpyToSymbol(HIP_SYMBOL(Value), A, SIZE, 0, hipMemcpyHostToDevice));
-    hipLaunchKernel(Get, dim3(1, 1, 1), dim3(LEN, 1, 1), 0, 0, Ad);
+    hipLaunchKernelGGL(Get, dim3(1, 1, 1), dim3(LEN, 1, 1), 0, 0, Ad);
    HIP_ASSERT(hipMemcpy(B, Ad, SIZE, hipMemcpyDeviceToHost));

    for (unsigned i = 0; i < LEN; i++) {
@@ -32,12 +32,12 @@ THE SOFTWARE.
 #define NUM 1024
 #define SIZE NUM * 8

-__global__ void Alloc(hipLaunchParm lp, uint64_t* Ptr) {
+__global__ void Alloc(uint64_t* Ptr) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    Ptr[tid] = (uint64_t)malloc(128);
 }

-__global__ void Free(hipLaunchParm lp, uint64_t* Ptr) {
+__global__ void Free(uint64_t* Ptr) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    free((void*)Ptr[tid]);
 }
@@ -54,10 +54,10 @@ int main() {
        HIP_ASSERT(hipSetDevice(i));
        HIP_ASSERT(hipMalloc((void**)&dPtr, SIZE));
        HIP_ASSERT(hipMemcpy(dPtr, hPtr, SIZE, hipMemcpyHostToDevice));
-        hipLaunchKernel(Alloc, dim3(1, 1, 1), dim3(NUM, 1, 1), 0, 0, dPtr);
+        hipLaunchKernelGGL(Alloc, dim3(1, 1, 1), dim3(NUM, 1, 1), 0, 0, dPtr);
        HIP_ASSERT(hipMemcpy(hPtr, dPtr, SIZE, hipMemcpyDeviceToHost));
        assert(hPtr[0] != 0);
-        hipLaunchKernel(Free, dim3(1, 1, 1), dim3(NUM, 1, 1), 0, 0, dPtr);
+        hipLaunchKernelGGL(Free, dim3(1, 1, 1), dim3(NUM, 1, 1), 0, 0, dPtr);
        HIP_ASSERT(hipFree(dPtr));
        for (uint32_t i = 1; i < NUM; i++) {
            assert(hPtr[i] == hPtr[i - 1] + 4096);
@@ -34,52 +34,52 @@ THE SOFTWARE.
 #define LEN11 11 * 4
 #define LEN12 12 * 4

-__global__ void MemCpy8(hipLaunchParm lp, uint8_t* In, uint8_t* Out) {
+__global__ void MemCpy8(uint8_t* In, uint8_t* Out) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    memcpy(Out + tid * 8, In + tid * 8, 8);
 }

-__global__ void MemCpy9(hipLaunchParm lp, uint8_t* In, uint8_t* Out) {
+__global__ void MemCpy9(uint8_t* In, uint8_t* Out) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    memcpy(Out + tid * 9, In + tid * 9, 9);
 }

-__global__ void MemCpy10(hipLaunchParm lp, uint8_t* In, uint8_t* Out) {
+__global__ void MemCpy10(uint8_t* In, uint8_t* Out) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    memcpy(Out + tid * 10, In + tid * 10, 10);
 }

-__global__ void MemCpy11(hipLaunchParm lp, uint8_t* In, uint8_t* Out) {
+__global__ void MemCpy11(uint8_t* In, uint8_t* Out) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    memcpy(Out + tid * 11, In + tid * 11, 11);
 }

-__global__ void MemCpy12(hipLaunchParm lp, uint8_t* In, uint8_t* Out) {
+__global__ void MemCpy12(uint8_t* In, uint8_t* Out) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    memcpy(Out + tid * 12, In + tid * 12, 12);
 }

-__global__ void MemSet8(hipLaunchParm lp, uint8_t* In) {
+__global__ void MemSet8(uint8_t* In) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    memset(In + tid * 8, 1, 8);
 }

-__global__ void MemSet9(hipLaunchParm lp, uint8_t* In) {
+__global__ void MemSet9(uint8_t* In) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    memset(In + tid * 9, 1, 9);
 }

-__global__ void MemSet10(hipLaunchParm lp, uint8_t* In) {
+__global__ void MemSet10(uint8_t* In) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    memset(In + tid * 10, 1, 10);
 }

-__global__ void MemSet11(hipLaunchParm lp, uint8_t* In) {
+__global__ void MemSet11(uint8_t* In) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    memset(In + tid * 11, 1, 11);
 }

-__global__ void MemSet12(hipLaunchParm lp, uint8_t* In) {
+__global__ void MemSet12(uint8_t* In) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    memset(In + tid * 12, 1, 12);
 }
@@ -98,8 +98,8 @@ int main() {
    hipMalloc((void**)&Bd, LEN8);
    hipMalloc((void**)&Cd, LEN8);
    hipMemcpy(Ad, A, LEN8, hipMemcpyHostToDevice);
-    hipLaunchKernel(MemCpy8, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Ad, Bd);
-    hipLaunchKernel(MemSet8, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Cd);
+    hipLaunchKernelGGL(MemCpy8, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(MemSet8, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Cd);
    hipMemcpy(B, Bd, LEN8, hipMemcpyDeviceToHost);
    hipMemcpy(C, Cd, LEN8, hipMemcpyDeviceToHost);
    for (uint32_t i = 0; i < LEN8; i++) {
@@ -126,8 +126,8 @@ int main() {
    hipMalloc((void**)&Bd, LEN9);
    hipMalloc((void**)&Cd, LEN9);
    hipMemcpy(Ad, A, LEN9, hipMemcpyHostToDevice);
-    hipLaunchKernel(MemCpy9, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Ad, Bd);
-    hipLaunchKernel(MemSet9, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Cd);
+    hipLaunchKernelGGL(MemCpy9, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(MemSet9, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Cd);
    hipMemcpy(B, Bd, LEN9, hipMemcpyDeviceToHost);
    hipMemcpy(C, Cd, LEN9, hipMemcpyDeviceToHost);
    for (uint32_t i = 0; i < LEN9; i++) {
@@ -154,8 +154,8 @@ int main() {
    hipMalloc((void**)&Bd, LEN10);
    hipMalloc((void**)&Cd, LEN10);
    hipMemcpy(Ad, A, LEN10, hipMemcpyHostToDevice);
-    hipLaunchKernel(MemCpy10, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Ad, Bd);
-    hipLaunchKernel(MemSet10, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Cd);
+    hipLaunchKernelGGL(MemCpy10, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(MemSet10, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Cd);
    hipMemcpy(B, Bd, LEN10, hipMemcpyDeviceToHost);
    hipMemcpy(C, Cd, LEN10, hipMemcpyDeviceToHost);
    for (uint32_t i = 0; i < LEN10; i++) {
@@ -182,8 +182,8 @@ int main() {
    hipMalloc((void**)&Bd, LEN11);
    hipMalloc((void**)&Cd, LEN11);
    hipMemcpy(Ad, A, LEN11, hipMemcpyHostToDevice);
-    hipLaunchKernel(MemCpy11, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Ad, Bd);
-    hipLaunchKernel(MemSet11, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Cd);
+    hipLaunchKernelGGL(MemCpy11, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(MemSet11, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Cd);
    hipMemcpy(B, Bd, LEN11, hipMemcpyDeviceToHost);
    hipMemcpy(C, Cd, LEN11, hipMemcpyDeviceToHost);
    for (uint32_t i = 0; i < LEN11; i++) {
@@ -210,8 +210,8 @@ int main() {
    hipMalloc((void**)&Bd, LEN12);
    hipMalloc((void**)&Cd, LEN12);
    hipMemcpy(Ad, A, LEN12, hipMemcpyHostToDevice);
-    hipLaunchKernel(MemCpy12, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Ad, Bd);
-    hipLaunchKernel(MemSet12, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Cd);
+    hipLaunchKernelGGL(MemCpy12, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Ad, Bd);
+    hipLaunchKernelGGL(MemSet12, dim3(2, 1, 1), dim3(2, 1, 1), 0, 0, Cd);
    hipMemcpy(B, Bd, LEN12, hipMemcpyDeviceToHost);
    hipMemcpy(C, Cd, LEN12, hipMemcpyDeviceToHost);
    for (uint32_t i = 0; i < LEN12; i++) {
@@ -33,7 +33,7 @@ OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWA

 // Device (Kernel) function, it must be void
 // hipLaunchParm provides the execution configuration
-__global__ void vadd_asm(hipLaunchParm lp, float* out, float* in) {
+__global__ void vadd_asm(float* out, float* in) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;

 #ifdef __HIP_PLATFORM_NVCC__
@@ -82,7 +82,7 @@ int main() {
    hipMemcpy(gpuResultVector, VectorB, NUM * sizeof(float), hipMemcpyHostToDevice);

    // Lauching kernel from host
-    hipLaunchKernel(vadd_asm, dim3(NUM / THREADS_PER_BLOCK_X), dim3(THREADS_PER_BLOCK_X), 0, 0,
+    hipLaunchKernelGGL(vadd_asm, dim3(NUM / THREADS_PER_BLOCK_X), dim3(THREADS_PER_BLOCK_X), 0, 0,
                    gpuResultVector, gpuVector);

    // Memory transfer from device to host
@@ -33,7 +33,7 @@ THE SOFTWARE.
 #define _SIZE sizeof(int) * 1024 * 1024
 #define NUM_STREAMS 2

-__global__ void Iter(hipLaunchParm lp, int* Ad, int num) {
+__global__ void Iter(int* Ad, int num) {
    int tx = threadIdx.x + blockIdx.x * blockDim.x;
    // Kernel loop designed to execute very slowly... ... ...   so we can test timing-related
    // behavior below
@@ -58,7 +58,7 @@ int main() {
        HIPCHECK(hipMemcpyAsync(Ad[i], A[i], _SIZE, hipMemcpyHostToDevice, stream[i]));
    }
    for (int i = 0; i < NUM_STREAMS; i++) {
-        hipLaunchKernel(HIP_KERNEL_NAME(Iter), dim3(1), dim3(1), 0, stream[i], Ad[i], 1 << 30);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(Iter), dim3(1), dim3(1), 0, stream[i], Ad[i], 1 << 30);
    }
    for (int i = 0; i < NUM_STREAMS; i++) {
        HIPCHECK(hipMemcpyAsync(A[i], Ad[i], _SIZE, hipMemcpyDeviceToHost, stream[i]));
@@ -66,7 +66,7 @@ int main(int argc, char* argv[]) {
        // Record the start event
        HIPCHECK(hipEventRecord(start, NULL));

-        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                        static_cast<const float*>(A_d), static_cast<const float*>(B_d), C_d, N);


@@ -67,7 +67,7 @@ void memcpy2Dtest(size_t numW, size_t numH, bool usePinnedHost) {
    HIPCHECK(hipMemcpy2D(A_d, pitch_A, A_h, width, width, numH, hipMemcpyHostToDevice));
    HIPCHECK(hipMemcpy2D(B_d, pitch_B, B_h, width, width, numH, hipMemcpyHostToDevice));

-    hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d,
+    hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d,
                    (pitch_C / sizeof(T)) * numH);

    HIPCHECK(hipMemcpy2D(C_h, width, C_d, pitch_C, width, numH, hipMemcpyDeviceToHost));
@@ -117,7 +117,7 @@ void memcpyArraytest(size_t numW, size_t numH, bool usePinnedHost, bool usePitch
        HIPCHECK(hipMemcpyToArray(A_d, 0, 0, (void*)A_h, width, hipMemcpyHostToDevice));
        HIPCHECK(hipMemcpyToArray(B_d, 0, 0, (void*)B_h, width, hipMemcpyHostToDevice));

-        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                        (T*)A_d->data, (T*)B_d->data, (T*)C_d->data, numW);

        HIPCHECK(hipMemcpy(C_h, C_d->data, width, hipMemcpyDeviceToHost));
@@ -156,7 +156,7 @@ void memcpyArraytest(size_t numW, size_t numH, bool usePinnedHost, bool usePitch
                                        hipMemcpyHostToDevice));
        }

-        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                        (T*)A_d->data, (T*)B_d->data, (T*)C_d->data, numW * numH);

        HIPCHECK(hipMemcpy2D((void*)C_h, width, (void*)C_d->data, width, width, numH,
@@ -32,7 +32,7 @@ THE SOFTWARE.
 #define LEN 1024 * 1024
 #define SIZE LEN * sizeof(float)

-__global__ void Add(hipLaunchParm lp, float* Ad, float* Bd, float* Cd) {
+__global__ void Add(float* Ad, float* Bd, float* Cd) {
    int tx = threadIdx.x + blockIdx.x * blockDim.x;
    Cd[tx] = Ad[tx] + Bd[tx];
 }
@@ -74,7 +74,7 @@ int main() {
    dim3 dimGrid(LEN / 512, 1, 1);
    dim3 dimBlock(512, 1, 1);

-    hipLaunchKernel(HIP_KERNEL_NAME(Add), dimGrid, dimBlock, 0, 0, Ad, Bd, Cd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Add), dimGrid, dimBlock, 0, 0, Ad, Bd, Cd);

    HIPCHECK(
        hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost));  // Note this really HostToHost not
@@ -28,7 +28,7 @@ THE SOFTWARE.
 #include "test_common.h"
 #include <malloc.h>

-__global__ void Inc(hipLaunchParm lp, float* Ad) {
+__global__ void Inc(float* Ad) {
    int tx = threadIdx.x + blockIdx.x * blockDim.x;
    Ad[tx] = Ad[tx] + float(1);
 }
@@ -99,7 +99,7 @@ int main(int argc, char* argv[]) {
        // Reference the registered device pointer Ad from inside the kernel:
        for (int i = 0; i < num_devices; i++) {
            HIPCHECK(hipSetDevice(i));
-            hipLaunchKernel(Inc, dim3(N / 512), dim3(512), 0, 0, Ad[i]);
+            hipLaunchKernelGGL(Inc, dim3(N / 512), dim3(512), 0, 0, Ad[i]);

            HIPCHECK(hipDeviceSynchronize());
        }
@@ -230,7 +230,7 @@ void memcpytest2(DeviceMemory<T>* dmem, HostMemory<T>* hmem, size_t numElements,
                           useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
    }

-    hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+    hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                    static_cast<const T*>(dmem->A_d()), static_cast<const T*>(dmem->B_d()),
                    dmem->C_d(), numElements);

@@ -51,7 +51,7 @@ int main() {
        HIPCHECK(hipSetDevice(0));
        HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
        HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
-        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                        static_cast<const int*>(A_d), static_cast<const int*>(B_d), C_d, N);
        HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
        HIPCHECK(hipDeviceSynchronize());
@@ -62,7 +62,7 @@ int main() {
        HIPCHECK(hipMemcpyDtoD((hipDeviceptr_t)X_d, (hipDeviceptr_t)A_d, Nbytes));
        HIPCHECK(hipMemcpyDtoD((hipDeviceptr_t)Y_d, (hipDeviceptr_t)B_d, Nbytes));

-        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                        static_cast<const int*>(X_d), static_cast<const int*>(Y_d), Z_d, N);
        HIPCHECK(hipMemcpyDtoH(C_h, (hipDeviceptr_t)Z_d, Nbytes));
        HIPCHECK(hipDeviceSynchronize());
@@ -52,7 +52,7 @@ int main() {
        HIPCHECK(hipSetDevice(0));
        HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
        HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
-        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                        static_cast<const int*>(A_d), static_cast<const int*>(B_d), C_d, N);
        HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
        HIPCHECK(hipDeviceSynchronize());
@@ -63,7 +63,7 @@ int main() {
        HIPCHECK(hipMemcpyDtoDAsync((hipDeviceptr_t)X_d, (hipDeviceptr_t)A_d, Nbytes, s));
        HIPCHECK(hipMemcpyDtoDAsync((hipDeviceptr_t)Y_d, (hipDeviceptr_t)B_d, Nbytes, s));

-        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                        static_cast<const int*>(X_d), static_cast<const int*>(Y_d), Z_d, N);
        HIPCHECK(hipMemcpyDtoHAsync(C_h, (hipDeviceptr_t)Z_d, Nbytes, s));
        HIPCHECK(hipStreamSynchronize(s));
@@ -50,7 +50,7 @@ int main() {
        HIPCHECK(hipSetDevice(0));
        HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
        HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
-        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                        static_cast<const int*>(A_d), static_cast<const int*>(B_d), C_d, N);
        HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
        HIPCHECK(hipDeviceSynchronize());
@@ -62,7 +62,7 @@ int main() {
            Nbytes);  // this call is eqv to hipMemcpy(hipMemcpyD2D) which goes via stg bufs.
        hipMemcpyPeer(Y_d, 1, B_d, 0, Nbytes);

-        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                        static_cast<const int*>(X_d), static_cast<const int*>(Y_d), Z_d, N);
        HIPCHECK(hipMemcpy(C_h, Z_d, Nbytes, hipMemcpyDeviceToHost));
        HIPCHECK(hipDeviceSynchronize());
@@ -54,7 +54,7 @@ int main() {
        HIPCHECK(hipSetDevice(0));
        HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
        HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
-        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                        static_cast<const int*>(A_d), static_cast<const int*>(B_d), C_d, N);
        HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
        HIPCHECK(hipDeviceSynchronize());
@@ -65,7 +65,7 @@ int main() {
        HIPCHECK(hipMemcpyPeerAsync(X_d, 1, A_d, 0, Nbytes, s));
        HIPCHECK(hipMemcpyPeerAsync(Y_d, 1, B_d, 0, Nbytes, s));

-        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                        static_cast<const int*>(X_d), static_cast<const int*>(Y_d), Z_d, N);
        HIPCHECK(hipMemcpy(C_h, Z_d, Nbytes, hipMemcpyDeviceToHost));
        HIPCHECK(hipDeviceSynchronize());
@@ -61,7 +61,7 @@ void simpleTest1() {
    HIPCHECK(memcopy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
    HIPCHECK(memcopy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));

-    hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+    hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                    static_cast<const int*>(A_d), static_cast<const int*>(B_d), C_d, N);

    HIPCHECK(memcopy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
@@ -29,7 +29,7 @@ THE SOFTWARE.
 #include <cstdio>
 #include "hip/hip_runtime.h"

-__global__ void Kernel(hipLaunchParm lp, volatile float* hostRes) {
+__global__ void Kernel(volatile float* hostRes) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    hostRes[tid] = tid + 1;
    __threadfence_system();
@@ -45,7 +45,7 @@ int main() {
    hipHostMalloc((void**)&hostRes, blocks * sizeof(float), hipHostMallocMapped);
    hostRes[0] = 0;
    hostRes[1] = 0;
-    hipLaunchKernel(HIP_KERNEL_NAME(Kernel), dim3(1), dim3(blocks), 0, 0, hostRes);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Kernel), dim3(1), dim3(blocks), 0, 0, hostRes);
    int eleCounter = 0;
    while (eleCounter < blocks) {
        // blocks until the value changes
@@ -82,9 +82,9 @@ void simpleVectorAdd(size_t numElements, int iters, hipStream_t stream) {
        // HIPCHECK(hipStreamSynchronize(stream));

        // This is the null stream?
-        // hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d,
+        // hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d,
        // C_d, numElements);
-        hipLaunchKernel(HipTest::vectorADDReverse, dim3(blocks), dim3(threadsPerBlock), 0, 0,
+        hipLaunchKernelGGL(HipTest::vectorADDReverse, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                        static_cast<const T*>(A_d), static_cast<const T*>(B_d), C_d, numElements);

        MemTraits<C>::Copy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, stream);
@@ -33,7 +33,7 @@ THE SOFTWARE.


 template <typename T>
-__global__ void Inc(hipLaunchParm lp, T* Array) {
+__global__ void Inc(T* Array) {
    int tx = threadIdx.x + blockIdx.x * blockDim.x;
    Array[tx] = Array[tx] + T(1);
 }
@@ -53,7 +53,7 @@ void run1(size_t size, hipStream_t stream) {

    HIPCHECK(hipMemcpyAsync(Bh, Ah, size, hipMemcpyHostToHost, stream));
    HIPCHECK(hipMemcpyAsync(Cd, Bh, size, hipMemcpyHostToDevice, stream));
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 500), dim3(500), 0, stream, Cd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 500), dim3(500), 0, stream, Cd);
    HIPCHECK(hipMemcpyAsync(Dd, Cd, size, hipMemcpyDeviceToDevice, stream));
    HIPCHECK(hipMemcpyAsync(Eh, Dd, size, hipMemcpyDeviceToHost, stream));
    HIPCHECK(hipDeviceSynchronize());
@@ -80,8 +80,8 @@ void run(size_t size, hipStream_t stream1, hipStream_t stream2) {
    HIPCHECK(hipMemcpyAsync(Bhh, Ahh, size, hipMemcpyHostToHost, stream2));
    HIPCHECK(hipMemcpyAsync(Cd, Bh, size, hipMemcpyHostToDevice, stream1));
    HIPCHECK(hipMemcpyAsync(Cdd, Bhh, size, hipMemcpyHostToDevice, stream2));
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 500), dim3(500), 0, stream1, Cd);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 500), dim3(500), 0, stream2, Cdd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 500), dim3(500), 0, stream1, Cd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 500), dim3(500), 0, stream2, Cdd);
    HIPCHECK(hipMemcpyAsync(Dd, Cd, size, hipMemcpyDeviceToDevice, stream1));
    HIPCHECK(hipMemcpyAsync(Ddd, Cdd, size, hipMemcpyDeviceToDevice, stream2));
    HIPCHECK(hipMemcpyAsync(Eh, Dd, size, hipMemcpyDeviceToHost, stream1));
@@ -28,7 +28,7 @@ THE SOFTWARE.

 const int NN = 1 << 21;

-__global__ void kernel(hipLaunchParm lp, float* x, float* y, int n) {
+__global__ void kernel(float* x, float* y, int n) {
    int tid = threadIdx.x;
    if (tid < 1) {
        for (int i = 0; i < n; i++) {
@@ -38,7 +38,7 @@ __global__ void kernel(hipLaunchParm lp, float* x, float* y, int n) {
    }
 }

-__global__ void nKernel(hipLaunchParm lp, float* y) {
+__global__ void nKernel(float* y) {
    int tid = threadIdx.x;
    y[tid] = y[tid] + 1.0f;
 }
@@ -55,8 +55,8 @@ int main() {
    for (int i = 0; i < num_streams; i++) {
        HIPCHECK(hipStreamCreate(&streams[i]));
        HIPCHECK(hipMalloc(&data[i], NN * sizeof(float)));
-        hipLaunchKernel(HIP_KERNEL_NAME(kernel), dim3(1), dim3(1), 0, streams[i], data[i], xd, N);
-        hipLaunchKernel(HIP_KERNEL_NAME(nKernel), dim3(1), dim3(1), 0, 0, yd);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(1), dim3(1), 0, streams[i], data[i], xd, N);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(nKernel), dim3(1), dim3(1), 0, 0, yd);
    }

    HIPCHECK(hipMemcpy(&x, xd, sizeof(float), hipMemcpyDeviceToHost));
@@ -30,7 +30,7 @@ THE SOFTWARE.

 const int NN = 1 << 21;

-__global__ void kernel(hipLaunchParm lp, float* x, float* y, int n) {
+__global__ void kernel(float* x, float* y, int n) {
    int tid = threadIdx.x;
    if (tid < 1) {
        for (int i = 0; i < n; i++) {
@@ -40,7 +40,7 @@ __global__ void kernel(hipLaunchParm lp, float* x, float* y, int n) {
    }
 }

-__global__ void nKernel(hipLaunchParm lp, float* y) {
+__global__ void nKernel(float* y) {
    int tid = threadIdx.x;
    y[tid] = y[tid] + 1.0f;
 }
@@ -57,8 +57,8 @@ int main() {
    for (int i = 0; i < num_streams; i++) {
        HIPCHECK(hipStreamCreate(&streams[i]));
        HIPCHECK(hipMalloc(&data[i], NN * sizeof(float)));
-        hipLaunchKernel(HIP_KERNEL_NAME(kernel), dim3(1), dim3(1), 0, streams[i], data[i], xd, N);
-        hipLaunchKernel(HIP_KERNEL_NAME(nKernel), dim3(1), dim3(1), 0, 0, yd);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(1), dim3(1), 0, streams[i], data[i], xd, N);
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(nKernel), dim3(1), dim3(1), 0, 0, yd);
    }

    HIPCHECK(hipMemcpy(&x, xd, sizeof(float), hipMemcpyDeviceToHost));
@@ -33,7 +33,7 @@ int p_db = 0;


 template <typename T>
-__global__ void vectorADDRepeat(hipLaunchParm lp, const T* A_d, const T* B_d, T* C_d, size_t NELEM,
+__global__ void vectorADDRepeat(const T* A_d, const T* B_d, T* C_d, size_t NELEM,
                                int repeat) {
    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
    size_t stride = blockDim.x * gridDim.x;
@@ -113,7 +113,7 @@ void Streamer<T>::enqueAsync() {
    printf("testing: %s  numElements=%zu size=%6.2fMB\n", __func__, _numElements,
           _numElements * sizeof(T) / 1024.0 / 1024.0);
    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements);
-    hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, _stream,
+    hipLaunchKernelGGL(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, _stream,
                    static_cast<const T*>(_A_d), static_cast<const T*>(_B_d), _C_d, _numElements,
                    p_repeat);
 }
@@ -206,7 +206,7 @@ int main(int argc, char* argv[]) {

            // Dispatch to NULL stream, should wait for prior async activity to complete before
            // beginning:
-            hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0,
+            hipLaunchKernelGGL(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0,
                            0 /*nullstream*/, static_cast<const int*>(lastStreamer->_C_d),
                            static_cast<const int*>(lastStreamer->_C_d), nullStreamer->_C_d,
                            numElements, 1 /*repeat*/);
@@ -242,7 +242,7 @@ int main(int argc, char* argv[]) {

            // Dispatch to NULL stream, should wait for prior async activity to complete before
            // beginning:
-            hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0,
+            hipLaunchKernelGGL(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0,
                            0 /*nullstream*/, static_cast<const int*>(lastStreamer->_C_d),
                            static_cast<const int*>(lastStreamer->_C_d), nullStreamer->_C_d,
                            numElements, 1 /*repeat*/);
@@ -72,7 +72,7 @@ void D2H(T* Dst, T* Src, size_t size) {
 }

 template <typename T>
-__global__ void Inc(hipLaunchParm lp, T* In) {
+__global__ void Inc(T* In) {
    int tx = threadIdx.x + blockIdx.x * blockDim.x;
    In[tx] = In[tx] + 1;
 }
@@ -76,7 +76,7 @@ void test12345() {

    H2HAsync(Bh, Ah, size, stream);
    H2DAsync(Ad, Bh, size, stream);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Ad);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Ad);
    D2DAsync(Bd, Ad, size, stream);
    D2HAsync(Ch, Bd, size, stream);
    HIPCHECK(hipDeviceSynchronize());
@@ -111,7 +111,7 @@ void test13452() {
    H2D(Ad, Dh, size);

    H2HAsync(Bh, Ah, size, stream);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Ad);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Ad);
    D2DAsync(Bd, Ad, size, stream);
    D2HAsync(Ch, Bd, size, stream);
    H2DAsync(Cd, Ch, size, stream);
@@ -152,7 +152,7 @@ void test14523() {
    D2DAsync(Bd, Ad, size, stream);
    D2HAsync(Ch, Bd, size, stream);
    H2DAsync(Cd, Ch, size, stream);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Cd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Cd);

    HIPCHECK(hipDeviceSynchronize());

@@ -190,7 +190,7 @@ void test15234() {
    H2HAsync(Bh, Ah, size, stream);
    D2HAsync(Ch, Ad, size, stream);
    H2DAsync(Bd, Ch, size, stream);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Bd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Bd);
    D2DAsync(Cd, Bd, size, stream);

    D2H(Eh, Cd, size);
@@ -217,7 +217,7 @@ void test23451() {
    setArray(Ah, N, T(1));

    H2DAsync(Ad, Ah, size, stream);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Ad);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Ad);
    D2DAsync(Bd, Ad, size, stream);
    D2HAsync(Bh, Bd, size, stream);
    H2HAsync(Ch, Bh, size, stream);
@@ -254,7 +254,7 @@ void test24513() {
    D2DAsync(Bd, Ad, size, stream);
    D2HAsync(Bh, Bd, size, stream);
    H2HAsync(Ch, Bh, size, stream);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Cd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Cd);
    HIPCHECK(hipDeviceSynchronize());

    D2H(Eh, Cd, size);
@@ -291,7 +291,7 @@ void test25134() {
    H2DAsync(Ad, Ah, size, stream);
    D2HAsync(Bh, Ad, size, stream);
    H2HAsync(Ch, Bh, size, stream);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Bd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Bd);
    D2DAsync(Cd, Bd, size, stream);

    D2H(Eh, Cd, size);
@@ -324,7 +324,7 @@ void test21345() {

    H2DAsync(Ad, Ah, size, stream);
    H2HAsync(Ch, Bh, size, stream);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Ad);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Ad);
    D2DAsync(Bd, Ad, size, stream);
    D2HAsync(Dh, Bd, size, stream);

@@ -358,7 +358,7 @@ void test34512() {

    H2D(Ad, Ah, size);

-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Ad);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Ad);
    D2DAsync(Bd, Ad, size, stream);
    D2HAsync(Bh, Bd, size, stream);
    H2HAsync(Ch, Bh, size, stream);
@@ -393,7 +393,7 @@ void test35124() {

    H2D(Ad, Dh, size);

-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Ad);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Ad);
    D2HAsync(Ah, Ad, size, stream);
    H2HAsync(Bh, Ah, size, stream);
    H2DAsync(Bd, Bh, size, stream);
@@ -430,7 +430,7 @@ void test31245() {

    H2D(Ad, Dh, size);

-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Ad);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Ad);
    H2HAsync(Bh, Ah, size, stream);
    H2DAsync(Bd, Bh, size, stream);
    D2DAsync(Cd, Bd, size, stream);
@@ -469,7 +469,7 @@ void test32451() {
    setArray(Eh, N, T(2));

    H2D(Ad, Eh, size);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Ad);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Ad);
    H2DAsync(Bd, Ah, size, stream);
    D2DAsync(Cd, Bd, size, stream);
    D2HAsync(Bh, Cd, size, stream);
@@ -507,7 +507,7 @@ void test45123() {
    D2HAsync(Ah, Bd, size, stream);
    H2HAsync(Bh, Ah, size, stream);
    H2DAsync(Cd, Bh, size, stream);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Cd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Cd);
    D2H(Ch, Cd, size);
    HIPCHECK(hipDeviceSynchronize());

@@ -539,7 +539,7 @@ void test41235() {
    D2DAsync(Bd, Ad, size, stream);
    D2HAsync(Ah, Bd, size, stream);
    H2DAsync(Cd, Ah, size, stream);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Cd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Cd);
    D2HAsync(Bh, Cd, size, stream);

    HIPCHECK(hipDeviceSynchronize());
@@ -574,7 +574,7 @@ void test42351() {

    D2DAsync(Bd, Ad, size, stream);
    H2DAsync(Cd, Ah, size, stream);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Cd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Cd);
    D2HAsync(Bh, Cd, size, stream);
    H2HAsync(Ch, Bh, size, stream);

@@ -609,7 +609,7 @@ void test43512() {
    H2D(Ad, Dh, size);

    D2DAsync(Bd, Ad, size, stream);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Bd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Bd);
    D2HAsync(Ah, Bd, size, stream);
    H2HAsync(Bh, Ah, size, stream);
    H2DAsync(Cd, Bh, size, stream);
@@ -645,7 +645,7 @@ void test51234() {
    D2HAsync(Ah, Ad, size, stream);
    H2HAsync(Bh, Ah, size, stream);
    H2DAsync(Bd, Bh, size, stream);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Bd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Bd);
    D2DAsync(Cd, Bd, size, stream);

    D2H(Ch, Cd, size);
@@ -681,7 +681,7 @@ void test52341() {

    D2HAsync(Ah, Ad, size, stream);
    H2DAsync(Bd, Ah, size, stream);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Bd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Bd);
    D2DAsync(Cd, Bd, size, stream);
    H2HAsync(Ch, Bh, size, stream);

@@ -723,7 +723,7 @@ void test53412() {
    H2D(Bd, Eh, size);

    D2HAsync(Ah, Ad, size, stream);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Bd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Bd);
    D2DAsync(Cd, Bd, size, stream);
    H2HAsync(Ch, Bh, size, stream);
    H2DAsync(Dd, Ch, size, stream);
@@ -770,7 +770,7 @@ void test54123() {
    D2DAsync(Cd, Bd, size, stream);
    H2HAsync(Ch, Bh, size, stream);
    H2DAsync(Dd, Ch, size, stream);
-    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Dd);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(Inc), dim3(N / 512), dim3(512), 0, stream, Dd);

    D2H(Fh, Cd, size);
    D2H(Gh, Dd, size);
@@ -121,7 +121,7 @@ unsigned setNumBlocks(unsigned blocksPerCU, unsigned threadsPerBlock, size_t N);


 template <typename T>
-__global__ void vectorADD(hipLaunchParm lp, const T* A_d, const T* B_d, T* C_d, size_t NELEM) {
+__global__ void vectorADD(const T* A_d, const T* B_d, T* C_d, size_t NELEM) {
    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
    size_t stride = blockDim.x * gridDim.x;

@@ -132,7 +132,7 @@ __global__ void vectorADD(hipLaunchParm lp, const T* A_d, const T* B_d, T* C_d,


 template <typename T>
-__global__ void vectorADDReverse(hipLaunchParm lp, const T* A_d, const T* B_d, T* C_d,
+__global__ void vectorADDReverse(const T* A_d, const T* B_d, T* C_d,
                                 size_t NELEM) {
    size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
    size_t stride = blockDim.x * gridDim.x;