SWDEV-337452 - Changing Clock64 to WallClock64 in directed tests. (#3140)

Change-Id: I511ab4dcc61daee4fdfbd2a248b5fe74e52174b2
2023-03-21 01:48:32 +05:30
Commit 0ea181501c
@@ -43,6 +43,20 @@ THE SOFTWARE.
      Ad[tid] = clock() + clock64() + __clock() + __clock64() - Ad[tid];
  }

+  static __global__ void kernel1_gfx11(long long* Ad) {
+#ifdef __HIP_PLATFORM_AMD__
+      int tid = threadIdx.x + blockIdx.x * blockDim.x;
+      Ad[tid] = clock() + wall_clock64() + __clock() + __clock64();
+#endif
+  }
+
+  static __global__ void kernel2_gfx11(long long* Ad) {
+#ifdef __HIP_PLATFORM_AMD__
+      int tid = threadIdx.x + blockIdx.x * blockDim.x;
+      Ad[tid] = clock() + wall_clock64() + __clock() + __clock64() - Ad[tid];
+#endif
+  }
+
  void run() {
    long long *A, *Ad;
    A = new long long[LEN];
@@ -50,9 +64,15 @@ THE SOFTWARE.
        A[i] = 0;
    }

+    auto kernel1_used = IsGfx11() ? kernel1_gfx11 : kernel1;
+    auto kernel2_used = IsGfx11() ? kernel2_gfx11 : kernel2;
+
    HIP_ASSERT(hipMalloc((void**)&Ad, SIZE));
-    hipLaunchKernelGGL(kernel1, dim3(1, 1, 1), dim3(LEN, 1, 1), 0, 0, Ad);
-    hipLaunchKernelGGL(kernel2, dim3(1, 1, 1), dim3(LEN, 1, 1), 0, 0, Ad);
+
+    hipLaunchKernelGGL(kernel1_used, dim3(1, 1, 1),
+                       dim3(LEN, 1, 1), 0, 0, Ad);
+    hipLaunchKernelGGL(kernel2_used, dim3(1, 1, 1),
+                       dim3(LEN, 1, 1), 0, 0, Ad);
    HIP_ASSERT(hipMemcpy(A, Ad, SIZE, hipMemcpyDeviceToHost));

    for (unsigned i = 0; i < LEN; i++) {
@@ -124,6 +124,13 @@ __global__ void test_kernel(long long *array) {
  array[rank] += clock64();
 }

+__global__ void test_kernel_gfx11(long long *array) {
+#ifdef __HIP_PLATFORM_AMD__
+  unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
+  array[rank] += wall_clock64();
+#endif
+}
+
 int main(int argc, char** argv) {
  hipError_t err;
  int device_num, FailFlag = 0;
@@ -155,6 +162,7 @@ int main(int argc, char** argv) {
    int num_sms = device_properties.multiProcessorCount;
    int max_num_threads = device_properties.maxThreadsPerBlock;

+    auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
    // Check single-thread block, all numbers of warps, then too-large block
    for (int block_size = 0; block_size <= (max_num_threads + warp_size);
         block_size += warp_size) {
@@ -163,9 +171,8 @@ int main(int argc, char** argv) {
      }
      int max_blocks_per_sm;
      // Calculate the device occupancy to know how many blocks can be run.
-      HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
-              &max_blocks_per_sm, test_kernel, block_size, 0,
-              hipOccupancyDefault));
+      HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(&max_blocks_per_sm,
+               test_kernel_used, block_size, 0, hipOccupancyDefault));

      if ((block_size > max_num_threads) && (max_blocks_per_sm != 0)) {
        std::cerr << "ERROR! Occupancy API indicated that we can have >0 ";
@@ -212,7 +219,7 @@ int main(int argc, char** argv) {
        coop_params[i][0] = reinterpret_cast<void*>(&dev_array[i]);
      }

-      err = hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
+      err = hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used),
                                       2 * desired_blocks, block_size,
                                       coop_params[0], 0, streams[0]);

@@ -235,9 +242,8 @@ int main(int argc, char** argv) {
      }

      HIPCHECK(hipDeviceSynchronize());
-      err = hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
-                                       desired_blocks, block_size,
-                                       coop_params[1], 0, streams[1]);
+      err = hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used), desired_blocks,
+                                       block_size, coop_params[1], 0, streams[1]);

      if (expect_fail) {
        expect_to_see = hipErrorInvalidConfiguration;
@@ -106,6 +106,29 @@ __global__ void test_kernel(uint32_t loops, unsigned long long *array, long long
  }
 }

+__global__ void test_kernel_gfx11(uint32_t loops, unsigned long long *array, long long totalTicks) {
+#ifdef __HIP_PLATFORM_AMD__
+  cooperative_groups::thread_block tb = cooperative_groups::this_thread_block();
+  unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
+
+  for (int i = 0; i < loops; i++) {
+    long long time_diff = 0;
+    long long last_clock = wall_clock64();
+    do {
+      long long cur_clock = wall_clock64();
+      if (cur_clock > last_clock) {
+        time_diff += (cur_clock - last_clock);
+      }
+      // If it rolls over, we don't know how much to add to catch up.
+      // So just ignore those slipped cycles.
+      last_clock = cur_clock;
+    } while(time_diff < totalTicks);
+    tb.sync();
+    array[rank] += wall_clock64();
+  }
+#endif
+}
+
 template<typename T>
 bool verifyLeastCapacity(T& single_kernel_time, T& double_kernel_time, T& triple_kernel_time)
 {
@@ -256,8 +279,8 @@ int main(int argc, char** argv) {
    long long totalTicks = device_properties.clockRate ;
    int max_blocks_per_sm = 0;
    // Calculate the device occupancy to know how many blocks can be run.
-    HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm,
-                                                          test_kernel,
+    auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+    HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, test_kernel_used,
                                                          warp_size, 0));
    int max_active_blocks = max_blocks_per_sm * num_sms;
    int coop_blocks = 0;
@@ -325,9 +348,10 @@ int main(int argc, char** argv) {
    }

    // Verify over capacity
-    HIPCHECK_API(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
+    HIPCHECK_API(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used),
                                     max_active_blocks + 1, warp_size,
-                                     coop_params[0], 0, streams[0]), hipErrorCooperativeLaunchTooLarge);
+                                     coop_params[0], 0, streams[0]),
+                                     hipErrorCooperativeLaunchTooLarge);

    std::cout << "Launching an initial single cooperative kernel..." << std::endl;
    // We need exclude the the initial launching as it will need time to load code obj.
@@ -192,6 +192,20 @@ __global__ void second_test_kernel(long long *array) {
    array[rank] += clock64();
 }

+__global__ void test_kernel_gfx11(long long *array) {
+#ifdef __HIP_PLATFORM_AMD__
+    unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
+    array[rank] += wall_clock64();
+#endif
+}
+
+__global__ void second_test_kernel_gfx11(long long *array) {
+#ifdef __HIP_PLATFORM_AMD__
+    unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
+    array[rank] += wall_clock64();
+#endif
+}
+
 int main(int argc, char** argv) {
  hipError_t err;
  /*************************************************************************/
@@ -245,8 +259,9 @@ int main(int argc, char** argv) {
    int max_blocks_per_sm = INT_MAX;
    for (int i = 0; i < 2; i++) {
      HIPCHECK(hipSetDevice((dev + i)));
+      auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; 
      HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
-               &max_blocks_per_sm_arr[i], test_kernel, warp_size, 0));
+               &max_blocks_per_sm_arr[i], test_kernel_used, warp_size, 0));
      if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
          max_blocks_per_sm = max_blocks_per_sm_arr[i];
      }
@@ -293,7 +308,8 @@ int main(int argc, char** argv) {
    for (int i = 0; i < 2; i++) {
      dev_params[i][0] = reinterpret_cast<void*>(&bad_dev_array[i]);

-      md_params[i].func = reinterpret_cast<void*>(test_kernel);
+      auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+      md_params[i].func = reinterpret_cast<void*>(test_kernel_used);
      md_params[i].gridDim = 2 * desired_blocks;
      md_params[i].blockDim = warp_size;
      md_params[i].sharedMem = 0;
@@ -370,7 +386,8 @@ int main(int argc, char** argv) {
        supports_sep_kernels = false;
      }
    }
-    md_params[1].func = reinterpret_cast<void*>(second_test_kernel);
+    auto second_test_kernel_used = IsGfx11() ? second_test_kernel_gfx11 : second_test_kernel;
+    md_params[1].func = reinterpret_cast<void*>(second_test_kernel_used);
    err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
    if ((supports_sep_kernels && err != hipSuccess) ||
        (!supports_sep_kernels && err != hipErrorInvalidValue)) {
@@ -405,7 +422,8 @@ int main(int argc, char** argv) {
    std::cout << "different grid sizes." << std::endl;
    bool supports_sep_sizes = true;
    for (int i = 0; i < 2; i++) {
-      md_params[i].func = reinterpret_cast<void*>(test_kernel);
+      auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+      md_params[i].func = reinterpret_cast<void*>(test_kernel_used);
      md_params[i].gridDim = i+1;
      if (!support_for_separate_grid_sizes((dev + i))) {
        supports_sep_sizes = false;
@@ -195,6 +195,55 @@ __global__ void test_kernel(uint32_t loops, unsigned long long *array) {
  }
 }

+__global__ void test_coop_kernel_gfx11(unsigned int loops, long long *array,
+                                       int fast_gpu) {
+#ifdef __HIP_PLATFORM_AMD__
+  cooperative_groups::multi_grid_group mgrid =
+  cooperative_groups::this_multi_grid();
+  unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (mgrid.grid_rank() == fast_gpu) {
+    return;
+  }
+
+  for (int i = 0; i < loops; i++) {
+    long long time_diff = 0;
+    long long last_clock = wall_clock64();
+    do {
+      long long cur_clock = wall_clock64();
+      if (cur_clock > last_clock) {
+        time_diff += (cur_clock - last_clock);
+      }
+      // If it rolls over, we don't know how much to add to catch up.
+      // So just ignore those slipped cycles.
+      last_clock = cur_clock;
+    } while(time_diff < 1000000);
+    array[rank] += wall_clock64();
+  }
+#endif
+}
+
+__global__ void test_kernel_gfx11(uint32_t loops, unsigned long long *array) {
+#ifdef __HIP_PLATFORM_AMD__
+  unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
+
+  for (int i = 0; i < loops; i++) {
+    long long time_diff = 0;
+    long long last_clock = wall_clock64();
+    do {
+      long long cur_clock = wall_clock64();
+      if (cur_clock > last_clock) {
+        time_diff += (cur_clock - last_clock);
+      }
+      // If it rolls over, we don't know how much to add to catch up.
+      // So just ignore those slipped cycles.
+      last_clock = cur_clock;
+    } while(time_diff < 1000000);
+    array[rank] += wall_clock64();
+  }
+#endif
+}
+
 int main(int argc, char** argv) {
  hipError_t err;
  int device_num, FailFlag = 0;
@@ -249,8 +298,9 @@ int main(int argc, char** argv) {
    int max_blocks_per_sm = INT_MAX;
    for (int i = 0; i < 2; i++) {
      HIPCHECK(hipSetDevice(dev + i));
+      auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
      HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
-               &max_blocks_per_sm_arr[i], test_kernel, warp_size, 0));
+               &max_blocks_per_sm_arr[i], test_kernel_used, warp_size, 0));
      if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
        max_blocks_per_sm = max_blocks_per_sm_arr[i];
      }
@@ -302,11 +352,12 @@ int main(int argc, char** argv) {
    std::cout << "GPU " << dev << ": Long Coop Kernel" << std::endl;
    std::cout << "GPU " << (dev + 1) << ": Long Coop Kernel" << std::endl;

+    auto test_coop_kernel_used = IsGfx11() ? test_coop_kernel_gfx11 : test_coop_kernel;
    for (int i = 0; i < 2; i++) {
      dev_params[i][0] = reinterpret_cast<void*>(&loops);
      dev_params[i][1] = reinterpret_cast<void*>(&dev_array[i]);
      dev_params[i][2] = reinterpret_cast<void*>(&fast_gpu);
-      md_params[i].func = reinterpret_cast<void*>(test_coop_kernel);
+      md_params[i].func = reinterpret_cast<void*>(test_coop_kernel_used);
      md_params[i].gridDim = desired_blocks;
      md_params[i].blockDim = warp_size;
      md_params[i].sharedMem = 0;
@@ -331,12 +382,14 @@ int main(int argc, char** argv) {
    fast_gpu = 1;
    start_time[1] = std::chrono::system_clock::now();
    HIPCHECK(hipSetDevice(dev));
-    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
+    auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+    hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
                       streams[0], loops, dev_array[0]);
    HIPCHECK(hipGetLastError());
    HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
    HIPCHECK(hipSetDevice(dev + 1));
-    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
+    test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+    hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
                       streams[1], loops, dev_array[1]);
    HIPCHECK(hipGetLastError());
    for (int i = 0; i < 2; i++) {
@@ -356,12 +409,14 @@ int main(int argc, char** argv) {
    fast_gpu = 0;
    start_time[2] = std::chrono::system_clock::now();
    HIPCHECK(hipSetDevice(dev));
-    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
+    test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+    hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
                       streams[0], loops, dev_array[0]);
    HIPCHECK(hipGetLastError());
    HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
    HIPCHECK(hipSetDevice(dev + 1));
-    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
+    test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+    hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
                       streams[1], loops, dev_array[1]);
    HIPCHECK(hipGetLastError());
    for (int i = 0; i < 2; i++) {
@@ -382,13 +437,15 @@ int main(int argc, char** argv) {
    fast_gpu = 0;
    start_time[3] = std::chrono::system_clock::now();
    HIPCHECK(hipSetDevice(dev));
-    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
+    test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+    hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
                       streams[0], loops, dev_array[0]);
    HIPCHECK(hipGetLastError());
    HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2,
             hipCooperativeLaunchMultiDeviceNoPreSync));
    HIPCHECK(hipSetDevice(dev + 1));
-    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
+    test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+    hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
                       streams[1], loops, dev_array[1]);
    HIPCHECK(hipGetLastError());
    for (int i = 0; i < 2; i++) {
@@ -409,13 +466,15 @@ int main(int argc, char** argv) {
    fast_gpu = 1;
    start_time[4] = std::chrono::system_clock::now();
    HIPCHECK(hipSetDevice(dev));
-    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
+    test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+    hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
                       streams[0], loops, dev_array[0]);
    HIPCHECK(hipGetLastError());
    HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2,
             hipCooperativeLaunchMultiDeviceNoPostSync));
    HIPCHECK(hipSetDevice(dev + 1));
-    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
+    test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+    hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
                       streams[1], loops, dev_array[1]);
    for (int i = 0; i < 2; i++) {
      HIPCHECK(hipSetDevice(dev + i));
@@ -434,14 +493,16 @@ int main(int argc, char** argv) {
    std::cout << "  Kernel\n";
    start_time[5] = std::chrono::system_clock::now();
    HIPCHECK(hipSetDevice(dev));
-    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
+    test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+    hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
                       streams[0], loops, dev_array[0]);
    HIPCHECK(hipGetLastError());
    HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2,
             hipCooperativeLaunchMultiDeviceNoPreSync |
             hipCooperativeLaunchMultiDeviceNoPostSync));
    HIPCHECK(hipSetDevice(dev + 1));
-    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
+    test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+    hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
                       streams[1], loops, dev_array[1]);
    HIPCHECK(hipGetLastError());
    for (int i = 0; i < 2; i++) {
@@ -204,6 +204,81 @@ test_kernel(unsigned int *atomic_val, unsigned int *global_array,
  }
 }

+__global__ void
+test_kernel_gfx11(unsigned int *atomic_val, unsigned int *global_array,
+                  unsigned int *array, uint32_t loops) {
+#ifdef __HIP_PLATFORM_AMD__
+  cooperative_groups::grid_group grid = cooperative_groups::this_grid();
+  cooperative_groups::multi_grid_group mgrid =
+                      cooperative_groups::this_multi_grid();
+  unsigned rank = grid.thread_rank();
+  unsigned global_rank = mgrid.thread_rank();
+
+  int offset = blockIdx.x;
+  for (int i = 0; i < loops; i++) {
+    // Make the last thread run way behind everyone else.
+    // If the grid barrier below fails, then the other threads may hit the
+    // atomicInc instruction many times before the last thread ever gets
+    // to it.
+    // As such, without the barrier, the last array entry will eventually
+    // contain a very large value, defined by however many times the other
+    // wavefronts make it through this loop.
+    // If the barrier works, then it will likely contain some number
+    // near "total number of blocks". It will be the last wavefront to
+    // reach the atomicInc, but everyone will have only hit the atomic once.
+    if (rank == (grid.size() - 1)) {
+      long long time_diff = 0;
+      long long last_clock = wall_clock64();
+      do {
+        long long cur_clock = wall_clock64();
+        if (cur_clock > last_clock) {
+          time_diff += (cur_clock - last_clock);
+        }
+        // If it rolls over, we don't know how much to add to catch up.
+        // So just ignore those slipped cycles.
+        last_clock = cur_clock;
+      } while(time_diff < 1000000);
+    }
+    if (threadIdx.x == 0) {
+      array[offset] = atomicInc(atomic_val, UINT_MAX);
+    }
+    grid.sync();
+
+    // Make the last thread in the entire multi-grid run way behind
+    // everyone else.
+    // If the mgrid barrier below fails, then the two global_array entries
+    // will end up being out of sync, because the intermingling of adds
+    // and multiplies will not be aligned between to the two GPUs.
+    if (global_rank == (mgrid.size() - 1)) {
+      long long time_diff = 0;
+      long long last_clock = wall_clock64();
+      do {
+        long long cur_clock = wall_clock64();
+        if (cur_clock > last_clock) {
+          time_diff += (cur_clock - last_clock);
+        }
+        // If it rolls over, we don't know how much to add to catch up.
+        // So just ignore those slipped cycles.
+        last_clock = cur_clock;
+      } while(time_diff < 1000000);
+    }
+    // During even iterations, add into your own array entry
+    // During odd iterations, add into your partner's array entry
+    unsigned grid_rank = mgrid.grid_rank();
+    unsigned inter_gpu_offset = (grid_rank + i) % mgrid.num_grids();
+    if (rank == (grid.size() - 1)) {
+      if (i % mgrid.num_grids() == 0) {
+        global_array[grid_rank] += 2;
+      } else {
+        global_array[inter_gpu_offset] *= 2;
+      }
+    }
+    mgrid.sync();
+    offset += gridDim.x;
+  }
+#endif 
+}
+
 int main(int argc, char** argv) {
  hipError_t err;
  int num_devices = 0;
@@ -265,8 +340,9 @@ int main(int argc, char** argv) {
  int max_blocks_per_sm = INT_MAX;
  for (int i = 0; i < num_devices; i++) {
    HIPCHECK(hipSetDevice(device_num[i]));
+    auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
    HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
-            &max_blocks_per_sm_arr[i], test_kernel, num_threads_in_block, 0));
+            &max_blocks_per_sm_arr[i], test_kernel_used, num_threads_in_block, 0));
    if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
      max_blocks_per_sm = max_blocks_per_sm_arr[i];
    }
@@ -320,12 +396,13 @@ int main(int argc, char** argv) {

  void *dev_params[num_devices][4];
  hipLaunchParams md_params[num_devices];
+  auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
  for (int i = 0; i < num_devices; i++) {
    dev_params[i][0] = reinterpret_cast<void*>(&kernel_atomic[i]);
    dev_params[i][1] = reinterpret_cast<void*>(&global_array);
    dev_params[i][2] = reinterpret_cast<void*>(&kernel_buffer[i]);
    dev_params[i][3] = reinterpret_cast<void*>(&loops);
-    md_params[i].func = reinterpret_cast<void*>(test_kernel);
+    md_params[i].func = reinterpret_cast<void*>(test_kernel_used);
    md_params[i].gridDim = requested_blocks;
    md_params[i].blockDim = num_threads_in_block;
    md_params[i].sharedMem = 0;
@@ -131,6 +131,48 @@ test_kernel(unsigned int *atomic_val, unsigned int *array,
  }
 }

+__global__ void
+test_kernel_gfx11(unsigned int *atomic_val, unsigned int *array,
+                  unsigned int loops) {
+#ifdef __HIP_PLATFORM_AMD__
+  cooperative_groups::grid_group grid = cooperative_groups::this_grid();
+  unsigned rank = grid.thread_rank();
+
+  int offset = blockIdx.x;
+  for (int i = 0; i < loops; i++) {
+    // Make the last thread run way behind everyone else.
+    // If the barrier below fails, then the other threads may hit the
+    // atomicInc instruction many times before the last thread ever gets
+    // to it.
+    // As such, without the barrier, the last array entry will eventually
+    // contain a very large value, defined by however many times the other
+    // wavefronts make it through this loop.
+    // If the barrier works, then it will likely contain some number
+    // near "total number of blocks". It will be the last wavefront to
+    // reach the atomicInc, but everyone will have only hit the atomic once.
+    if (rank == (grid.size() - 1)) {
+      long long time_diff = 0;
+      long long last_clock = wall_clock64();
+      do {
+        long long cur_clock = wall_clock64();
+        if (cur_clock > last_clock) {
+          time_diff += (cur_clock - last_clock);
+        }
+        // If it rolls over, we don't know how much to add to catch up.
+        // So just ignore those slipped cycles.
+        last_clock = cur_clock;
+      } while(time_diff < 1000000);
+    }
+
+    if (threadIdx.x == 0) {
+      array[offset] = atomicInc(&atomic_val[0], UINT_MAX);
+    }
+    grid.sync();
+    offset += gridDim.x;
+  }
+#endif
+}
+
 int main(int argc, char** argv) {
  hipError_t err;
  int device_num;
@@ -167,9 +209,10 @@ int main(int argc, char** argv) {

    int num_threads_in_block = block_size * warp_size;

+    auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
    // Calculate the device occupancy to know how many blocks can be run.
    HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm,
-             test_kernel, num_threads_in_block, 0));
+             test_kernel_used, num_threads_in_block, 0));

    int requested_blocks = warps / block_size;
    if (requested_blocks > max_blocks_per_sm * num_sms) {
@@ -211,7 +254,8 @@ int main(int argc, char** argv) {
    params[0] = reinterpret_cast<void*>(&kernel_atomic);
    params[1] = reinterpret_cast<void*>(&kernel_buffer);
    params[2] = reinterpret_cast<void*>(&loops);
-    HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
+    test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+    HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used),
                                        requested_blocks,
                                        num_threads_in_block, params, 0, NULL));

@@ -204,6 +204,81 @@ test_kernel(unsigned int *atomic_val, unsigned int *global_array,
  }
 }

+__global__ void
+test_kernel_gfx11(unsigned int *atomic_val, unsigned int *global_array,
+                  unsigned int *array, uint32_t loops) {
+#ifdef __HIP_PLATFORM_AMD__
+  cooperative_groups::grid_group grid = cooperative_groups::this_grid();
+  cooperative_groups::multi_grid_group mgrid =
+                      cooperative_groups::this_multi_grid();
+  unsigned rank = grid.thread_rank();
+  unsigned global_rank = mgrid.thread_rank();
+
+  int offset = blockIdx.x;
+  for (int i = 0; i < loops; i++) {
+    // Make the last thread run way behind everyone else.
+    // If the grid barrier below fails, then the other threads may hit the
+    // atomicInc instruction many times before the last thread ever gets
+    // to it.
+    // As such, without the barrier, the last array entry will eventually
+    // contain a very large value, defined by however many times the other
+    // wavefronts make it through this loop.
+    // If the barrier works, then it will likely contain some number
+    // near "total number of blocks". It will be the last wavefront to
+    // reach the atomicInc, but everyone will have only hit the atomic once.
+    if (rank == (grid.size() - 1)) {
+      long long time_diff = 0;
+      long long last_clock = wall_clock64();
+      do {
+        long long cur_clock = wall_clock64();
+        if (cur_clock > last_clock) {
+          time_diff += (cur_clock - last_clock);
+        }
+        // If it rolls over, we don't know how much to add to catch up.
+        // So just ignore those slipped cycles.
+        last_clock = cur_clock;
+      } while(time_diff < 1000000);
+    }
+    if (threadIdx.x == 0) {
+      array[offset] = atomicInc(atomic_val, UINT_MAX);
+    }
+    grid.sync();
+
+    // Make the last thread in the entire multi-grid run way behind
+    // everyone else.
+    // If the mgrid barrier below fails, then the two global_array entries
+    // will end up being out of sync, because the intermingling of adds
+    // and multiplies will not be aligned between to the two GPUs.
+    if (global_rank == (mgrid.size() - 1)) {
+      long long time_diff = 0;
+      long long last_clock = wall_clock64();
+      do {
+        long long cur_clock = wall_clock64();
+        if (cur_clock > last_clock) {
+          time_diff += (cur_clock - last_clock);
+        }
+        // If it rolls over, we don't know how much to add to catch up.
+        // So just ignore those slipped cycles.
+        last_clock = cur_clock;
+      } while(time_diff < 1000000);
+    }
+    // During even iterations, add into your own array entry
+    // During odd iterations, add into your partner's array entry
+    unsigned grid_rank = mgrid.grid_rank();
+    unsigned inter_gpu_offset = (grid_rank + i) % mgrid.num_grids();
+    if (rank == (grid.size() - 1)) {
+      if (i % mgrid.num_grids() == 0) {
+        global_array[grid_rank] += 2;
+      } else {
+        global_array[inter_gpu_offset] *= 2;
+      }
+    }
+    mgrid.sync();
+    offset += gridDim.x;
+  }
+#endif
+}
+
 int main(int argc, char** argv) {
    hipError_t err;
    int device_num = 0, flag = 0;
@@ -263,8 +338,9 @@ int main(int argc, char** argv) {
    int max_blocks_per_sm = INT_MAX;
    for (int i = 0; i < 2; i++) {
      HIPCHECK(hipSetDevice((d + i)));
+      auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
      HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
-              &max_blocks_per_sm_arr[i], test_kernel, num_threads_in_block,
+              &max_blocks_per_sm_arr[i], test_kernel_used, num_threads_in_block,
              0));
      if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
        max_blocks_per_sm = max_blocks_per_sm_arr[i];
@@ -319,11 +395,12 @@ int main(int argc, char** argv) {
    void *dev_params[2][4];
    hipLaunchParams md_params[2];
    for (int i = 0; i < 2; i++) {
+      auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
      dev_params[i][0] = reinterpret_cast<void*>(&kernel_atomic[i]);
      dev_params[i][1] = reinterpret_cast<void*>(&global_array);
      dev_params[i][2] = reinterpret_cast<void*>(&kernel_buffer[i]);
      dev_params[i][3] = reinterpret_cast<void*>(&loops);
-      md_params[i].func = reinterpret_cast<void*>(test_kernel);
+      md_params[i].func = reinterpret_cast<void*>(test_kernel_used);
      md_params[i].gridDim = requested_blocks;
      md_params[i].blockDim = num_threads_in_block;
      md_params[i].sharedMem = 0;
@@ -44,10 +44,20 @@ extern "C" __global__ void WaitKernel(int *Ad, int clockrate) {
  *Ad = 1;
 }

+extern "C" __global__ void WaitKernel_gfx11(int *Ad, int clockrate) {
+#ifdef __HIP_PLATFORM_AMD__
+  uint64_t wait_t = 500,
+  start = wall_clock64()/clockrate, cycles;
+  do { cycles = wall_clock64()/clockrate-start;} while (cycles < wait_t);
+  *Ad = 1;
+#endif
+}
+
 void t1(hipEvent_t start, hipStream_t stream1, int clkRate, int *A, int *Ad) {
  *A = 0;

-  hipLaunchKernelGGL(HIP_KERNEL_NAME(WaitKernel), dim3(1), dim3(1), 0, stream1, Ad, clkRate);
+  auto WaitKernel_used = IsGfx11() ? WaitKernel_gfx11 : WaitKernel;
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(WaitKernel_used), dim3(1), dim3(1), 0, stream1, Ad, clkRate);

  HIPCHECK(hipEventRecord(start, stream1));

@@ -69,7 +79,11 @@ int main(int argc, char* argv[]) {
    HIPCHECK(hipHostGetDevicePointer((void**)&Ad[i], A[i], 0));
  }

-  HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+  if (IsGfx11()) {
+    HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
+  } else {
+    HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+  }
  hipStream_t stream1;
  hipStreamCreate(&stream1);
  hipEvent_t start;
@@ -62,6 +62,34 @@ __global__ void FourSecKernel(int clockrate) {
  }
 }

+__global__ void TwoSecKernel_gfx11(int clockrate) {
+#ifdef __HIP_PLATFORM_AMD__
+  if (globalvar == 0x2222) {
+    globalvar = 0x3333;
+  }
+  uint64_t wait_t = 2000,
+  start = wall_clock64()/clockrate, cur;
+  do { cur = (wall_clock64()/clockrate)-start;}while (cur < wait_t);
+  if (globalvar != 0x3333) {
+    globalvar = 0x5555;
+  }
+#endif
+}
+
+__global__ void FourSecKernel_gfx11(int clockrate) {
+#ifdef __HIP_PLATFORM_AMD__
+  if (globalvar == 1) {
+    globalvar = 0x2222;
+  }
+  uint64_t wait_t = 4000,
+  start = wall_clock64()/clockrate, cur;
+  do { cur = (wall_clock64()/clockrate)-start;}while (cur < wait_t);
+  if (globalvar == 0x2222) {
+    globalvar = 0x4444;
+  }
+#endif
+}
+
 /*
 * In this Scenario, we create events by disabling the timing flag
 * We then Launch the kernel using hipExtModuleLaunchKernel by passing
@@ -76,13 +104,20 @@ bool DisableTimeFlag() {
  float time_4sec, time_2sec;
  hipEvent_t  start_event1, end_event1;
  int clkRate = 0;
-  HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+
+  if (IsGfx11()) {
+    HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
+  } else {
+    HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+  }
+
  HIPCHECK(hipEventCreateWithFlags(&start_event1,
                                   hipEventDisableTiming));
  HIPCHECK(hipEventCreateWithFlags(&end_event1,
                                   hipEventDisableTiming));
  HIPCHECK(hipStreamCreate(&stream1));
-  hipExtLaunchKernelGGL((TwoSecKernel), dim3(1), dim3(1), 0,
+  auto TwoSecKernel_used = IsGfx11() ? TwoSecKernel_gfx11 : TwoSecKernel;
+  hipExtLaunchKernelGGL((TwoSecKernel_used), dim3(1), dim3(1), 0,
      stream1, start_event1, end_event1, 0, clkRate);
  HIPCHECK(hipStreamSynchronize(stream1));
  e = hipEventElapsedTime(&time_2sec, start_event1, end_event1);
@@ -113,11 +148,20 @@ bool ConcurencyCheck_GlobalVar(int conc_flag) {
  int deviceGlobal_h = 0;
  HIPCHECK(hipSetDevice(0));
  int clkRate = 0;
-  HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+
+  if (IsGfx11()) {
+    HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
+  } else {
+    HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+  }
+
  HIPCHECK(hipStreamCreate(&stream1));
-  hipExtLaunchKernelGGL((FourSecKernel), dim3(1), dim3(1), 0,
+  auto TwoSecKernel_used = IsGfx11() ? TwoSecKernel_gfx11 : TwoSecKernel;
+  auto FourSecKernel_used = IsGfx11() ? FourSecKernel_gfx11 : FourSecKernel;
+
+  hipExtLaunchKernelGGL((FourSecKernel_used), dim3(1), dim3(1), 0,
                         stream1, nullptr, nullptr, conc_flag, clkRate);
-  hipExtLaunchKernelGGL((TwoSecKernel), dim3(1), dim3(1), 0,
+  hipExtLaunchKernelGGL((TwoSecKernel_used), dim3(1), dim3(1), 0,
                         stream1, nullptr, nullptr, conc_flag, clkRate);
  HIPCHECK(hipStreamSynchronize(stream1));
  HIPCHECK(hipMemcpyFromSymbol(&deviceGlobal_h, globalvar,
@@ -148,16 +192,24 @@ bool KernelTimeExecution() {
  hipEvent_t  start_event1, end_event1, start_event2, end_event2;
  float time_4sec, time_2sec;
  int clkRate = 0;
-  HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+
+  if (IsGfx11()) {
+    HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
+  } else {
+    HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+  }
+
+  auto TwoSecKernel_used = IsGfx11() ? TwoSecKernel_gfx11 : TwoSecKernel;
+  auto FourSecKernel_used = IsGfx11() ? FourSecKernel_gfx11 : FourSecKernel;

  HIPCHECK(hipEventCreate(&start_event1));
  HIPCHECK(hipEventCreate(&end_event1));
  HIPCHECK(hipEventCreate(&start_event2));
  HIPCHECK(hipEventCreate(&end_event2));
  HIPCHECK(hipStreamCreate(&stream1));
-  hipExtLaunchKernelGGL((FourSecKernel), dim3(1), dim3(1), 0,
+  hipExtLaunchKernelGGL((FourSecKernel_used), dim3(1), dim3(1), 0,
                         stream1, start_event1, end_event1, 0, clkRate);
-  hipExtLaunchKernelGGL((TwoSecKernel), dim3(1), dim3(1), 0,
+  hipExtLaunchKernelGGL((TwoSecKernel_used), dim3(1), dim3(1), 0,
                          stream1, start_event2, end_event2, 0, clkRate);
  HIPCHECK(hipStreamSynchronize(stream1));
  e = hipEventElapsedTime(&time_4sec, start_event1, end_event1);
@@ -49,10 +49,7 @@

 #define fileName "matmul.code"
 #define matmulK "matmulK"
-#define SixteenSec "SixteenSecKernel"
 #define KernelandExtra "KernelandExtraParams"
-#define FourSec "FourSecKernel"
-#define TwoSec "TwoSecKernel"
 #define globalDevVar "deviceGlobal"
 #define dummyKernel "dummyKernel"
 #define FOURSEC_KERNEL 4999
@@ -124,7 +121,12 @@ void ModuleLaunchKernel::AllocateMemory() {
  HIPCHECK(hipMemcpy(Ad, A, SIZE*sizeof(int), hipMemcpyHostToDevice));
  HIPCHECK(hipMemcpy(Bd, B, SIZE*sizeof(int), hipMemcpyHostToDevice));
  int clkRate = 0;
-  HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+  if (IsGfx11()) {
+    HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
+  } else {
+    HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+  }
+
  args1._Ad = Ad;
  args1._Bd = Bd;
  args1._Cd = C;
@@ -149,13 +151,21 @@ void ModuleLaunchKernel::AllocateMemory() {
 }

 void ModuleLaunchKernel::ModuleLoad() {
+
+  std::string TwoSecStr = IsGfx11() ? std::string("TwoSecKernel_gfx11")
+                                      : std::string("TwoSecKernel");
+  std::string FourSecStr = IsGfx11() ? std::string("FourSecKernel_gfx11")
+                                       : std::string("FourSecKernel");
+  std::string SixteenSecStr = IsGfx11() ? std::string("SixteenSecKernel_gfx11")
+                                          : std::string("SixteenSecKernel");
+
  HIPCHECK(hipModuleLoad(&Module, fileName));
  HIPCHECK(hipModuleGetFunction(&MultKernel, Module, matmulK));
-  HIPCHECK(hipModuleGetFunction(&SixteenSecKernel, Module, SixteenSec));
+  HIPCHECK(hipModuleGetFunction(&SixteenSecKernel, Module, SixteenSecStr.c_str()));
  HIPCHECK(hipModuleGetFunction(&KernelandExtraParamKernel,
                                Module, KernelandExtra));
-  HIPCHECK(hipModuleGetFunction(&FourSecKernel, Module, FourSec));
-  HIPCHECK(hipModuleGetFunction(&TwoSecKernel, Module, TwoSec));
+  HIPCHECK(hipModuleGetFunction(&FourSecKernel, Module, FourSecStr.c_str()));
+  HIPCHECK(hipModuleGetFunction(&TwoSecKernel, Module, TwoSecStr.c_str()));
  HIPCHECK(hipModuleGetFunction(&DummyKernel, Module, dummyKernel));
  HIPCHECK(hipModuleGetGlobal(&deviceGlobal, &deviceGlobalSize,
                              Module, globalDevVar));
@@ -79,5 +79,41 @@ extern "C" __global__ void FourSecKernel(int clockrate) {
  }
 }

+extern "C" __global__ void SixteenSecKernel_gfx11(int clockrate) {
+#ifdef __HIP_PLATFORM_AMD__
+  uint64_t wait_t = 16000,
+  start = wall_clock64()/clockrate, cur;
+  do { cur = wall_clock64()/clockrate-start;}while (cur < wait_t);
+#endif
+}
+
+extern "C" __global__ void TwoSecKernel_gfx11(int clockrate) {
+#ifdef __HIP_PLATFORM_AMD__
+  if (deviceGlobal == 0x2222) {
+    deviceGlobal = 0x3333;
+  }
+  uint64_t wait_t = 2000,
+  start = wall_clock64()/clockrate, cur;
+  do { cur = wall_clock64()/clockrate-start;}while (cur < wait_t);
+  if (deviceGlobal != 0x3333) {
+    deviceGlobal = 0x5555;
+  }
+#endif
+}
+
+extern "C" __global__ void FourSecKernel_gfx11(int clockrate) {
+#ifdef __HIP_PLATFORM_AMD__
+  if (deviceGlobal == 1) {
+    deviceGlobal = 0x2222;
+  }
+  uint64_t wait_t = 4000,
+  start = wall_clock64()/clockrate, cur;
+  do { cur = wall_clock64()/clockrate-start;}while (cur < wait_t);
+  if (deviceGlobal == 0x2222) {
+    deviceGlobal = 0x4444;
+  }
+#endif
+}
+
 extern "C" __global__ void dummyKernel() {
 }
@@ -64,6 +64,25 @@ __global__ void vector_square(float* C_d, float* A_d, size_t Num) {
    }
 }

+__global__ void vector_square_gfx11(float* C_d, float* A_d, size_t Num) {
+#ifdef __HIP_PLATFORM_AMD__
+  size_t gputhread = (blockIdx.x * blockDim.x + threadIdx.x);
+  size_t stride = blockDim.x * gridDim.x;
+
+  for (size_t i = gputhread; i < Num; i += stride) {
+      C_d[i] = A_d[i] * A_d[i];
+  }
+
+  // Delay thread 1 only in the GPU
+  if (gputhread == 1) {
+    unsigned long long int wait_t = 3200000000, start = wall_clock64(), cur;
+      do {
+          cur = wall_clock64() - start;
+      } while (cur < wait_t);
+    }
+#endif
+}
+
 float *A_h, *C_h, *A_h1, *C_h1;

 static void HIPRT_CB Callback_Stream1(hipStream_t stream, hipError_t status,
@@ -129,8 +148,9 @@ int main(int argc, char* argv[]) {
  int *ptr = NULL;
  int *ptr1 = NULL;
  // Queing jobs in both mystream1/2 followed by hipStreamAddCallback
+  auto vector_square_used = IsGfx11() ? vector_square_gfx11 : vector_square;
  for (int i = 1; i < 5; ++i) {
-    hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock),
+    hipLaunchKernelGGL((vector_square_used), dim3(blocks), dim3(threadsPerBlock),
                       0, mystream1, C_d, A_d, Num);
    HIPCHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost,
                            mystream1));
@@ -139,7 +159,7 @@ int main(int argc, char* argv[]) {
    HIPCHECK(hipStreamAddCallback(mystream1, Callback_Stream1,
                                  reinterpret_cast<void*>(ptr), 0));

-    hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock),
+    hipLaunchKernelGGL((vector_square_used), dim3(blocks), dim3(threadsPerBlock),
                       0, mystream2, C_d, A_d, Num);
    HIPCHECK(hipMemcpyAsync(C_h1, C_d, Nbytes,
                            hipMemcpyDeviceToHost, mystream2));
@@ -64,6 +64,25 @@ __global__ void vector_square(float* C_d, float* A_d, size_t Num) {
  }
 }

+__global__ void vector_square_gfx11(float* C_d, float* A_d, size_t Num) {
+#ifdef __HIP_PLATFORM_AMD__
+  size_t gputhread = (blockIdx.x * blockDim.x + threadIdx.x);
+  size_t stride = blockDim.x * gridDim.x;
+
+  for (size_t i = gputhread; i < Num; i += stride) {
+    C_d[i] = A_d[i] * A_d[i];
+  }
+
+  // Delay thread 1 only in the GPU
+  if (gputhread == 1) {
+    unsigned long long int wait_t = 3200000000, start = wall_clock64(), cur;
+    do {
+      cur = wall_clock64() - start;
+    } while (cur < wait_t);
+  }
+#endif
+}
+
 static void HIPRT_CB Stream_Callback(hipStream_t stream, hipError_t status,
                                     void* userData) {
  for (size_t i = 0; i < Num; i++) {
@@ -100,7 +119,8 @@ int main(int argc, char* argv[]) {

  const unsigned threadsPerBlock = 256;
  const unsigned blocks = (Num+255)/threadsPerBlock;
-  hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock), 0,
+  auto vector_square_used = IsGfx11() ? vector_square_gfx11 : vector_square;
+  hipLaunchKernelGGL((vector_square_used), dim3(blocks), dim3(threadsPerBlock), 0,
                      mystream, C_d, A_d, Num);

  HIPCHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, mystream));
@@ -64,6 +64,24 @@ __global__ void vector_square(float* C_d, float* A_d, size_t Num) {
  }
 }

+__global__ void vector_square_gfx11(float* C_d, float* A_d, size_t Num) {
+#ifdef __HIP_PLATFORM_AMD__
+  size_t gputhread = (blockIdx.x * blockDim.x + threadIdx.x);
+  size_t stride = blockDim.x * gridDim.x;
+
+  for (size_t i = gputhread; i < Num; i += stride) {
+    C_d[i] = A_d[i] * A_d[i];
+  }
+
+  // Delay thread 1 only in the GPU
+  if (gputhread == 1) {
+    unsigned long long int wait_t = 3200000000, start = wall_clock64(), cur;
+    do {
+      cur = wall_clock64() - start;
+    } while (cur < wait_t);
+  }
+#endif
+}

 static void HIPRT_CB Thread1_Callback(hipStream_t stream, hipError_t status,
                                      void* userData) {
@@ -124,7 +142,8 @@ int main(int argc, char* argv[]) {
  const unsigned threadsPerBlock = 256;
  const unsigned blocks = (Num+255)/threadsPerBlock;

-  hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock), 0,
+  auto vector_square_used = IsGfx11() ? vector_square_gfx11 : vector_square;
+  hipLaunchKernelGGL((vector_square_used), dim3(blocks), dim3(threadsPerBlock), 0,
                      mystream, C_d, A_d, Num);

  HIPCHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, mystream));
@@ -61,6 +61,25 @@ __global__ void vector_square(float* C_d, float* A_d, size_t N_elmts) {
  }
 }

+__global__ void vector_square_gfx11(float* C_d, float* A_d, size_t N_elmts) {
+#ifdef __HIP_PLATFORM_AMD__
+  size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
+  size_t stride = blockDim.x * gridDim.x;
+
+  for (size_t i = offset; i < N_elmts; i += stride) {
+    C_d[i] = A_d[i] * A_d[i];
+  }
+
+  // Delay the thread 1
+  if (offset == 1) {
+    unsigned long long int wait_t = 3200000000, start = wall_clock64(), cur;
+    do {
+      cur = wall_clock64() - start;
+    } while (cur < wait_t);
+  }
+#endif
+}
+
 float *A_h, *C_h;

 static void HIPRT_CB Callback1(hipStream_t stream, hipError_t status,
@@ -102,7 +121,8 @@ int main(int argc, char* argv[]) {
  const unsigned threadsPerBlock = 256;
  const unsigned blocks = (N_elmts + 255)/threadsPerBlock;

-  hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock), 0,
+  auto vector_square_used = IsGfx11() ? vector_square_gfx11 : vector_square;
+  hipLaunchKernelGGL((vector_square_used), dim3(blocks), dim3(threadsPerBlock), 0,
                      mystream, C_d, A_d, N_elmts);
  HIPCHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, mystream));
  HIPCHECK(hipStreamAddCallback(mystream, Callback1, NULL, 0));
@@ -92,6 +92,32 @@ size_t getHostThreadCount(const size_t memPerThread, const size_t maxThreads) {
  return thread_count;
 }

+// Function to determine if the device is of gfx11 architecture
+bool IsGfx11() {
+#if defined(__HIP_PLATFORM_NVIDIA__)
+  return false;
+#elif defined(__HIP_PLATFORM_AMD__)
+  int device = -1;
+  hipDeviceProp_t props{};
+  HIPCHECK(hipGetDevice(&device));
+  HIPCHECK(hipGetDeviceProperties(&props, device));
+
+  // Get GCN Arch Name and compare to check if it is gfx11
+  std::string arch = std::string(props.gcnArchName);
+  auto pos = arch.find(":");
+  if (pos != std::string::npos)
+    arch = arch.substr(0, pos);
+
+  if(arch.size() >= 5)
+    arch = arch.substr(0,5);
+
+  return (arch == std::string("gfx11")) ? true : false;
+#else
+  std::cout<<"Have to be either Nvidia or AMD platform, asserting"<<std::endl;
+  assert(false);
+#endif
+}
+
 namespace HipTest {


@@ -232,5 +258,4 @@ unsigned setNumBlocks(unsigned blocksPerCU, unsigned threadsPerBlock, size_t N)
    return blocks;
 }

-
 }  // namespace HipTest
@@ -581,3 +581,6 @@ inline bool isImageSupported() {
    if (!HipTest::isImageSupported())                                                      \
        { printf("Texture is not support on the device. Skipped.\n"); passed(); }
 #endif //__cplusplus
+
+// Function to determine if the device is of gfx11 architecture
+bool IsGfx11();