diff --git a/tests/src/deviceLib/hipTestClock.cpp b/tests/src/deviceLib/hipTestClock.cpp
index bf6a572cfc..7e70d32b4d 100644
--- a/tests/src/deviceLib/hipTestClock.cpp
+++ b/tests/src/deviceLib/hipTestClock.cpp
@@ -43,6 +43,20 @@ THE SOFTWARE.
       Ad[tid] = clock() + clock64() + __clock() + __clock64() - Ad[tid];
   }
 
+  static __global__ void kernel1_gfx11(long long* Ad) {
+#ifdef __HIP_PLATFORM_AMD__
+      int tid = threadIdx.x + blockIdx.x * blockDim.x;
+      Ad[tid] = clock() + wall_clock64() + __clock() + __clock64();
+#endif
+  }
+
+  static __global__ void kernel2_gfx11(long long* Ad) {
+#ifdef __HIP_PLATFORM_AMD__
+      int tid = threadIdx.x + blockIdx.x * blockDim.x;
+      Ad[tid] = clock() + wall_clock64() + __clock() + __clock64() - Ad[tid];
+#endif
+  }
+
   void run() {
     long long *A, *Ad;
     A = new long long[LEN];
@@ -50,9 +64,15 @@ THE SOFTWARE.
         A[i] = 0;
     }
 
+    auto kernel1_used = IsGfx11() ? kernel1_gfx11 : kernel1;
+    auto kernel2_used = IsGfx11() ? kernel2_gfx11 : kernel2;
+
     HIP_ASSERT(hipMalloc((void**)&Ad, SIZE));
-    hipLaunchKernelGGL(kernel1, dim3(1, 1, 1), dim3(LEN, 1, 1), 0, 0, Ad);
-    hipLaunchKernelGGL(kernel2, dim3(1, 1, 1), dim3(LEN, 1, 1), 0, 0, Ad);
+
+    hipLaunchKernelGGL(kernel1_used, dim3(1, 1, 1),
+                       dim3(LEN, 1, 1), 0, 0, Ad);
+    hipLaunchKernelGGL(kernel2_used, dim3(1, 1, 1),
+                       dim3(LEN, 1, 1), 0, 0, Ad);
     HIP_ASSERT(hipMemcpy(A, Ad, SIZE, hipMemcpyDeviceToHost));
 
     for (unsigned i = 0; i < LEN; i++) {
diff --git a/tests/src/runtimeApi/cooperativeGrps/api_failure_tests.cpp b/tests/src/runtimeApi/cooperativeGrps/api_failure_tests.cpp
index 544b9ee775..91f47359b5 100644
--- a/tests/src/runtimeApi/cooperativeGrps/api_failure_tests.cpp
+++ b/tests/src/runtimeApi/cooperativeGrps/api_failure_tests.cpp
@@ -124,6 +124,13 @@ __global__ void test_kernel(long long *array) {
   array[rank] += clock64();
 }
 
+__global__ void test_kernel_gfx11(long long *array) {
+#ifdef __HIP_PLATFORM_AMD__
+  unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
+  array[rank] += wall_clock64();
+#endif
+}
+
 int main(int argc, char** argv) {
   hipError_t err;
   int device_num, FailFlag = 0;
@@ -155,6 +162,7 @@ int main(int argc, char** argv) {
     int num_sms = device_properties.multiProcessorCount;
     int max_num_threads = device_properties.maxThreadsPerBlock;
 
+    auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
     // Check single-thread block, all numbers of warps, then too-large block
     for (int block_size = 0; block_size <= (max_num_threads + warp_size);
          block_size += warp_size) {
@@ -163,9 +171,8 @@ int main(int argc, char** argv) {
       }
       int max_blocks_per_sm;
       // Calculate the device occupancy to know how many blocks can be run.
-      HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
-              &max_blocks_per_sm, test_kernel, block_size, 0,
-              hipOccupancyDefault));
+      HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(&max_blocks_per_sm,
+               test_kernel_used, block_size, 0, hipOccupancyDefault));
 
       if ((block_size > max_num_threads) && (max_blocks_per_sm != 0)) {
         std::cerr << "ERROR! Occupancy API indicated that we can have >0 ";
@@ -212,7 +219,7 @@ int main(int argc, char** argv) {
         coop_params[i][0] = reinterpret_cast<void*>(&dev_array[i]);
       }
 
-      err = hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
+      err = hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used),
                                        2 * desired_blocks, block_size,
                                        coop_params[0], 0, streams[0]);
 
@@ -235,9 +242,8 @@ int main(int argc, char** argv) {
       }
 
       HIPCHECK(hipDeviceSynchronize());
-      err = hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
-                                       desired_blocks, block_size,
-                                       coop_params[1], 0, streams[1]);
+      err = hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used), desired_blocks,
+                                       block_size, coop_params[1], 0, streams[1]);
 
       if (expect_fail) {
         expect_to_see = hipErrorInvalidConfiguration;
diff --git a/tests/src/runtimeApi/cooperativeGrps/cooperative_streams.cpp b/tests/src/runtimeApi/cooperativeGrps/cooperative_streams.cpp
index 07a4f7711c..70ce8b6a80 100644
--- a/tests/src/runtimeApi/cooperativeGrps/cooperative_streams.cpp
+++ b/tests/src/runtimeApi/cooperativeGrps/cooperative_streams.cpp
@@ -106,6 +106,29 @@ __global__ void test_kernel(uint32_t loops, unsigned long long *array, long long
   }
 }
 
+__global__ void test_kernel_gfx11(uint32_t loops, unsigned long long *array, long long totalTicks) {
+#ifdef __HIP_PLATFORM_AMD__
+  cooperative_groups::thread_block tb = cooperative_groups::this_thread_block();
+  unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
+
+  for (int i = 0; i < loops; i++) {
+    long long time_diff = 0;
+    long long last_clock = wall_clock64();
+    do {
+      long long cur_clock = wall_clock64();
+      if (cur_clock > last_clock) {
+        time_diff += (cur_clock - last_clock);
+      }
+      // If it rolls over, we don't know how much to add to catch up.
+      // So just ignore those slipped cycles.
+      last_clock = cur_clock;
+    } while(time_diff < totalTicks);
+    tb.sync();
+    array[rank] += wall_clock64();
+  }
+#endif
+}
+
 template<typename T>
 bool verifyLeastCapacity(T& single_kernel_time, T& double_kernel_time, T& triple_kernel_time)
 {
@@ -256,8 +279,8 @@ int main(int argc, char** argv) {
     long long totalTicks = device_properties.clockRate ;
     int max_blocks_per_sm = 0;
     // Calculate the device occupancy to know how many blocks can be run.
-    HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm,
-                                                          test_kernel,
+    auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+    HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, test_kernel_used,
                                                           warp_size, 0));
     int max_active_blocks = max_blocks_per_sm * num_sms;
     int coop_blocks = 0;
@@ -325,9 +348,10 @@ int main(int argc, char** argv) {
     }
 
     // Verify over capacity
-    HIPCHECK_API(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
+    HIPCHECK_API(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used),
                                      max_active_blocks + 1, warp_size,
-                                     coop_params[0], 0, streams[0]), hipErrorCooperativeLaunchTooLarge);
+                                     coop_params[0], 0, streams[0]),
+                                     hipErrorCooperativeLaunchTooLarge);
 
     std::cout << "Launching an initial single cooperative kernel..." << std::endl;
     // We need exclude the the initial launching as it will need time to load code obj.
diff --git a/tests/src/runtimeApi/cooperativeGrps/multi_gpu_api_failure_tests.cpp b/tests/src/runtimeApi/cooperativeGrps/multi_gpu_api_failure_tests.cpp
index 2957b0420c..40d20a48de 100644
--- a/tests/src/runtimeApi/cooperativeGrps/multi_gpu_api_failure_tests.cpp
+++ b/tests/src/runtimeApi/cooperativeGrps/multi_gpu_api_failure_tests.cpp
@@ -192,6 +192,20 @@ __global__ void second_test_kernel(long long *array) {
     array[rank] += clock64();
 }
 
+__global__ void test_kernel_gfx11(long long *array) {
+#ifdef __HIP_PLATFORM_AMD__
+    unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
+    array[rank] += wall_clock64();
+#endif
+}
+
+__global__ void second_test_kernel_gfx11(long long *array) {
+#ifdef __HIP_PLATFORM_AMD__
+    unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
+    array[rank] += wall_clock64();
+#endif
+}
+
 int main(int argc, char** argv) {
   hipError_t err;
   /*************************************************************************/
@@ -245,8 +259,9 @@ int main(int argc, char** argv) {
     int max_blocks_per_sm = INT_MAX;
     for (int i = 0; i < 2; i++) {
       HIPCHECK(hipSetDevice((dev + i)));
+      auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; 
       HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
-               &max_blocks_per_sm_arr[i], test_kernel, warp_size, 0));
+               &max_blocks_per_sm_arr[i], test_kernel_used, warp_size, 0));
       if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
           max_blocks_per_sm = max_blocks_per_sm_arr[i];
       }
@@ -293,7 +308,8 @@ int main(int argc, char** argv) {
     for (int i = 0; i < 2; i++) {
       dev_params[i][0] = reinterpret_cast<void*>(&bad_dev_array[i]);
 
-      md_params[i].func = reinterpret_cast<void*>(test_kernel);
+      auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+      md_params[i].func = reinterpret_cast<void*>(test_kernel_used);
       md_params[i].gridDim = 2 * desired_blocks;
       md_params[i].blockDim = warp_size;
       md_params[i].sharedMem = 0;
@@ -370,7 +386,8 @@ int main(int argc, char** argv) {
         supports_sep_kernels = false;
       }
     }
-    md_params[1].func = reinterpret_cast<void*>(second_test_kernel);
+    auto second_test_kernel_used = IsGfx11() ? second_test_kernel_gfx11 : second_test_kernel;
+    md_params[1].func = reinterpret_cast<void*>(second_test_kernel_used);
     err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
     if ((supports_sep_kernels && err != hipSuccess) ||
         (!supports_sep_kernels && err != hipErrorInvalidValue)) {
@@ -405,7 +422,8 @@ int main(int argc, char** argv) {
     std::cout << "different grid sizes." << std::endl;
     bool supports_sep_sizes = true;
     for (int i = 0; i < 2; i++) {
-      md_params[i].func = reinterpret_cast<void*>(test_kernel);
+      auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+      md_params[i].func = reinterpret_cast<void*>(test_kernel_used);
       md_params[i].gridDim = i+1;
       if (!support_for_separate_grid_sizes((dev + i))) {
         supports_sep_sizes = false;
diff --git a/tests/src/runtimeApi/cooperativeGrps/multi_gpu_streams.cpp b/tests/src/runtimeApi/cooperativeGrps/multi_gpu_streams.cpp
index 120e32353f..28bba39785 100644
--- a/tests/src/runtimeApi/cooperativeGrps/multi_gpu_streams.cpp
+++ b/tests/src/runtimeApi/cooperativeGrps/multi_gpu_streams.cpp
@@ -195,6 +195,55 @@ __global__ void test_kernel(uint32_t loops, unsigned long long *array) {
   }
 }
 
+__global__ void test_coop_kernel_gfx11(unsigned int loops, long long *array,
+                                       int fast_gpu) {
+#ifdef __HIP_PLATFORM_AMD__
+  cooperative_groups::multi_grid_group mgrid =
+  cooperative_groups::this_multi_grid();
+  unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (mgrid.grid_rank() == fast_gpu) {
+    return;
+  }
+
+  for (int i = 0; i < loops; i++) {
+    long long time_diff = 0;
+    long long last_clock = wall_clock64();
+    do {
+      long long cur_clock = wall_clock64();
+      if (cur_clock > last_clock) {
+        time_diff += (cur_clock - last_clock);
+      }
+      // If it rolls over, we don't know how much to add to catch up.
+      // So just ignore those slipped cycles.
+      last_clock = cur_clock;
+    } while(time_diff < 1000000);
+    array[rank] += wall_clock64();
+  }
+#endif
+}
+
+__global__ void test_kernel_gfx11(uint32_t loops, unsigned long long *array) {
+#ifdef __HIP_PLATFORM_AMD__
+  unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
+
+  for (int i = 0; i < loops; i++) {
+    long long time_diff = 0;
+    long long last_clock = wall_clock64();
+    do {
+      long long cur_clock = wall_clock64();
+      if (cur_clock > last_clock) {
+        time_diff += (cur_clock - last_clock);
+      }
+      // If it rolls over, we don't know how much to add to catch up.
+      // So just ignore those slipped cycles.
+      last_clock = cur_clock;
+    } while(time_diff < 1000000);
+    array[rank] += wall_clock64();
+  }
+#endif
+}
+
 int main(int argc, char** argv) {
   hipError_t err;
   int device_num, FailFlag = 0;
@@ -249,8 +298,9 @@ int main(int argc, char** argv) {
     int max_blocks_per_sm = INT_MAX;
     for (int i = 0; i < 2; i++) {
       HIPCHECK(hipSetDevice(dev + i));
+      auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
       HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
-               &max_blocks_per_sm_arr[i], test_kernel, warp_size, 0));
+               &max_blocks_per_sm_arr[i], test_kernel_used, warp_size, 0));
       if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
         max_blocks_per_sm = max_blocks_per_sm_arr[i];
       }
@@ -302,11 +352,12 @@ int main(int argc, char** argv) {
     std::cout << "GPU " << dev << ": Long Coop Kernel" << std::endl;
     std::cout << "GPU " << (dev + 1) << ": Long Coop Kernel" << std::endl;
 
+    auto test_coop_kernel_used = IsGfx11() ? test_coop_kernel_gfx11 : test_coop_kernel;
     for (int i = 0; i < 2; i++) {
       dev_params[i][0] = reinterpret_cast<void*>(&loops);
       dev_params[i][1] = reinterpret_cast<void*>(&dev_array[i]);
       dev_params[i][2] = reinterpret_cast<void*>(&fast_gpu);
-      md_params[i].func = reinterpret_cast<void*>(test_coop_kernel);
+      md_params[i].func = reinterpret_cast<void*>(test_coop_kernel_used);
       md_params[i].gridDim = desired_blocks;
       md_params[i].blockDim = warp_size;
       md_params[i].sharedMem = 0;
@@ -331,12 +382,14 @@ int main(int argc, char** argv) {
     fast_gpu = 1;
     start_time[1] = std::chrono::system_clock::now();
     HIPCHECK(hipSetDevice(dev));
-    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
+    auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+    hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
                        streams[0], loops, dev_array[0]);
     HIPCHECK(hipGetLastError());
     HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
     HIPCHECK(hipSetDevice(dev + 1));
-    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
+    test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+    hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
                        streams[1], loops, dev_array[1]);
     HIPCHECK(hipGetLastError());
     for (int i = 0; i < 2; i++) {
@@ -356,12 +409,14 @@ int main(int argc, char** argv) {
     fast_gpu = 0;
     start_time[2] = std::chrono::system_clock::now();
     HIPCHECK(hipSetDevice(dev));
-    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
+    test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+    hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
                        streams[0], loops, dev_array[0]);
     HIPCHECK(hipGetLastError());
     HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
     HIPCHECK(hipSetDevice(dev + 1));
-    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
+    test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+    hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
                        streams[1], loops, dev_array[1]);
     HIPCHECK(hipGetLastError());
     for (int i = 0; i < 2; i++) {
@@ -382,13 +437,15 @@ int main(int argc, char** argv) {
     fast_gpu = 0;
     start_time[3] = std::chrono::system_clock::now();
     HIPCHECK(hipSetDevice(dev));
-    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
+    test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+    hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
                        streams[0], loops, dev_array[0]);
     HIPCHECK(hipGetLastError());
     HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2,
              hipCooperativeLaunchMultiDeviceNoPreSync));
     HIPCHECK(hipSetDevice(dev + 1));
-    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
+    test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+    hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
                        streams[1], loops, dev_array[1]);
     HIPCHECK(hipGetLastError());
     for (int i = 0; i < 2; i++) {
@@ -409,13 +466,15 @@ int main(int argc, char** argv) {
     fast_gpu = 1;
     start_time[4] = std::chrono::system_clock::now();
     HIPCHECK(hipSetDevice(dev));
-    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
+    test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+    hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
                        streams[0], loops, dev_array[0]);
     HIPCHECK(hipGetLastError());
     HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2,
              hipCooperativeLaunchMultiDeviceNoPostSync));
     HIPCHECK(hipSetDevice(dev + 1));
-    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
+    test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+    hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
                        streams[1], loops, dev_array[1]);
     for (int i = 0; i < 2; i++) {
       HIPCHECK(hipSetDevice(dev + i));
@@ -434,14 +493,16 @@ int main(int argc, char** argv) {
     std::cout << "  Kernel\n";
     start_time[5] = std::chrono::system_clock::now();
     HIPCHECK(hipSetDevice(dev));
-    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
+    test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+    hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
                        streams[0], loops, dev_array[0]);
     HIPCHECK(hipGetLastError());
     HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2,
              hipCooperativeLaunchMultiDeviceNoPreSync |
              hipCooperativeLaunchMultiDeviceNoPostSync));
     HIPCHECK(hipSetDevice(dev + 1));
-    hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
+    test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+    hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
                        streams[1], loops, dev_array[1]);
     HIPCHECK(hipGetLastError());
     for (int i = 0; i < 2; i++) {
diff --git a/tests/src/runtimeApi/cooperativeGrps/multi_grid_group_all_gpus.cpp b/tests/src/runtimeApi/cooperativeGrps/multi_grid_group_all_gpus.cpp
index 2623e5bc16..6958130e5c 100644
--- a/tests/src/runtimeApi/cooperativeGrps/multi_grid_group_all_gpus.cpp
+++ b/tests/src/runtimeApi/cooperativeGrps/multi_grid_group_all_gpus.cpp
@@ -204,6 +204,81 @@ test_kernel(unsigned int *atomic_val, unsigned int *global_array,
   }
 }
 
+__global__ void
+test_kernel_gfx11(unsigned int *atomic_val, unsigned int *global_array,
+                  unsigned int *array, uint32_t loops) {
+#ifdef __HIP_PLATFORM_AMD__
+  cooperative_groups::grid_group grid = cooperative_groups::this_grid();
+  cooperative_groups::multi_grid_group mgrid =
+                      cooperative_groups::this_multi_grid();
+  unsigned rank = grid.thread_rank();
+  unsigned global_rank = mgrid.thread_rank();
+
+  int offset = blockIdx.x;
+  for (int i = 0; i < loops; i++) {
+    // Make the last thread run way behind everyone else.
+    // If the grid barrier below fails, then the other threads may hit the
+    // atomicInc instruction many times before the last thread ever gets
+    // to it.
+    // As such, without the barrier, the last array entry will eventually
+    // contain a very large value, defined by however many times the other
+    // wavefronts make it through this loop.
+    // If the barrier works, then it will likely contain some number
+    // near "total number of blocks". It will be the last wavefront to
+    // reach the atomicInc, but everyone will have only hit the atomic once.
+    if (rank == (grid.size() - 1)) {
+      long long time_diff = 0;
+      long long last_clock = wall_clock64();
+      do {
+        long long cur_clock = wall_clock64();
+        if (cur_clock > last_clock) {
+          time_diff += (cur_clock - last_clock);
+        }
+        // If it rolls over, we don't know how much to add to catch up.
+        // So just ignore those slipped cycles.
+        last_clock = cur_clock;
+      } while(time_diff < 1000000);
+    }
+    if (threadIdx.x == 0) {
+      array[offset] = atomicInc(atomic_val, UINT_MAX);
+    }
+    grid.sync();
+
+    // Make the last thread in the entire multi-grid run way behind
+    // everyone else.
+    // If the mgrid barrier below fails, then the two global_array entries
+    // will end up being out of sync, because the intermingling of adds
+    // and multiplies will not be aligned between to the two GPUs.
+    if (global_rank == (mgrid.size() - 1)) {
+      long long time_diff = 0;
+      long long last_clock = wall_clock64();
+      do {
+        long long cur_clock = wall_clock64();
+        if (cur_clock > last_clock) {
+          time_diff += (cur_clock - last_clock);
+        }
+        // If it rolls over, we don't know how much to add to catch up.
+        // So just ignore those slipped cycles.
+        last_clock = cur_clock;
+      } while(time_diff < 1000000);
+    }
+    // During even iterations, add into your own array entry
+    // During odd iterations, add into your partner's array entry
+    unsigned grid_rank = mgrid.grid_rank();
+    unsigned inter_gpu_offset = (grid_rank + i) % mgrid.num_grids();
+    if (rank == (grid.size() - 1)) {
+      if (i % mgrid.num_grids() == 0) {
+        global_array[grid_rank] += 2;
+      } else {
+        global_array[inter_gpu_offset] *= 2;
+      }
+    }
+    mgrid.sync();
+    offset += gridDim.x;
+  }
+#endif 
+}
+
 int main(int argc, char** argv) {
   hipError_t err;
   int num_devices = 0;
@@ -265,8 +340,9 @@ int main(int argc, char** argv) {
   int max_blocks_per_sm = INT_MAX;
   for (int i = 0; i < num_devices; i++) {
     HIPCHECK(hipSetDevice(device_num[i]));
+    auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
     HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
-            &max_blocks_per_sm_arr[i], test_kernel, num_threads_in_block, 0));
+            &max_blocks_per_sm_arr[i], test_kernel_used, num_threads_in_block, 0));
     if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
       max_blocks_per_sm = max_blocks_per_sm_arr[i];
     }
@@ -320,12 +396,13 @@ int main(int argc, char** argv) {
 
   void *dev_params[num_devices][4];
   hipLaunchParams md_params[num_devices];
+  auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
   for (int i = 0; i < num_devices; i++) {
     dev_params[i][0] = reinterpret_cast<void*>(&kernel_atomic[i]);
     dev_params[i][1] = reinterpret_cast<void*>(&global_array);
     dev_params[i][2] = reinterpret_cast<void*>(&kernel_buffer[i]);
     dev_params[i][3] = reinterpret_cast<void*>(&loops);
-    md_params[i].func = reinterpret_cast<void*>(test_kernel);
+    md_params[i].func = reinterpret_cast<void*>(test_kernel_used);
     md_params[i].gridDim = requested_blocks;
     md_params[i].blockDim = num_threads_in_block;
     md_params[i].sharedMem = 0;
diff --git a/tests/src/runtimeApi/cooperativeGrps/simple_grid_group_barrier.cpp b/tests/src/runtimeApi/cooperativeGrps/simple_grid_group_barrier.cpp
index 2256369b04..1e0d15529e 100644
--- a/tests/src/runtimeApi/cooperativeGrps/simple_grid_group_barrier.cpp
+++ b/tests/src/runtimeApi/cooperativeGrps/simple_grid_group_barrier.cpp
@@ -131,6 +131,48 @@ test_kernel(unsigned int *atomic_val, unsigned int *array,
   }
 }
 
+__global__ void
+test_kernel_gfx11(unsigned int *atomic_val, unsigned int *array,
+                  unsigned int loops) {
+#ifdef __HIP_PLATFORM_AMD__
+  cooperative_groups::grid_group grid = cooperative_groups::this_grid();
+  unsigned rank = grid.thread_rank();
+
+  int offset = blockIdx.x;
+  for (int i = 0; i < loops; i++) {
+    // Make the last thread run way behind everyone else.
+    // If the barrier below fails, then the other threads may hit the
+    // atomicInc instruction many times before the last thread ever gets
+    // to it.
+    // As such, without the barrier, the last array entry will eventually
+    // contain a very large value, defined by however many times the other
+    // wavefronts make it through this loop.
+    // If the barrier works, then it will likely contain some number
+    // near "total number of blocks". It will be the last wavefront to
+    // reach the atomicInc, but everyone will have only hit the atomic once.
+    if (rank == (grid.size() - 1)) {
+      long long time_diff = 0;
+      long long last_clock = wall_clock64();
+      do {
+        long long cur_clock = wall_clock64();
+        if (cur_clock > last_clock) {
+          time_diff += (cur_clock - last_clock);
+        }
+        // If it rolls over, we don't know how much to add to catch up.
+        // So just ignore those slipped cycles.
+        last_clock = cur_clock;
+      } while(time_diff < 1000000);
+    }
+
+    if (threadIdx.x == 0) {
+      array[offset] = atomicInc(&atomic_val[0], UINT_MAX);
+    }
+    grid.sync();
+    offset += gridDim.x;
+  }
+#endif
+}
+
 int main(int argc, char** argv) {
   hipError_t err;
   int device_num;
@@ -167,9 +209,10 @@ int main(int argc, char** argv) {
 
     int num_threads_in_block = block_size * warp_size;
 
+    auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
     // Calculate the device occupancy to know how many blocks can be run.
     HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm,
-             test_kernel, num_threads_in_block, 0));
+             test_kernel_used, num_threads_in_block, 0));
 
     int requested_blocks = warps / block_size;
     if (requested_blocks > max_blocks_per_sm * num_sms) {
@@ -211,7 +254,8 @@ int main(int argc, char** argv) {
     params[0] = reinterpret_cast<void*>(&kernel_atomic);
     params[1] = reinterpret_cast<void*>(&kernel_buffer);
     params[2] = reinterpret_cast<void*>(&loops);
-    HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
+    test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
+    HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used),
                                         requested_blocks,
                                         num_threads_in_block, params, 0, NULL));
 
diff --git a/tests/src/runtimeApi/cooperativeGrps/simple_multi_grid_group_barrier.cpp b/tests/src/runtimeApi/cooperativeGrps/simple_multi_grid_group_barrier.cpp
index cd7aa1ce82..73e28026a0 100644
--- a/tests/src/runtimeApi/cooperativeGrps/simple_multi_grid_group_barrier.cpp
+++ b/tests/src/runtimeApi/cooperativeGrps/simple_multi_grid_group_barrier.cpp
@@ -204,6 +204,81 @@ test_kernel(unsigned int *atomic_val, unsigned int *global_array,
   }
 }
 
+__global__ void
+test_kernel_gfx11(unsigned int *atomic_val, unsigned int *global_array,
+                  unsigned int *array, uint32_t loops) {
+#ifdef __HIP_PLATFORM_AMD__
+  cooperative_groups::grid_group grid = cooperative_groups::this_grid();
+  cooperative_groups::multi_grid_group mgrid =
+                      cooperative_groups::this_multi_grid();
+  unsigned rank = grid.thread_rank();
+  unsigned global_rank = mgrid.thread_rank();
+
+  int offset = blockIdx.x;
+  for (int i = 0; i < loops; i++) {
+    // Make the last thread run way behind everyone else.
+    // If the grid barrier below fails, then the other threads may hit the
+    // atomicInc instruction many times before the last thread ever gets
+    // to it.
+    // As such, without the barrier, the last array entry will eventually
+    // contain a very large value, defined by however many times the other
+    // wavefronts make it through this loop.
+    // If the barrier works, then it will likely contain some number
+    // near "total number of blocks". It will be the last wavefront to
+    // reach the atomicInc, but everyone will have only hit the atomic once.
+    if (rank == (grid.size() - 1)) {
+      long long time_diff = 0;
+      long long last_clock = wall_clock64();
+      do {
+        long long cur_clock = wall_clock64();
+        if (cur_clock > last_clock) {
+          time_diff += (cur_clock - last_clock);
+        }
+        // If it rolls over, we don't know how much to add to catch up.
+        // So just ignore those slipped cycles.
+        last_clock = cur_clock;
+      } while(time_diff < 1000000);
+    }
+    if (threadIdx.x == 0) {
+      array[offset] = atomicInc(atomic_val, UINT_MAX);
+    }
+    grid.sync();
+
+    // Make the last thread in the entire multi-grid run way behind
+    // everyone else.
+    // If the mgrid barrier below fails, then the two global_array entries
+    // will end up being out of sync, because the intermingling of adds
+    // and multiplies will not be aligned between to the two GPUs.
+    if (global_rank == (mgrid.size() - 1)) {
+      long long time_diff = 0;
+      long long last_clock = wall_clock64();
+      do {
+        long long cur_clock = wall_clock64();
+        if (cur_clock > last_clock) {
+          time_diff += (cur_clock - last_clock);
+        }
+        // If it rolls over, we don't know how much to add to catch up.
+        // So just ignore those slipped cycles.
+        last_clock = cur_clock;
+      } while(time_diff < 1000000);
+    }
+    // During even iterations, add into your own array entry
+    // During odd iterations, add into your partner's array entry
+    unsigned grid_rank = mgrid.grid_rank();
+    unsigned inter_gpu_offset = (grid_rank + i) % mgrid.num_grids();
+    if (rank == (grid.size() - 1)) {
+      if (i % mgrid.num_grids() == 0) {
+        global_array[grid_rank] += 2;
+      } else {
+        global_array[inter_gpu_offset] *= 2;
+      }
+    }
+    mgrid.sync();
+    offset += gridDim.x;
+  }
+#endif
+}
+
 int main(int argc, char** argv) {
     hipError_t err;
     int device_num = 0, flag = 0;
@@ -263,8 +338,9 @@ int main(int argc, char** argv) {
     int max_blocks_per_sm = INT_MAX;
     for (int i = 0; i < 2; i++) {
       HIPCHECK(hipSetDevice((d + i)));
+      auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
       HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
-              &max_blocks_per_sm_arr[i], test_kernel, num_threads_in_block,
+              &max_blocks_per_sm_arr[i], test_kernel_used, num_threads_in_block,
               0));
       if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
         max_blocks_per_sm = max_blocks_per_sm_arr[i];
@@ -319,11 +395,12 @@ int main(int argc, char** argv) {
     void *dev_params[2][4];
     hipLaunchParams md_params[2];
     for (int i = 0; i < 2; i++) {
+      auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
       dev_params[i][0] = reinterpret_cast<void*>(&kernel_atomic[i]);
       dev_params[i][1] = reinterpret_cast<void*>(&global_array);
       dev_params[i][2] = reinterpret_cast<void*>(&kernel_buffer[i]);
       dev_params[i][3] = reinterpret_cast<void*>(&loops);
-      md_params[i].func = reinterpret_cast<void*>(test_kernel);
+      md_params[i].func = reinterpret_cast<void*>(test_kernel_used);
       md_params[i].gridDim = requested_blocks;
       md_params[i].blockDim = num_threads_in_block;
       md_params[i].sharedMem = 0;
diff --git a/tests/src/runtimeApi/event/hipEventMultiThreaded.cpp b/tests/src/runtimeApi/event/hipEventMultiThreaded.cpp
index 2bf5a78969..b6e0e1c3fd 100644
--- a/tests/src/runtimeApi/event/hipEventMultiThreaded.cpp
+++ b/tests/src/runtimeApi/event/hipEventMultiThreaded.cpp
@@ -44,10 +44,20 @@ extern "C" __global__ void WaitKernel(int *Ad, int clockrate) {
   *Ad = 1;
 }
 
+extern "C" __global__ void WaitKernel_gfx11(int *Ad, int clockrate) {
+#ifdef __HIP_PLATFORM_AMD__
+  uint64_t wait_t = 500,
+  start = wall_clock64()/clockrate, cycles;
+  do { cycles = wall_clock64()/clockrate-start;} while (cycles < wait_t);
+  *Ad = 1;
+#endif
+}
+
 void t1(hipEvent_t start, hipStream_t stream1, int clkRate, int *A, int *Ad) {
   *A = 0;
 
-  hipLaunchKernelGGL(HIP_KERNEL_NAME(WaitKernel), dim3(1), dim3(1), 0, stream1, Ad, clkRate);
+  auto WaitKernel_used = IsGfx11() ? WaitKernel_gfx11 : WaitKernel;
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(WaitKernel_used), dim3(1), dim3(1), 0, stream1, Ad, clkRate);
 
   HIPCHECK(hipEventRecord(start, stream1));
 
@@ -69,7 +79,11 @@ int main(int argc, char* argv[]) {
     HIPCHECK(hipHostGetDevicePointer((void**)&Ad[i], A[i], 0));
   }
 
-  HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+  if (IsGfx11()) {
+    HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
+  } else {
+    HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+  }
   hipStream_t stream1;
   hipStreamCreate(&stream1);
   hipEvent_t start;
diff --git a/tests/src/runtimeApi/module/hipExtLaunchKernelGGL.cpp b/tests/src/runtimeApi/module/hipExtLaunchKernelGGL.cpp
index 4df6bd8600..d0ddb04cb6 100644
--- a/tests/src/runtimeApi/module/hipExtLaunchKernelGGL.cpp
+++ b/tests/src/runtimeApi/module/hipExtLaunchKernelGGL.cpp
@@ -62,6 +62,34 @@ __global__ void FourSecKernel(int clockrate) {
   }
 }
 
+__global__ void TwoSecKernel_gfx11(int clockrate) {
+#ifdef __HIP_PLATFORM_AMD__
+  if (globalvar == 0x2222) {
+    globalvar = 0x3333;
+  }
+  uint64_t wait_t = 2000,
+  start = wall_clock64()/clockrate, cur;
+  do { cur = (wall_clock64()/clockrate)-start;}while (cur < wait_t);
+  if (globalvar != 0x3333) {
+    globalvar = 0x5555;
+  }
+#endif
+}
+
+__global__ void FourSecKernel_gfx11(int clockrate) {
+#ifdef __HIP_PLATFORM_AMD__
+  if (globalvar == 1) {
+    globalvar = 0x2222;
+  }
+  uint64_t wait_t = 4000,
+  start = wall_clock64()/clockrate, cur;
+  do { cur = (wall_clock64()/clockrate)-start;}while (cur < wait_t);
+  if (globalvar == 0x2222) {
+    globalvar = 0x4444;
+  }
+#endif
+}
+
 /*
  * In this Scenario, we create events by disabling the timing flag
  * We then Launch the kernel using hipExtModuleLaunchKernel by passing
@@ -76,13 +104,20 @@ bool DisableTimeFlag() {
   float time_4sec, time_2sec;
   hipEvent_t  start_event1, end_event1;
   int clkRate = 0;
-  HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+
+  if (IsGfx11()) {
+    HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
+  } else {
+    HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+  }
+
   HIPCHECK(hipEventCreateWithFlags(&start_event1,
                                    hipEventDisableTiming));
   HIPCHECK(hipEventCreateWithFlags(&end_event1,
                                    hipEventDisableTiming));
   HIPCHECK(hipStreamCreate(&stream1));
-  hipExtLaunchKernelGGL((TwoSecKernel), dim3(1), dim3(1), 0,
+  auto TwoSecKernel_used = IsGfx11() ? TwoSecKernel_gfx11 : TwoSecKernel;
+  hipExtLaunchKernelGGL((TwoSecKernel_used), dim3(1), dim3(1), 0,
       stream1, start_event1, end_event1, 0, clkRate);
   HIPCHECK(hipStreamSynchronize(stream1));
   e = hipEventElapsedTime(&time_2sec, start_event1, end_event1);
@@ -113,11 +148,20 @@ bool ConcurencyCheck_GlobalVar(int conc_flag) {
   int deviceGlobal_h = 0;
   HIPCHECK(hipSetDevice(0));
   int clkRate = 0;
-  HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+
+  if (IsGfx11()) {
+    HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
+  } else {
+    HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+  }
+
   HIPCHECK(hipStreamCreate(&stream1));
-  hipExtLaunchKernelGGL((FourSecKernel), dim3(1), dim3(1), 0,
+  auto TwoSecKernel_used = IsGfx11() ? TwoSecKernel_gfx11 : TwoSecKernel;
+  auto FourSecKernel_used = IsGfx11() ? FourSecKernel_gfx11 : FourSecKernel;
+
+  hipExtLaunchKernelGGL((FourSecKernel_used), dim3(1), dim3(1), 0,
                          stream1, nullptr, nullptr, conc_flag, clkRate);
-  hipExtLaunchKernelGGL((TwoSecKernel), dim3(1), dim3(1), 0,
+  hipExtLaunchKernelGGL((TwoSecKernel_used), dim3(1), dim3(1), 0,
                          stream1, nullptr, nullptr, conc_flag, clkRate);
   HIPCHECK(hipStreamSynchronize(stream1));
   HIPCHECK(hipMemcpyFromSymbol(&deviceGlobal_h, globalvar,
@@ -148,16 +192,24 @@ bool KernelTimeExecution() {
   hipEvent_t  start_event1, end_event1, start_event2, end_event2;
   float time_4sec, time_2sec;
   int clkRate = 0;
-  HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+
+  if (IsGfx11()) {
+    HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
+  } else {
+    HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+  }
+
+  auto TwoSecKernel_used = IsGfx11() ? TwoSecKernel_gfx11 : TwoSecKernel;
+  auto FourSecKernel_used = IsGfx11() ? FourSecKernel_gfx11 : FourSecKernel;
 
   HIPCHECK(hipEventCreate(&start_event1));
   HIPCHECK(hipEventCreate(&end_event1));
   HIPCHECK(hipEventCreate(&start_event2));
   HIPCHECK(hipEventCreate(&end_event2));
   HIPCHECK(hipStreamCreate(&stream1));
-  hipExtLaunchKernelGGL((FourSecKernel), dim3(1), dim3(1), 0,
+  hipExtLaunchKernelGGL((FourSecKernel_used), dim3(1), dim3(1), 0,
                          stream1, start_event1, end_event1, 0, clkRate);
-  hipExtLaunchKernelGGL((TwoSecKernel), dim3(1), dim3(1), 0,
+  hipExtLaunchKernelGGL((TwoSecKernel_used), dim3(1), dim3(1), 0,
                           stream1, start_event2, end_event2, 0, clkRate);
   HIPCHECK(hipStreamSynchronize(stream1));
   e = hipEventElapsedTime(&time_4sec, start_event1, end_event1);
diff --git a/tests/src/runtimeApi/module/hipExtModuleLaunchKernel.cpp b/tests/src/runtimeApi/module/hipExtModuleLaunchKernel.cpp
index 866332893b..65be909591 100644
--- a/tests/src/runtimeApi/module/hipExtModuleLaunchKernel.cpp
+++ b/tests/src/runtimeApi/module/hipExtModuleLaunchKernel.cpp
@@ -49,10 +49,7 @@
 
 #define fileName "matmul.code"
 #define matmulK "matmulK"
-#define SixteenSec "SixteenSecKernel"
 #define KernelandExtra "KernelandExtraParams"
-#define FourSec "FourSecKernel"
-#define TwoSec "TwoSecKernel"
 #define globalDevVar "deviceGlobal"
 #define dummyKernel "dummyKernel"
 #define FOURSEC_KERNEL 4999
@@ -124,7 +121,12 @@ void ModuleLaunchKernel::AllocateMemory() {
   HIPCHECK(hipMemcpy(Ad, A, SIZE*sizeof(int), hipMemcpyHostToDevice));
   HIPCHECK(hipMemcpy(Bd, B, SIZE*sizeof(int), hipMemcpyHostToDevice));
   int clkRate = 0;
-  HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+  if (IsGfx11()) {
+    HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
+  } else {
+    HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+  }
+
   args1._Ad = Ad;
   args1._Bd = Bd;
   args1._Cd = C;
@@ -149,13 +151,21 @@ void ModuleLaunchKernel::AllocateMemory() {
 }
 
 void ModuleLaunchKernel::ModuleLoad() {
+
+  std::string TwoSecStr = IsGfx11() ? std::string("TwoSecKernel_gfx11")
+                                      : std::string("TwoSecKernel");
+  std::string FourSecStr = IsGfx11() ? std::string("FourSecKernel_gfx11")
+                                       : std::string("FourSecKernel");
+  std::string SixteenSecStr = IsGfx11() ? std::string("SixteenSecKernel_gfx11")
+                                          : std::string("SixteenSecKernel");
+
   HIPCHECK(hipModuleLoad(&Module, fileName));
   HIPCHECK(hipModuleGetFunction(&MultKernel, Module, matmulK));
-  HIPCHECK(hipModuleGetFunction(&SixteenSecKernel, Module, SixteenSec));
+  HIPCHECK(hipModuleGetFunction(&SixteenSecKernel, Module, SixteenSecStr.c_str()));
   HIPCHECK(hipModuleGetFunction(&KernelandExtraParamKernel,
                                 Module, KernelandExtra));
-  HIPCHECK(hipModuleGetFunction(&FourSecKernel, Module, FourSec));
-  HIPCHECK(hipModuleGetFunction(&TwoSecKernel, Module, TwoSec));
+  HIPCHECK(hipModuleGetFunction(&FourSecKernel, Module, FourSecStr.c_str()));
+  HIPCHECK(hipModuleGetFunction(&TwoSecKernel, Module, TwoSecStr.c_str()));
   HIPCHECK(hipModuleGetFunction(&DummyKernel, Module, dummyKernel));
   HIPCHECK(hipModuleGetGlobal(&deviceGlobal, &deviceGlobalSize,
                               Module, globalDevVar));
diff --git a/tests/src/runtimeApi/module/matmul.cpp b/tests/src/runtimeApi/module/matmul.cpp
index 08a5a11018..de7c935f53 100644
--- a/tests/src/runtimeApi/module/matmul.cpp
+++ b/tests/src/runtimeApi/module/matmul.cpp
@@ -79,5 +79,41 @@ extern "C" __global__ void FourSecKernel(int clockrate) {
   }
 }
 
+extern "C" __global__ void SixteenSecKernel_gfx11(int clockrate) {
+#ifdef __HIP_PLATFORM_AMD__
+  uint64_t wait_t = 16000,
+  start = wall_clock64()/clockrate, cur;
+  do { cur = wall_clock64()/clockrate-start;}while (cur < wait_t);
+#endif
+}
+
+extern "C" __global__ void TwoSecKernel_gfx11(int clockrate) {
+#ifdef __HIP_PLATFORM_AMD__
+  if (deviceGlobal == 0x2222) {
+    deviceGlobal = 0x3333;
+  }
+  uint64_t wait_t = 2000,
+  start = wall_clock64()/clockrate, cur;
+  do { cur = wall_clock64()/clockrate-start;}while (cur < wait_t);
+  if (deviceGlobal != 0x3333) {
+    deviceGlobal = 0x5555;
+  }
+#endif
+}
+
+extern "C" __global__ void FourSecKernel_gfx11(int clockrate) {
+#ifdef __HIP_PLATFORM_AMD__
+  if (deviceGlobal == 1) {
+    deviceGlobal = 0x2222;
+  }
+  uint64_t wait_t = 4000,
+  start = wall_clock64()/clockrate, cur;
+  do { cur = wall_clock64()/clockrate-start;}while (cur < wait_t);
+  if (deviceGlobal == 0x2222) {
+    deviceGlobal = 0x4444;
+  }
+#endif
+}
+
 extern "C" __global__ void dummyKernel() {
 }
diff --git a/tests/src/runtimeApi/stream/hipStreamACb_AltEnqueue.cpp b/tests/src/runtimeApi/stream/hipStreamACb_AltEnqueue.cpp
index 3188d305bc..694375925b 100644
--- a/tests/src/runtimeApi/stream/hipStreamACb_AltEnqueue.cpp
+++ b/tests/src/runtimeApi/stream/hipStreamACb_AltEnqueue.cpp
@@ -64,6 +64,25 @@ __global__ void vector_square(float* C_d, float* A_d, size_t Num) {
     }
 }
 
+__global__ void vector_square_gfx11(float* C_d, float* A_d, size_t Num) {
+#ifdef __HIP_PLATFORM_AMD__
+  size_t gputhread = (blockIdx.x * blockDim.x + threadIdx.x);
+  size_t stride = blockDim.x * gridDim.x;
+
+  for (size_t i = gputhread; i < Num; i += stride) {
+      C_d[i] = A_d[i] * A_d[i];
+  }
+
+  // Delay thread 1 only in the GPU
+  if (gputhread == 1) {
+    unsigned long long int wait_t = 3200000000, start = wall_clock64(), cur;
+      do {
+          cur = wall_clock64() - start;
+      } while (cur < wait_t);
+    }
+#endif
+}
+
 float *A_h, *C_h, *A_h1, *C_h1;
 
 static void HIPRT_CB Callback_Stream1(hipStream_t stream, hipError_t status,
@@ -129,8 +148,9 @@ int main(int argc, char* argv[]) {
   int *ptr = NULL;
   int *ptr1 = NULL;
   // Queing jobs in both mystream1/2 followed by hipStreamAddCallback
+  auto vector_square_used = IsGfx11() ? vector_square_gfx11 : vector_square;
   for (int i = 1; i < 5; ++i) {
-    hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock),
+    hipLaunchKernelGGL((vector_square_used), dim3(blocks), dim3(threadsPerBlock),
                        0, mystream1, C_d, A_d, Num);
     HIPCHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost,
                             mystream1));
@@ -139,7 +159,7 @@ int main(int argc, char* argv[]) {
     HIPCHECK(hipStreamAddCallback(mystream1, Callback_Stream1,
                                   reinterpret_cast<void*>(ptr), 0));
 
-    hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock),
+    hipLaunchKernelGGL((vector_square_used), dim3(blocks), dim3(threadsPerBlock),
                        0, mystream2, C_d, A_d, Num);
     HIPCHECK(hipMemcpyAsync(C_h1, C_d, Nbytes,
                             hipMemcpyDeviceToHost, mystream2));
diff --git a/tests/src/runtimeApi/stream/hipStreamACb_MultiCalls.cpp b/tests/src/runtimeApi/stream/hipStreamACb_MultiCalls.cpp
index 15e149a3e6..86dd214361 100644
--- a/tests/src/runtimeApi/stream/hipStreamACb_MultiCalls.cpp
+++ b/tests/src/runtimeApi/stream/hipStreamACb_MultiCalls.cpp
@@ -64,6 +64,25 @@ __global__ void vector_square(float* C_d, float* A_d, size_t Num) {
   }
 }
 
+__global__ void vector_square_gfx11(float* C_d, float* A_d, size_t Num) {
+#ifdef __HIP_PLATFORM_AMD__
+  size_t gputhread = (blockIdx.x * blockDim.x + threadIdx.x);
+  size_t stride = blockDim.x * gridDim.x;
+
+  for (size_t i = gputhread; i < Num; i += stride) {
+    C_d[i] = A_d[i] * A_d[i];
+  }
+
+  // Delay thread 1 only in the GPU
+  if (gputhread == 1) {
+    unsigned long long int wait_t = 3200000000, start = wall_clock64(), cur;
+    do {
+      cur = wall_clock64() - start;
+    } while (cur < wait_t);
+  }
+#endif
+}
+
 static void HIPRT_CB Stream_Callback(hipStream_t stream, hipError_t status,
                                      void* userData) {
   for (size_t i = 0; i < Num; i++) {
@@ -100,7 +119,8 @@ int main(int argc, char* argv[]) {
 
   const unsigned threadsPerBlock = 256;
   const unsigned blocks = (Num+255)/threadsPerBlock;
-  hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock), 0,
+  auto vector_square_used = IsGfx11() ? vector_square_gfx11 : vector_square;
+  hipLaunchKernelGGL((vector_square_used), dim3(blocks), dim3(threadsPerBlock), 0,
                       mystream, C_d, A_d, Num);
 
   HIPCHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, mystream));
diff --git a/tests/src/runtimeApi/stream/hipStreamACb_MultiThread.cpp b/tests/src/runtimeApi/stream/hipStreamACb_MultiThread.cpp
index 704621761e..e6b702fa3b 100644
--- a/tests/src/runtimeApi/stream/hipStreamACb_MultiThread.cpp
+++ b/tests/src/runtimeApi/stream/hipStreamACb_MultiThread.cpp
@@ -64,6 +64,24 @@ __global__ void vector_square(float* C_d, float* A_d, size_t Num) {
   }
 }
 
+__global__ void vector_square_gfx11(float* C_d, float* A_d, size_t Num) {
+#ifdef __HIP_PLATFORM_AMD__
+  size_t gputhread = (blockIdx.x * blockDim.x + threadIdx.x);
+  size_t stride = blockDim.x * gridDim.x;
+
+  for (size_t i = gputhread; i < Num; i += stride) {
+    C_d[i] = A_d[i] * A_d[i];
+  }
+
+  // Delay thread 1 only in the GPU
+  if (gputhread == 1) {
+    unsigned long long int wait_t = 3200000000, start = wall_clock64(), cur;
+    do {
+      cur = wall_clock64() - start;
+    } while (cur < wait_t);
+  }
+#endif
+}
 
 static void HIPRT_CB Thread1_Callback(hipStream_t stream, hipError_t status,
                                       void* userData) {
@@ -124,7 +142,8 @@ int main(int argc, char* argv[]) {
   const unsigned threadsPerBlock = 256;
   const unsigned blocks = (Num+255)/threadsPerBlock;
 
-  hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock), 0,
+  auto vector_square_used = IsGfx11() ? vector_square_gfx11 : vector_square;
+  hipLaunchKernelGGL((vector_square_used), dim3(blocks), dim3(threadsPerBlock), 0,
                       mystream, C_d, A_d, Num);
 
   HIPCHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, mystream));
diff --git a/tests/src/runtimeApi/stream/hipStreamACb_StrmSyncTiming.cpp b/tests/src/runtimeApi/stream/hipStreamACb_StrmSyncTiming.cpp
index 14d9eab597..d079e6eb38 100644
--- a/tests/src/runtimeApi/stream/hipStreamACb_StrmSyncTiming.cpp
+++ b/tests/src/runtimeApi/stream/hipStreamACb_StrmSyncTiming.cpp
@@ -61,6 +61,25 @@ __global__ void vector_square(float* C_d, float* A_d, size_t N_elmts) {
   }
 }
 
+__global__ void vector_square_gfx11(float* C_d, float* A_d, size_t N_elmts) {
+#ifdef __HIP_PLATFORM_AMD__
+  size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
+  size_t stride = blockDim.x * gridDim.x;
+
+  for (size_t i = offset; i < N_elmts; i += stride) {
+    C_d[i] = A_d[i] * A_d[i];
+  }
+
+  // Delay the thread 1
+  if (offset == 1) {
+    unsigned long long int wait_t = 3200000000, start = wall_clock64(), cur;
+    do {
+      cur = wall_clock64() - start;
+    } while (cur < wait_t);
+  }
+#endif
+}
+
 float *A_h, *C_h;
 
 static void HIPRT_CB Callback1(hipStream_t stream, hipError_t status,
@@ -102,7 +121,8 @@ int main(int argc, char* argv[]) {
   const unsigned threadsPerBlock = 256;
   const unsigned blocks = (N_elmts + 255)/threadsPerBlock;
 
-  hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock), 0,
+  auto vector_square_used = IsGfx11() ? vector_square_gfx11 : vector_square;
+  hipLaunchKernelGGL((vector_square_used), dim3(blocks), dim3(threadsPerBlock), 0,
                       mystream, C_d, A_d, N_elmts);
   HIPCHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, mystream));
   HIPCHECK(hipStreamAddCallback(mystream, Callback1, NULL, 0));
diff --git a/tests/src/test_common.cpp b/tests/src/test_common.cpp
index 03464fcba5..ec76df0427 100644
--- a/tests/src/test_common.cpp
+++ b/tests/src/test_common.cpp
@@ -92,6 +92,32 @@ size_t getHostThreadCount(const size_t memPerThread, const size_t maxThreads) {
   return thread_count;
 }
 
+// Function to determine if the device is of gfx11 architecture
+bool IsGfx11() {
+#if defined(__HIP_PLATFORM_NVIDIA__)
+  return false;
+#elif defined(__HIP_PLATFORM_AMD__)
+  int device = -1;
+  hipDeviceProp_t props{};
+  HIPCHECK(hipGetDevice(&device));
+  HIPCHECK(hipGetDeviceProperties(&props, device));
+
+  // Get GCN Arch Name and compare to check if it is gfx11
+  std::string arch = std::string(props.gcnArchName);
+  auto pos = arch.find(":");
+  if (pos != std::string::npos)
+    arch = arch.substr(0, pos);
+
+  if(arch.size() >= 5)
+    arch = arch.substr(0,5);
+
+  return (arch == std::string("gfx11")) ? true : false;
+#else
+  std::cout<<"Have to be either Nvidia or AMD platform, asserting"<<std::endl;
+  assert(false);
+#endif
+}
+
 namespace HipTest {
 
 
@@ -232,5 +258,4 @@ unsigned setNumBlocks(unsigned blocksPerCU, unsigned threadsPerBlock, size_t N)
     return blocks;
 }
 
-
 }  // namespace HipTest
diff --git a/tests/src/test_common.h b/tests/src/test_common.h
index 21e82a71ca..853ab1f0da 100644
--- a/tests/src/test_common.h
+++ b/tests/src/test_common.h
@@ -581,3 +581,6 @@ inline bool isImageSupported() {
     if (!HipTest::isImageSupported())                                                      \
         { printf("Texture is not support on the device. Skipped.\n"); passed(); }
 #endif //__cplusplus
+
+// Function to determine if the device is of gfx11 architecture
+bool IsGfx11();
\ No newline at end of file