From d8b7cb28ff91e930fea1effd1003285eec326c9a Mon Sep 17 00:00:00 2001
From: ROCm CI Service Account <66695075+rocm-ci@users.noreply.github.com>
Date: Sat, 7 Jan 2023 04:35:21 +0530
Subject: [PATCH] SWDEV-337452 - Changing Clock64 to WallClock64 in tests for
 gfx11. (#78)

Change-Id: I484fe9ff7cd56c70a37a3ac5a4a55812f8557259


[ROCm/hip-tests commit: 87fac87657e661f44ddf5718ffbdaf61a452f21e]
---
 .../catch/include/hip_test_common.hh          |  42 ++++++-
 projects/hip-tests/catch/include/utils.hh     |   6 +-
 .../multiproc/hipMemCoherencyTstMProc.cc      |  38 +++++-
 .../catch/unit/event/Unit_hipEventQuery.cc    |  17 ++-
 .../hipGraphExecEventWaitNodeSetEvent.cc      |  19 ++-
 .../catch/unit/memory/hipMemPoolApi.cc        | 110 +++++++++++++++---
 .../unit/memory/hipMemoryAllocateCoherent.cc  |  24 +++-
 .../unit/stream/hipStreamACb_MultiThread.cc   |  21 +++-
 .../catch/unit/stream/hipStreamWaitEvent.cc   |  23 +++-
 .../streamperthread/hipStreamPerThrdTsts.cc   |  89 ++++++++++++--
 10 files changed, 339 insertions(+), 50 deletions(-)

diff --git a/projects/hip-tests/catch/include/hip_test_common.hh b/projects/hip-tests/catch/include/hip_test_common.hh
index 5f7b197596..a7be3baca9 100644
--- a/projects/hip-tests/catch/include/hip_test_common.hh
+++ b/projects/hip-tests/catch/include/hip_test_common.hh
@@ -140,6 +140,31 @@ static void initHipCtx(hipCtx_t* pcontext) {
 #define HIP_ARRAY hipArray*
 #endif
 
+static inline bool IsGfx11() {
+#if defined(HT_NVIDIA)
+  return false;
+#elif defined(HT_AMD)
+  int device = -1;
+  hipDeviceProp_t props{};
+  HIP_CHECK(hipGetDevice(&device));
+  HIP_CHECK(hipGetDeviceProperties(&props, device));
+
+   // Get GCN Arch Name and compare to check if it is gfx11
+  std::string arch = std::string(props.gcnArchName);
+  auto pos = arch.find(":");
+  if (pos != std::string::npos)
+    arch = arch.substr(0, pos);
+
+  if(arch.size() >= 5)
+    arch = arch.substr(0,5);
+
+  return (arch == std::string("gfx11")) ? true : false;
+#else
+  std::cout<<"Have to be either Nvidia or AMD platform, asserting"<<std::endl;
+  assert(false);
+#endif
+}
+
 
 // Utility Functions
 namespace HipTest {
@@ -335,6 +360,14 @@ static __global__ void waitKernel(clock_t offset) {
   }
 }
 
+static __global__ void waitKernel_gfx11(clock_t offset) {
+#if HT_AMD
+  auto start = wall_clock64();
+  while ((wall_clock64() - start) < offset) {
+  }
+#endif
+}
+
 // helper function used to set the device frequency variable
 // estimates the number of clock ticks in 1 second
 static size_t findTicksPerSecond() {
@@ -350,9 +383,9 @@ static size_t findTicksPerSecond() {
   hipEvent_t start, stop;
   HIP_CHECK(hipEventCreate(&start));
   HIP_CHECK(hipEventCreate(&stop));
-
+  auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel;
   // Warmup
-  hipLaunchKernelGGL(waitKernel, dim3(1), dim3(1), 0, 0, clockTicksPerSecond);
+  hipLaunchKernelGGL(waitKernel_used, dim3(1), dim3(1), 0, 0, clockTicksPerSecond);
   HIP_CHECK(hipGetLastError());
   HIP_CHECK(hipDeviceSynchronize());
 
@@ -360,7 +393,7 @@ static size_t findTicksPerSecond() {
   // after 10 attempts the result is likely good enough so just accept it
   for (int attempts = 10; attempts > 0; --attempts) {
     HIP_CHECK(hipEventRecord(start));
-    hipLaunchKernelGGL(waitKernel, dim3(1), dim3(1), 0, 0, clockTicksPerSecond);
+    hipLaunchKernelGGL(waitKernel_used, dim3(1), dim3(1), 0, 0, clockTicksPerSecond);
     HIP_CHECK(hipEventRecord(stop));
     HIP_CHECK(hipGetLastError());
     HIP_CHECK(hipEventSynchronize(stop));
@@ -396,7 +429,8 @@ static inline void runKernelForDuration(std::chrono::milliseconds duration,
   // precision so that's acceptable.
   static size_t ticksPerSecond = findTicksPerSecond();
   const auto millis = duration.count();
-  hipLaunchKernelGGL(waitKernel, dim3(1), dim3(1), 0, stream, ticksPerSecond * millis / 1000);
+  auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel;
+  hipLaunchKernelGGL(waitKernel_used, dim3(1), dim3(1), 0, stream, ticksPerSecond * millis / 1000);
 }
 
 }  // namespace HipTest
diff --git a/projects/hip-tests/catch/include/utils.hh b/projects/hip-tests/catch/include/utils.hh
index f3d6debb26..bb7fce06e4 100644
--- a/projects/hip-tests/catch/include/utils.hh
+++ b/projects/hip-tests/catch/include/utils.hh
@@ -128,7 +128,11 @@ __global__ void Iota(T* const out, size_t pitch, size_t w, size_t h, size_t d) {
 inline void LaunchDelayKernel(const std::chrono::milliseconds interval, const hipStream_t stream) {
   int ticks_per_ms = 0;
   // Clock rate is in kHz => number of clock ticks in a millisecond
-  HIP_CHECK(hipDeviceGetAttribute(&ticks_per_ms, hipDeviceAttributeClockRate, 0));
+  if (IsGfx11()) {
+    HIPCHECK(hipDeviceGetAttribute(&ticks_per_ms, hipDeviceAttributeWallClockRate, 0));
+  } else {
+    HIPCHECK(hipDeviceGetAttribute(&ticks_per_ms, hipDeviceAttributeClockRate, 0));
+  }
   Delay<<<1, 1, 0, stream>>>(interval.count(), ticks_per_ms);
   HIP_CHECK(hipGetLastError());
 }
diff --git a/projects/hip-tests/catch/multiproc/hipMemCoherencyTstMProc.cc b/projects/hip-tests/catch/multiproc/hipMemCoherencyTstMProc.cc
index 863dfc7178..8579aabc27 100644
--- a/projects/hip-tests/catch/multiproc/hipMemCoherencyTstMProc.cc
+++ b/projects/hip-tests/catch/multiproc/hipMemCoherencyTstMProc.cc
@@ -49,6 +49,19 @@ __global__  void CoherentTst(int *ptr, int PeakClk) {
   }
 }
 
+__global__  void CoherentTst_gfx11(int *ptr, int PeakClk) {
+#if HT_AMD
+  // Incrementing the value by 1
+  int64_t GpuFrq = int64_t(PeakClk) * 1000;
+  int64_t StrtTck = wall_clock64();
+  atomicAdd(ptr, 1);
+  // The following while loop checks the value in ptr for around 3-4 seconds
+  while ((wall_clock64() - StrtTck) <= (3 * GpuFrq)) {
+    if (atomicCAS(ptr, 3, 4) == 3) break;
+  }
+#endif
+}
+
 __global__  void SquareKrnl(int *ptr) {
   // ptr value squared here
   *ptr = (*ptr) * (*ptr);
@@ -64,14 +77,27 @@ static void TstCoherency(int *Ptr, bool HmmMem) {
   HIP_CHECK(hipStreamCreate(&strm));
   // storing value 1 in the memory created above
   *Ptr = 1;
+
   // Getting gpu frequency
-  HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
-  if (!HmmMem) {
-    HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void **>(&Dptr), Ptr,
-                                      0));
-    CoherentTst<<<1, 1, 0, strm>>>(Dptr, peak_clk);
+  if (IsGfx11()) {
+    HIPCHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeWallClockRate, 0));
   } else {
-    CoherentTst<<<1, 1, 0, strm>>>(Ptr, peak_clk);
+    HIPCHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
+  }
+
+  if (!HmmMem) {
+    HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void **>(&Dptr), Ptr, 0));
+    if (IsGfx11()) {
+      CoherentTst_gfx11<<<1, 1, 0, strm>>>(Dptr, peak_clk);
+    } else {
+      CoherentTst<<<1, 1, 0, strm>>>(Dptr, peak_clk);
+    }
+  } else {
+    if (IsGfx11()) {
+      CoherentTst_gfx11<<<1, 1, 0, strm>>>(Ptr, peak_clk);
+    } else {
+      CoherentTst<<<1, 1, 0, strm>>>(Ptr, peak_clk);
+    }
   }
   // looping until the value is 2 for 3 seconds
   std::chrono::steady_clock::time_point start =
diff --git a/projects/hip-tests/catch/unit/event/Unit_hipEventQuery.cc b/projects/hip-tests/catch/unit/event/Unit_hipEventQuery.cc
index d355a515e5..47d8995c18 100644
--- a/projects/hip-tests/catch/unit/event/Unit_hipEventQuery.cc
+++ b/projects/hip-tests/catch/unit/event/Unit_hipEventQuery.cc
@@ -34,6 +34,20 @@ __global__ void waitKernel(int clockRate, int seconds) {
   }
 }
 
+__global__ void waitKernel_gfx11(int clockRate, int seconds) {
+#if HT_AMD
+  auto start = wall_clock64();
+  auto ms = seconds * 1000;
+  long long waitTill = clockRate * (long long)ms;
+  while (1) {
+    auto end = wall_clock64();
+    if ((end - start) > waitTill) {
+      return;
+    }
+  }
+#endif
+}
+
 TEST_CASE("Unit_hipEventQuery_DifferentDevice") {
   hipEvent_t event1{}, event2{};
   HIP_CHECK(hipEventCreate(&event1));
@@ -54,9 +68,10 @@ TEST_CASE("Unit_hipEventQuery_DifferentDevice") {
     HIP_CHECK(hipSetDevice(0));
     HIP_CHECK(hipEventRecord(event1, stream));
 
+    auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel;
     // Start kernel and wait for 3 seconds
     // Make sure you increase this time if you add more tests here
-    waitKernel<<<1, 1, 0, stream>>>(clockRate, 3);
+    waitKernel_used<<<1, 1, 0, stream>>>(clockRate, 3);
 
     HIP_CHECK(hipEventRecord(event2, stream));
 
diff --git a/projects/hip-tests/catch/unit/graph/hipGraphExecEventWaitNodeSetEvent.cc b/projects/hip-tests/catch/unit/graph/hipGraphExecEventWaitNodeSetEvent.cc
index 5a0d20f541..f8ee63d6f8 100644
--- a/projects/hip-tests/catch/unit/graph/hipGraphExecEventWaitNodeSetEvent.cc
+++ b/projects/hip-tests/catch/unit/graph/hipGraphExecEventWaitNodeSetEvent.cc
@@ -67,6 +67,16 @@ static __global__ void sqr_ker_func(int* a, int* b, int clockrate) {
   do { cur = clock64()/clockrate - start;}while (cur < wait_t);
 }
 
+static __global__ void sqr_ker_func_gfx11(int* a, int* b, int clockrate) {
+#if HT_AMD
+  int tx = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x;
+  if (tx < LEN) b[tx] = a[tx]*a[tx];
+  uint64_t wait_t = DELAY_IN_MS,
+  start = wall_clock64()/clockrate, cur;
+  do { cur = wall_clock64()/clockrate - start;}while (cur < wait_t);
+#endif
+}
+
 /**
  * Scenario 1: Test to validate setting different events in executable graph.
  */
@@ -106,10 +116,15 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") {
                 inp_h, memsize, hipMemcpyHostToDevice));
   // Get device clock rate
   int clkRate = 0;
-  HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+  if (IsGfx11()) {
+    HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
+  } else {
+    HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+  }
   // kernel1
+  auto sqr_ker_func_used = IsGfx11() ? sqr_ker_func_gfx11 : sqr_ker_func;
   void* kernelArgs[] = {&inp_d, &out_d, reinterpret_cast<void *>(&clkRate)};
-  kernelNodeParams1.func = reinterpret_cast<void *>(sqr_ker_func);
+  kernelNodeParams1.func = reinterpret_cast<void *>(sqr_ker_func_used);
   kernelNodeParams1.gridDim = dim3(GRID_DIM);
   kernelNodeParams1.blockDim = dim3(BLK_DIM);
   kernelNodeParams1.sharedMemBytes = 0;
diff --git a/projects/hip-tests/catch/unit/memory/hipMemPoolApi.cc b/projects/hip-tests/catch/unit/memory/hipMemPoolApi.cc
index b09d2432f4..927899f1b7 100644
--- a/projects/hip-tests/catch/unit/memory/hipMemPoolApi.cc
+++ b/projects/hip-tests/catch/unit/memory/hipMemPoolApi.cc
@@ -123,6 +123,21 @@ __global__ void kernel500ms(float* hostRes, int clkRate) {
   }
 }
 
+__global__ void kernel500ms_gfx11(float* hostRes, int clkRate) {
+#if HT_AMD
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  hostRes[tid] = tid + 1;
+  __threadfence_system();
+  // expecting that the data is getting flushed to host here!
+  uint64_t start = wall_clock64()/clkRate, cur;
+  if (clkRate > 1) {
+    do { cur = wall_clock64()/clkRate-start;}while (cur < wait_ms);
+  } else {
+    do { cur = wall_clock64()/start;}while (cur < wait_ms);
+  }
+#endif
+}
+
 TEST_CASE("Unit_hipMemPoolApi_BasicAlloc") {
   int mem_pool_support = 0;
   HIP_CHECK(hipDeviceGetAttribute(&mem_pool_support, hipDeviceAttributeMemoryPoolsSupported, 0));
@@ -147,9 +162,14 @@ TEST_CASE("Unit_hipMemPoolApi_BasicAlloc") {
   int blocks = 1024;
   int clkRate;
   hipMemPoolAttr attr;
-  HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+  if (IsGfx11()) {
+    HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
+    kernel500ms_gfx11<<<32, blocks, 0, stream>>>(B, clkRate);
+  } else {
+    HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
 
-  kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
+    kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
+  }
 
   HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(B), stream));
 
@@ -229,9 +249,14 @@ TEST_CASE("Unit_hipMemPoolApi_BasicTrim") {
 
   int blocks = 2;
   int clkRate;
-  HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+  if (IsGfx11()) {
+    HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
+    kernel500ms_gfx11<<<32, blocks, 0, stream>>>(B, clkRate);
+  } else {
+    HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
 
-  kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
+    kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
+  }
 
   hipMemPoolAttr attr;
   attr = hipMemPoolAttrReleaseThreshold;
@@ -312,9 +337,15 @@ TEST_CASE("Unit_hipMemPoolApi_BasicReuse") {
 
   int blocks = 2;
   int clkRate;
-  HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
 
-  kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
+  if (IsGfx11()) {
+    HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
+    kernel500ms_gfx11<<<32, blocks, 0, stream>>>(A, clkRate);
+  } else {
+    HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+
+    kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
+  }
 
   hipMemPoolAttr attr;
   // Not a real free, since kernel isn't done
@@ -329,7 +360,11 @@ TEST_CASE("Unit_hipMemPoolApi_BasicReuse") {
   HIP_CHECK(hipStreamSynchronize(stream));
 
   // Second kernel launch with new memory
-  kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
+  if (IsGfx11()) {
+    kernel500ms_gfx11<<<32, blocks, 0, stream>>>(B, clkRate);
+  } else {
+    kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
+  }
 
   HIP_CHECK(hipStreamSynchronize(stream));
 
@@ -369,7 +404,11 @@ TEST_CASE("Unit_hipMemPoolApi_Opportunistic") {
   hipMemPoolAttr attr;
   int blocks = 2;
   int clkRate;
-  HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+   if (IsGfx11()) {
+    HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
+  } else {
+    HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+  }
 
   float *A, *B, *C;
   hipStream_t stream, stream2;
@@ -395,7 +434,11 @@ TEST_CASE("Unit_hipMemPoolApi_Opportunistic") {
     HIP_CHECK(hipMemPoolSetAttribute(mem_pool, attr, &value));
 
     // Run kernel for 500 ms in the first stream
-    kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
+    if (IsGfx11()) {
+      kernel500ms_gfx11<<<32, blocks, 0, stream>>>(A, clkRate);
+    } else {
+      kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
+    }
 
     // Not a real free, since kernel isn't done
     HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(A), stream));
@@ -410,7 +453,11 @@ TEST_CASE("Unit_hipMemPoolApi_Opportunistic") {
     REQUIRE(A != B);
 
     // Run kernel with the new memory in the second stream
-    kernel500ms<<<32, blocks, 0, stream2>>>(B, clkRate);
+    if (IsGfx11()) {
+      kernel500ms_gfx11<<<32, blocks, 0, stream>>>(B, clkRate);
+    } else {
+      kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
+    }
 
     HIP_CHECK(hipStreamSynchronize(stream));
     HIP_CHECK(hipStreamSynchronize(stream2));
@@ -428,7 +475,13 @@ TEST_CASE("Unit_hipMemPoolApi_Opportunistic") {
     HIP_CHECK(hipMemPoolSetAttribute(mem_pool, attr, &value));
 
     // Run kernel for 500 ms in the first stream
-    kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
+    if (IsGfx11()) {
+      HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
+      kernel500ms_gfx11<<<32, blocks, 0, stream>>>(A, clkRate);
+    } else {
+      HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+      kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
+    }
 
     // Not a real free, since kernel isn't done
     HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(A), stream));
@@ -443,7 +496,11 @@ TEST_CASE("Unit_hipMemPoolApi_Opportunistic") {
     REQUIRE(A == B);
 
     // Run kernel with the new memory in the second stream
-    kernel500ms<<<32, blocks, 0, stream2>>>(B, clkRate);
+    if (IsGfx11()) {
+      kernel500ms_gfx11<<<32, blocks, 0, stream>>>(B, clkRate);
+    } else {
+      kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
+    }
 
     HIP_CHECK(hipStreamSynchronize(stream));
     HIP_CHECK(hipStreamSynchronize(stream2));
@@ -461,7 +518,12 @@ TEST_CASE("Unit_hipMemPoolApi_Opportunistic") {
     HIP_CHECK(hipMemPoolSetAttribute(mem_pool, attr, &value));
 
     // Run kernel for 500 ms in the first stream
-    kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
+      
+    if (IsGfx11()) {
+      kernel500ms_gfx11<<<32, blocks, 0, stream>>>(A, clkRate);
+    } else {
+      kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
+    }
 
     // Not a real free, since kernel isn't done
     HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(A), stream));
@@ -473,7 +535,11 @@ TEST_CASE("Unit_hipMemPoolApi_Opportunistic") {
     REQUIRE(A != B);
 
     // Run kernel with the new memory in the second stream
-    kernel500ms<<<32, blocks, 0, stream2>>>(B, clkRate);
+    if (IsGfx11()) {
+      kernel500ms_gfx11<<<32, blocks, 0, stream>>>(B, clkRate);
+    } else {
+      kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
+    }
 
     HIP_CHECK(hipStreamSynchronize(stream));
     HIP_CHECK(hipStreamSynchronize(stream2));
@@ -510,9 +576,15 @@ TEST_CASE("Unit_hipMemPoolApi_Default") {
 
   int blocks = 2;
   int clkRate;
-  HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+    
+  if (IsGfx11()) {
+    HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
+    kernel500ms_gfx11<<<32, blocks, 0, stream>>>(A, clkRate);
+  } else {
+    HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
 
-  kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
+    kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
+  }
 
   hipMemPoolAttr attr;
   // Not a real free, since kernel isn't done
@@ -527,7 +599,11 @@ TEST_CASE("Unit_hipMemPoolApi_Default") {
   HIP_CHECK(hipStreamSynchronize(stream));
 
   // Second kernel launch with new memory
-  kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
+  if (IsGfx11()) {
+    kernel500ms_gfx11<<<32, blocks, 0, stream>>>(B, clkRate);
+  } else {
+    kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
+  }
 
   HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(B), stream));
 
diff --git a/projects/hip-tests/catch/unit/memory/hipMemoryAllocateCoherent.cc b/projects/hip-tests/catch/unit/memory/hipMemoryAllocateCoherent.cc
index 75c6c87b4e..e9fed1fc84 100644
--- a/projects/hip-tests/catch/unit/memory/hipMemoryAllocateCoherent.cc
+++ b/projects/hip-tests/catch/unit/memory/hipMemoryAllocateCoherent.cc
@@ -41,6 +41,21 @@ __global__ void Kernel(float* hostRes, int clkRate) {
   }
 }
 
+__global__ void Kernel_gfx11(float* hostRes, int clkRate) {
+#if HT_AMD
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  hostRes[tid] = tid + 1;
+  __threadfence_system();
+  // expecting that the data is getting flushed to host here!
+  uint64_t start = wall_clock64()/clkRate, cur;
+  if (clkRate > 1) {
+    do { cur = wall_clock64()/clkRate-start;}while (cur < wait_sec);
+  } else {
+    do { cur = wall_clock64()/start;}while (cur < wait_sec);
+  }
+#endif
+}
+
 TEST_CASE("Unit_hipHostMalloc_CoherentAccess") {
   int blocks = 2;
   float* hostRes;
@@ -49,9 +64,14 @@ TEST_CASE("Unit_hipHostMalloc_CoherentAccess") {
   hostRes[0] = 0;
   hostRes[1] = 0;
   int clkRate;
-  HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+  if (IsGfx11()) {
+    HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
+  } else {
+    HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+  }
   std::cout << clkRate << std::endl;
-  hipLaunchKernelGGL(HIP_KERNEL_NAME(Kernel), dim3(1), dim3(blocks),
+  auto Kernel_used = IsGfx11() ? Kernel_gfx11 : Kernel;
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(Kernel_used), dim3(1), dim3(blocks),
                      0, 0, hostRes, clkRate);
   HIP_CHECK(hipGetLastError());
   int eleCounter = 0;
diff --git a/projects/hip-tests/catch/unit/stream/hipStreamACb_MultiThread.cc b/projects/hip-tests/catch/unit/stream/hipStreamACb_MultiThread.cc
index 09946ced5d..2825914cba 100644
--- a/projects/hip-tests/catch/unit/stream/hipStreamACb_MultiThread.cc
+++ b/projects/hip-tests/catch/unit/stream/hipStreamACb_MultiThread.cc
@@ -53,6 +53,24 @@ static __global__ void device_function(float* C_d, float* A_d, size_t Num) {
   }
 }
 
+static __global__ void device_function_gfx11(float* C_d, float* A_d, size_t Num) {
+#if HT_AMD
+  size_t gputhread = (blockIdx.x * blockDim.x + threadIdx.x);
+  size_t stride = blockDim.x * gridDim.x;
+
+  for (size_t i = gputhread; i < Num; i += stride) {
+    C_d[i] = A_d[i] * A_d[i];
+  }
+
+  // Delay thread 1 only in the GPU
+  if (gputhread == 1) {
+    uint64_t wait_t = 3200000000, start = wall_clock64(), cur;
+    do {
+      cur = wall_clock64() - start;
+    } while (cur < wait_t);
+  }
+#endif
+}
 
 static void HIPRT_CB Thread1_Callback(hipStream_t stream, hipError_t status,
                                       void* userData) {
@@ -128,7 +146,8 @@ TEST_CASE("Unit_hipStreamAddCallback_MultipleThreads") {
   constexpr unsigned threadsPerBlock = 256;
   constexpr unsigned blocks = (N + 255)/threadsPerBlock;
 
-  hipLaunchKernelGGL((device_function), dim3(blocks),
+  auto device_function_used = IsGfx11() ? device_function_gfx11 : device_function;
+  hipLaunchKernelGGL((device_function_used), dim3(blocks),
                      dim3(threadsPerBlock), 0,
                      mystream, C_d, A_d, N);
   HIP_CHECK(hipGetLastError());
diff --git a/projects/hip-tests/catch/unit/stream/hipStreamWaitEvent.cc b/projects/hip-tests/catch/unit/stream/hipStreamWaitEvent.cc
index d050efea2a..7965bc19ef 100644
--- a/projects/hip-tests/catch/unit/stream/hipStreamWaitEvent.cc
+++ b/projects/hip-tests/catch/unit/stream/hipStreamWaitEvent.cc
@@ -94,6 +94,20 @@ __global__ void waitKernel(int clockRate, int seconds) {
   }
 }
 
+__global__ void waitKernel_gfx11(int clockRate, int seconds) {
+#if HT_AMD
+  auto start = wall_clock64();
+  auto ms = seconds * 1000;
+  long long waitTill = clockRate * (long long)ms;
+  while (1) {
+    auto end = wall_clock64();
+    if ((end - start) > waitTill) {
+      return;
+    }
+  }
+#endif
+}
+
 TEST_CASE("Unit_hipStreamWaitEvent_Default") {
   hipStream_t stream{nullptr};
   hipEvent_t waitEvent{nullptr};
@@ -111,7 +125,8 @@ TEST_CASE("Unit_hipStreamWaitEvent_Default") {
   HIP_CHECK(hipGetDeviceProperties(&prop, deviceId));
   auto clockRate = prop.clockRate;
 
-  waitKernel<<<1, 1, 0, stream>>>(clockRate, 2);  // Wait for 2 seconds
+  auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel;
+  waitKernel_used<<<1, 1, 0, stream>>>(clockRate, 2);  // Wait for 2 seconds
 
   HIP_CHECK(hipEventRecord(waitEvent, stream));
 
@@ -145,8 +160,8 @@ TEST_CASE("Unit_hipStreamWaitEvent_DifferentStreams") {
   hipDeviceProp_t prop{};
   HIP_CHECK(hipGetDeviceProperties(&prop, deviceId));
   auto clockRate = prop.clockRate;
-
-  waitKernel<<<1, 1, 0, blockedStreamA>>>(clockRate,
+  auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel;
+  waitKernel_used<<<1, 1, 0, blockedStreamA>>>(clockRate,
                                           3);  // wait for 3 seconds
   HIP_CHECK(hipEventRecord(waitEvent, blockedStreamA));
 
@@ -155,7 +170,7 @@ TEST_CASE("Unit_hipStreamWaitEvent_DifferentStreams") {
 
   HIP_CHECK(hipStreamWaitEvent(streamBlockedOnStreamA, waitEvent, 0));
 
-  waitKernel<<<1, 1, 0, streamBlockedOnStreamA>>>(clockRate, 2);  // Wait for 2 seconds
+  waitKernel_used<<<1, 1, 0, streamBlockedOnStreamA>>>(clockRate, 2);  // Wait for 2 seconds
 
   HIP_CHECK(hipStreamSynchronize(unblockingStream));
 
diff --git a/projects/hip-tests/catch/unit/streamperthread/hipStreamPerThrdTsts.cc b/projects/hip-tests/catch/unit/streamperthread/hipStreamPerThrdTsts.cc
index dec35e100c..c14b38ce19 100644
--- a/projects/hip-tests/catch/unit/streamperthread/hipStreamPerThrdTsts.cc
+++ b/projects/hip-tests/catch/unit/streamperthread/hipStreamPerThrdTsts.cc
@@ -95,6 +95,39 @@ __global__ void StreamPerThrd1(int *A, int Pk_Clk) {
   *A = 1;
 }
 
+__global__ void StreamPerThrd_gfx11(int *Ad, int *Ad1, size_t n, int Pk_Clk,
+                              int Wait, int WaitEvnt  = 0) {
+#if HT_AMD
+  size_t index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < n) {
+    Ad[index] = Ad[index] + 10;
+  }
+  if (Wait) {
+    int64_t GpuFrq = (Pk_Clk * 1000);
+    int64_t StrtTck = wall_clock64();
+    if (index == 0) {
+      // The following while loop checks the value in ptr for around 4 seconds
+      while ((wall_clock64() - StrtTck) <= (6 * GpuFrq)) {
+      }
+      if (WaitEvnt == 1) {
+        *Ad1 = 1;
+      }
+    }
+  }
+#endif
+}
+
+__global__ void StreamPerThrd1_gfx11(int *A, int Pk_Clk) {
+#if HT_AMD
+  int64_t GpuFrq = (Pk_Clk * 1000);
+  int64_t StrtTck = wall_clock64();
+  // The following while loop checks the value in ptr for around 3-4 seconds
+  while ((wall_clock64() - StrtTck) <= (3 * GpuFrq)) {
+  }
+  *A = 1;
+#endif
+}
+
 __global__ void MiniKernel(int *A) {
   if (*A == 0) {
     *A = 2;  //  Fail condition
@@ -189,12 +222,18 @@ static void EventSync() {
   HIP_CHECK(hipEventCreate(&start));
   HIP_CHECK(hipEventCreate(&end));
   HIP_CHECK(hipMemcpy(Ad, Ah, NumElms * sizeof(int), hipMemcpyHostToDevice));
-  HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
   dim3 dimBlock(blockSize, 1, 1);
   dim3 dimGrid((NumElms + blockSize -1)/blockSize, 1, 1);
   HIP_CHECK(hipEventRecord(start, hipStreamPerThread));
-  StreamPerThrd<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL, NumElms,
+  if (IsGfx11()) {
+    HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeWallClockRate, 0));
+    StreamPerThrd_gfx11<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL, NumElms,
+                                                                peak_clk, 0);
+  } else {
+    HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
+    StreamPerThrd<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL, NumElms,
                                                               peak_clk, 0);
+  }
   HIP_CHECK(hipEventRecord(end, hipStreamPerThread));
   HIP_CHECK(hipEventSynchronize(end));
   HIP_CHECK(hipMemcpy(Ah, Ad, NumElms * sizeof(int), hipMemcpyDeviceToHost));
@@ -226,12 +265,18 @@ TEST_CASE("Unit_hipStreamPerThreadTst_StrmQuery") {
     Ah[i] = CONST_NUM;
   }
   HIP_CHECK(hipMemcpy(Ad, Ah, NumElms * sizeof(int), hipMemcpyHostToDevice));
-  HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
   dim3 dimBlock(blockSize, 1, 1);
   dim3 dimGrid((NumElms + blockSize -1)/blockSize, 1, 1);
   SECTION("Test working of hipStreamQuery") {
-    StreamPerThrd<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL,
+    if (IsGfx11()) {
+      HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeWallClockRate, 0));
+      StreamPerThrd_gfx11<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL,
+                          NumElms, peak_clk, 1);
+    } else {
+      HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
+      StreamPerThrd<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL,
                     NumElms, peak_clk, 1);
+    }
     err = hipStreamQuery(hipStreamPerThread);
     if (err != hipErrorNotReady) {
       WARN("hipStreamQuery on hipStreamPerThread didnt return expected error!");
@@ -245,7 +290,11 @@ TEST_CASE("Unit_hipStreamPerThreadTst_StrmQuery") {
     HIP_CHECK(hipHostMalloc(&Hptr, sizeof(int)));
     *Hptr = 0;
     HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&A_d), Hptr, 0));
-    StreamPerThrd1<<<1, 1, 0, hipStreamPerThread>>>(A_d, peak_clk);
+    if (IsGfx11()) {
+      StreamPerThrd1_gfx11<<<1, 1, 0, hipStreamPerThread>>>(A_d, peak_clk);
+    } else {
+      StreamPerThrd1<<<1, 1, 0, hipStreamPerThread>>>(A_d, peak_clk);
+    }
     HIP_CHECK(hipStreamAddCallback(hipStreamPerThread, CallBackFunctn, A_d, 0));
     HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));
     HIP_CHECK(hipHostFree(Hptr));
@@ -277,11 +326,17 @@ TEST_CASE("Unit_hipStreamPerThread_MangdMem") {
                 hipStreamPerThread));
     }
     int peak_clk;
-    HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
     dim3 dimBlock(blockSize, 1, 1);
     dim3 dimGrid((NumElms + blockSize -1)/blockSize, 1, 1);
-    StreamPerThrd<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Hmm, NULL,
+    if (IsGfx11()) {
+      HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeWallClockRate, 0));
+      StreamPerThrd_gfx11<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Hmm, NULL,
+                      NumElms, peak_clk, 0);
+    } else {
+      HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
+      StreamPerThrd<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Hmm, NULL,
                     NumElms, peak_clk, 0);
+    }
     HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));
     // Validating the result
     int MisMatch = 0;
@@ -313,11 +368,17 @@ TEST_CASE("Unit_hipStreamPerThread_ChildProc") {
       Ah[i] = CONST_NUM;
     }
     HIP_CHECK(hipMemcpy(Ad, Ah, NumElms * sizeof(int), hipMemcpyHostToDevice));
-    HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
     dim3 dimBlock(blockSize, 1, 1);
     dim3 dimGrid((NumElms + blockSize -1)/blockSize, 1, 1);
-    StreamPerThrd<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL,
+    if (IsGfx11()) {
+      HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeWallClockRate, 0));
+      StreamPerThrd_gfx11<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL,
+                      NumElms, peak_clk, 0);
+    } else{
+      HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
+      StreamPerThrd<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL,
                     NumElms, peak_clk, 0);
+    }
     HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));
     HIP_CHECK(hipMemcpy(Ah, Ad, NumElms * sizeof(int), hipMemcpyDeviceToHost));
     int MisMatch = 0;
@@ -380,13 +441,17 @@ TEST_CASE("Unit_hipStreamPerThread_StrmWaitEvt") {
   HIP_CHECK(hipMalloc(&Ad1, sizeof(int)));
   HIP_CHECK(hipMemset(Ad1, 0, sizeof(int)));
   int peak_clk;
-  HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
   dim3 dimBlock(blockSize, 1, 1);
   dim3 dimGrid((NumElms + blockSize -1)/blockSize, 1, 1);
   hipEvent_t e1;
   HIPCHECK(hipEventCreate(&e1));
-  StreamPerThrd<<<dimGrid, dimBlock, 0, Strm>>>(Ad, Ad1, NumElms,
-  peak_clk, 1, 1);
+  if (IsGfx11()) {
+    HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeWallClockRate, 0));
+    StreamPerThrd_gfx11<<<dimGrid, dimBlock, 0, Strm>>>(Ad, Ad1, NumElms, peak_clk, 1, 1);
+  } else {
+    HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
+    StreamPerThrd<<<dimGrid, dimBlock, 0, Strm>>>(Ad, Ad1, NumElms, peak_clk, 1, 1);
+  }
   HIP_CHECK(hipEventRecord(e1, Strm));
   HIP_CHECK(hipStreamWaitEvent(hipStreamPerThread, e1, 0 /*flags*/));
   MiniKernel<<<1, 1, 0, hipStreamPerThread>>>(Ad1);