SWDEV-337452 - Changing Clock64 to WallClock64 in tests for gfx11. (#78)

Change-Id: I484fe9ff7cd56c70a37a3ac5a4a55812f8557259 [ROCm/hip-tests commit: 87fac87657]
2023-01-07 04:35:21 +05:30
@@ -140,6 +140,31 @@ static void initHipCtx(hipCtx_t* pcontext) {
 #define HIP_ARRAY hipArray*
 #endif

+static inline bool IsGfx11() {
+#if defined(HT_NVIDIA)
+  return false;
+#elif defined(HT_AMD)
+  int device = -1;
+  hipDeviceProp_t props{};
+  HIP_CHECK(hipGetDevice(&device));
+  HIP_CHECK(hipGetDeviceProperties(&props, device));
+
+   // Get GCN Arch Name and compare to check if it is gfx11
+  std::string arch = std::string(props.gcnArchName);
+  auto pos = arch.find(":");
+  if (pos != std::string::npos)
+    arch = arch.substr(0, pos);
+
+  if(arch.size() >= 5)
+    arch = arch.substr(0,5);
+
+  return (arch == std::string("gfx11")) ? true : false;
+#else
+  std::cout<<"Have to be either Nvidia or AMD platform, asserting"<<std::endl;
+  assert(false);
+#endif
+}
+

 // Utility Functions
 namespace HipTest {
@@ -335,6 +360,14 @@ static __global__ void waitKernel(clock_t offset) {
  }
 }

+static __global__ void waitKernel_gfx11(clock_t offset) {
+#if HT_AMD
+  auto start = wall_clock64();
+  while ((wall_clock64() - start) < offset) {
+  }
+#endif
+}
+
 // helper function used to set the device frequency variable
 // estimates the number of clock ticks in 1 second
 static size_t findTicksPerSecond() {
@@ -350,9 +383,9 @@ static size_t findTicksPerSecond() {
  hipEvent_t start, stop;
  HIP_CHECK(hipEventCreate(&start));
  HIP_CHECK(hipEventCreate(&stop));
-
+  auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel;
  // Warmup
-  hipLaunchKernelGGL(waitKernel, dim3(1), dim3(1), 0, 0, clockTicksPerSecond);
+  hipLaunchKernelGGL(waitKernel_used, dim3(1), dim3(1), 0, 0, clockTicksPerSecond);
  HIP_CHECK(hipGetLastError());
  HIP_CHECK(hipDeviceSynchronize());

@@ -360,7 +393,7 @@ static size_t findTicksPerSecond() {
  // after 10 attempts the result is likely good enough so just accept it
  for (int attempts = 10; attempts > 0; --attempts) {
    HIP_CHECK(hipEventRecord(start));
-    hipLaunchKernelGGL(waitKernel, dim3(1), dim3(1), 0, 0, clockTicksPerSecond);
+    hipLaunchKernelGGL(waitKernel_used, dim3(1), dim3(1), 0, 0, clockTicksPerSecond);
    HIP_CHECK(hipEventRecord(stop));
    HIP_CHECK(hipGetLastError());
    HIP_CHECK(hipEventSynchronize(stop));
@@ -396,7 +429,8 @@ static inline void runKernelForDuration(std::chrono::milliseconds duration,
  // precision so that's acceptable.
  static size_t ticksPerSecond = findTicksPerSecond();
  const auto millis = duration.count();
-  hipLaunchKernelGGL(waitKernel, dim3(1), dim3(1), 0, stream, ticksPerSecond * millis / 1000);
+  auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel;
+  hipLaunchKernelGGL(waitKernel_used, dim3(1), dim3(1), 0, stream, ticksPerSecond * millis / 1000);
 }

 }  // namespace HipTest
@@ -128,7 +128,11 @@ __global__ void Iota(T* const out, size_t pitch, size_t w, size_t h, size_t d) {
 inline void LaunchDelayKernel(const std::chrono::milliseconds interval, const hipStream_t stream) {
  int ticks_per_ms = 0;
  // Clock rate is in kHz => number of clock ticks in a millisecond
-  HIP_CHECK(hipDeviceGetAttribute(&ticks_per_ms, hipDeviceAttributeClockRate, 0));
+  if (IsGfx11()) {
+    HIPCHECK(hipDeviceGetAttribute(&ticks_per_ms, hipDeviceAttributeWallClockRate, 0));
+  } else {
+    HIPCHECK(hipDeviceGetAttribute(&ticks_per_ms, hipDeviceAttributeClockRate, 0));
+  }
  Delay<<<1, 1, 0, stream>>>(interval.count(), ticks_per_ms);
  HIP_CHECK(hipGetLastError());
 }
@@ -49,6 +49,19 @@ __global__  void CoherentTst(int *ptr, int PeakClk) {
  }
 }

+__global__  void CoherentTst_gfx11(int *ptr, int PeakClk) {
+#if HT_AMD
+  // Incrementing the value by 1
+  int64_t GpuFrq = int64_t(PeakClk) * 1000;
+  int64_t StrtTck = wall_clock64();
+  atomicAdd(ptr, 1);
+  // The following while loop checks the value in ptr for around 3-4 seconds
+  while ((wall_clock64() - StrtTck) <= (3 * GpuFrq)) {
+    if (atomicCAS(ptr, 3, 4) == 3) break;
+  }
+#endif
+}
+
 __global__  void SquareKrnl(int *ptr) {
  // ptr value squared here
  *ptr = (*ptr) * (*ptr);
@@ -64,14 +77,27 @@ static void TstCoherency(int *Ptr, bool HmmMem) {
  HIP_CHECK(hipStreamCreate(&strm));
  // storing value 1 in the memory created above
  *Ptr = 1;
+
  // Getting gpu frequency
-  HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
-  if (!HmmMem) {
-    HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void **>(&Dptr), Ptr,
-                                      0));
-    CoherentTst<<<1, 1, 0, strm>>>(Dptr, peak_clk);
+  if (IsGfx11()) {
+    HIPCHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeWallClockRate, 0));
  } else {
-    CoherentTst<<<1, 1, 0, strm>>>(Ptr, peak_clk);
+    HIPCHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
+  }
+
+  if (!HmmMem) {
+    HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void **>(&Dptr), Ptr, 0));
+    if (IsGfx11()) {
+      CoherentTst_gfx11<<<1, 1, 0, strm>>>(Dptr, peak_clk);
+    } else {
+      CoherentTst<<<1, 1, 0, strm>>>(Dptr, peak_clk);
+    }
+  } else {
+    if (IsGfx11()) {
+      CoherentTst_gfx11<<<1, 1, 0, strm>>>(Ptr, peak_clk);
+    } else {
+      CoherentTst<<<1, 1, 0, strm>>>(Ptr, peak_clk);
+    }
  }
  // looping until the value is 2 for 3 seconds
  std::chrono::steady_clock::time_point start =
@@ -34,6 +34,20 @@ __global__ void waitKernel(int clockRate, int seconds) {
  }
 }

+__global__ void waitKernel_gfx11(int clockRate, int seconds) {
+#if HT_AMD
+  auto start = wall_clock64();
+  auto ms = seconds * 1000;
+  long long waitTill = clockRate * (long long)ms;
+  while (1) {
+    auto end = wall_clock64();
+    if ((end - start) > waitTill) {
+      return;
+    }
+  }
+#endif
+}
+
 TEST_CASE("Unit_hipEventQuery_DifferentDevice") {
  hipEvent_t event1{}, event2{};
  HIP_CHECK(hipEventCreate(&event1));
@@ -54,9 +68,10 @@ TEST_CASE("Unit_hipEventQuery_DifferentDevice") {
    HIP_CHECK(hipSetDevice(0));
    HIP_CHECK(hipEventRecord(event1, stream));

+    auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel;
    // Start kernel and wait for 3 seconds
    // Make sure you increase this time if you add more tests here
-    waitKernel<<<1, 1, 0, stream>>>(clockRate, 3);
+    waitKernel_used<<<1, 1, 0, stream>>>(clockRate, 3);

    HIP_CHECK(hipEventRecord(event2, stream));

@@ -67,6 +67,16 @@ static __global__ void sqr_ker_func(int* a, int* b, int clockrate) {
  do { cur = clock64()/clockrate - start;}while (cur < wait_t);
 }

+static __global__ void sqr_ker_func_gfx11(int* a, int* b, int clockrate) {
+#if HT_AMD
+  int tx = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x;
+  if (tx < LEN) b[tx] = a[tx]*a[tx];
+  uint64_t wait_t = DELAY_IN_MS,
+  start = wall_clock64()/clockrate, cur;
+  do { cur = wall_clock64()/clockrate - start;}while (cur < wait_t);
+#endif
+}
+
 /**
 * Scenario 1: Test to validate setting different events in executable graph.
 */
@@ -106,10 +116,15 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") {
                inp_h, memsize, hipMemcpyHostToDevice));
  // Get device clock rate
  int clkRate = 0;
-  HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+  if (IsGfx11()) {
+    HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
+  } else {
+    HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+  }
  // kernel1
+  auto sqr_ker_func_used = IsGfx11() ? sqr_ker_func_gfx11 : sqr_ker_func;
  void* kernelArgs[] = {&inp_d, &out_d, reinterpret_cast<void *>(&clkRate)};
-  kernelNodeParams1.func = reinterpret_cast<void *>(sqr_ker_func);
+  kernelNodeParams1.func = reinterpret_cast<void *>(sqr_ker_func_used);
  kernelNodeParams1.gridDim = dim3(GRID_DIM);
  kernelNodeParams1.blockDim = dim3(BLK_DIM);
  kernelNodeParams1.sharedMemBytes = 0;
@@ -123,6 +123,21 @@ __global__ void kernel500ms(float* hostRes, int clkRate) {
  }
 }

+__global__ void kernel500ms_gfx11(float* hostRes, int clkRate) {
+#if HT_AMD
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  hostRes[tid] = tid + 1;
+  __threadfence_system();
+  // expecting that the data is getting flushed to host here!
+  uint64_t start = wall_clock64()/clkRate, cur;
+  if (clkRate > 1) {
+    do { cur = wall_clock64()/clkRate-start;}while (cur < wait_ms);
+  } else {
+    do { cur = wall_clock64()/start;}while (cur < wait_ms);
+  }
+#endif
+}
+
 TEST_CASE("Unit_hipMemPoolApi_BasicAlloc") {
  int mem_pool_support = 0;
  HIP_CHECK(hipDeviceGetAttribute(&mem_pool_support, hipDeviceAttributeMemoryPoolsSupported, 0));
@@ -147,9 +162,14 @@ TEST_CASE("Unit_hipMemPoolApi_BasicAlloc") {
  int blocks = 1024;
  int clkRate;
  hipMemPoolAttr attr;
-  HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+  if (IsGfx11()) {
+    HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
+    kernel500ms_gfx11<<<32, blocks, 0, stream>>>(B, clkRate);
+  } else {
+    HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));

-  kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
+    kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
+  }

  HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(B), stream));

@@ -229,9 +249,14 @@ TEST_CASE("Unit_hipMemPoolApi_BasicTrim") {

  int blocks = 2;
  int clkRate;
-  HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+  if (IsGfx11()) {
+    HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
+    kernel500ms_gfx11<<<32, blocks, 0, stream>>>(B, clkRate);
+  } else {
+    HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));

-  kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
+    kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
+  }

  hipMemPoolAttr attr;
  attr = hipMemPoolAttrReleaseThreshold;
@@ -312,9 +337,15 @@ TEST_CASE("Unit_hipMemPoolApi_BasicReuse") {

  int blocks = 2;
  int clkRate;
-  HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));

-  kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
+  if (IsGfx11()) {
+    HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
+    kernel500ms_gfx11<<<32, blocks, 0, stream>>>(A, clkRate);
+  } else {
+    HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+
+    kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
+  }

  hipMemPoolAttr attr;
  // Not a real free, since kernel isn't done
@@ -329,7 +360,11 @@ TEST_CASE("Unit_hipMemPoolApi_BasicReuse") {
  HIP_CHECK(hipStreamSynchronize(stream));

  // Second kernel launch with new memory
-  kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
+  if (IsGfx11()) {
+    kernel500ms_gfx11<<<32, blocks, 0, stream>>>(B, clkRate);
+  } else {
+    kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
+  }

  HIP_CHECK(hipStreamSynchronize(stream));

@@ -369,7 +404,11 @@ TEST_CASE("Unit_hipMemPoolApi_Opportunistic") {
  hipMemPoolAttr attr;
  int blocks = 2;
  int clkRate;
-  HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+   if (IsGfx11()) {
+    HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
+  } else {
+    HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+  }

  float *A, *B, *C;
  hipStream_t stream, stream2;
@@ -395,7 +434,11 @@ TEST_CASE("Unit_hipMemPoolApi_Opportunistic") {
    HIP_CHECK(hipMemPoolSetAttribute(mem_pool, attr, &value));

    // Run kernel for 500 ms in the first stream
-    kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
+    if (IsGfx11()) {
+      kernel500ms_gfx11<<<32, blocks, 0, stream>>>(A, clkRate);
+    } else {
+      kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
+    }

    // Not a real free, since kernel isn't done
    HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(A), stream));
@@ -410,7 +453,11 @@ TEST_CASE("Unit_hipMemPoolApi_Opportunistic") {
    REQUIRE(A != B);

    // Run kernel with the new memory in the second stream
-    kernel500ms<<<32, blocks, 0, stream2>>>(B, clkRate);
+    if (IsGfx11()) {
+      kernel500ms_gfx11<<<32, blocks, 0, stream>>>(B, clkRate);
+    } else {
+      kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
+    }

    HIP_CHECK(hipStreamSynchronize(stream));
    HIP_CHECK(hipStreamSynchronize(stream2));
@@ -428,7 +475,13 @@ TEST_CASE("Unit_hipMemPoolApi_Opportunistic") {
    HIP_CHECK(hipMemPoolSetAttribute(mem_pool, attr, &value));

    // Run kernel for 500 ms in the first stream
-    kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
+    if (IsGfx11()) {
+      HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
+      kernel500ms_gfx11<<<32, blocks, 0, stream>>>(A, clkRate);
+    } else {
+      HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+      kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
+    }

    // Not a real free, since kernel isn't done
    HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(A), stream));
@@ -443,7 +496,11 @@ TEST_CASE("Unit_hipMemPoolApi_Opportunistic") {
    REQUIRE(A == B);

    // Run kernel with the new memory in the second stream
-    kernel500ms<<<32, blocks, 0, stream2>>>(B, clkRate);
+    if (IsGfx11()) {
+      kernel500ms_gfx11<<<32, blocks, 0, stream>>>(B, clkRate);
+    } else {
+      kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
+    }

    HIP_CHECK(hipStreamSynchronize(stream));
    HIP_CHECK(hipStreamSynchronize(stream2));
@@ -461,7 +518,12 @@ TEST_CASE("Unit_hipMemPoolApi_Opportunistic") {
    HIP_CHECK(hipMemPoolSetAttribute(mem_pool, attr, &value));

    // Run kernel for 500 ms in the first stream
-    kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
+      
+    if (IsGfx11()) {
+      kernel500ms_gfx11<<<32, blocks, 0, stream>>>(A, clkRate);
+    } else {
+      kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
+    }

    // Not a real free, since kernel isn't done
    HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(A), stream));
@@ -473,7 +535,11 @@ TEST_CASE("Unit_hipMemPoolApi_Opportunistic") {
    REQUIRE(A != B);

    // Run kernel with the new memory in the second stream
-    kernel500ms<<<32, blocks, 0, stream2>>>(B, clkRate);
+    if (IsGfx11()) {
+      kernel500ms_gfx11<<<32, blocks, 0, stream>>>(B, clkRate);
+    } else {
+      kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
+    }

    HIP_CHECK(hipStreamSynchronize(stream));
    HIP_CHECK(hipStreamSynchronize(stream2));
@@ -510,9 +576,15 @@ TEST_CASE("Unit_hipMemPoolApi_Default") {

  int blocks = 2;
  int clkRate;
-  HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+    
+  if (IsGfx11()) {
+    HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
+    kernel500ms_gfx11<<<32, blocks, 0, stream>>>(A, clkRate);
+  } else {
+    HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));

-  kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
+    kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
+  }

  hipMemPoolAttr attr;
  // Not a real free, since kernel isn't done
@@ -527,7 +599,11 @@ TEST_CASE("Unit_hipMemPoolApi_Default") {
  HIP_CHECK(hipStreamSynchronize(stream));

  // Second kernel launch with new memory
-  kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
+  if (IsGfx11()) {
+    kernel500ms_gfx11<<<32, blocks, 0, stream>>>(B, clkRate);
+  } else {
+    kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
+  }

  HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(B), stream));

@@ -41,6 +41,21 @@ __global__ void Kernel(float* hostRes, int clkRate) {
  }
 }

+__global__ void Kernel_gfx11(float* hostRes, int clkRate) {
+#if HT_AMD
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  hostRes[tid] = tid + 1;
+  __threadfence_system();
+  // expecting that the data is getting flushed to host here!
+  uint64_t start = wall_clock64()/clkRate, cur;
+  if (clkRate > 1) {
+    do { cur = wall_clock64()/clkRate-start;}while (cur < wait_sec);
+  } else {
+    do { cur = wall_clock64()/start;}while (cur < wait_sec);
+  }
+#endif
+}
+
 TEST_CASE("Unit_hipHostMalloc_CoherentAccess") {
  int blocks = 2;
  float* hostRes;
@@ -49,9 +64,14 @@ TEST_CASE("Unit_hipHostMalloc_CoherentAccess") {
  hostRes[0] = 0;
  hostRes[1] = 0;
  int clkRate;
-  HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+  if (IsGfx11()) {
+    HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
+  } else {
+    HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
+  }
  std::cout << clkRate << std::endl;
-  hipLaunchKernelGGL(HIP_KERNEL_NAME(Kernel), dim3(1), dim3(blocks),
+  auto Kernel_used = IsGfx11() ? Kernel_gfx11 : Kernel;
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(Kernel_used), dim3(1), dim3(blocks),
                     0, 0, hostRes, clkRate);
  HIP_CHECK(hipGetLastError());
  int eleCounter = 0;
@@ -53,6 +53,24 @@ static __global__ void device_function(float* C_d, float* A_d, size_t Num) {
  }
 }

+static __global__ void device_function_gfx11(float* C_d, float* A_d, size_t Num) {
+#if HT_AMD
+  size_t gputhread = (blockIdx.x * blockDim.x + threadIdx.x);
+  size_t stride = blockDim.x * gridDim.x;
+
+  for (size_t i = gputhread; i < Num; i += stride) {
+    C_d[i] = A_d[i] * A_d[i];
+  }
+
+  // Delay thread 1 only in the GPU
+  if (gputhread == 1) {
+    uint64_t wait_t = 3200000000, start = wall_clock64(), cur;
+    do {
+      cur = wall_clock64() - start;
+    } while (cur < wait_t);
+  }
+#endif
+}

 static void HIPRT_CB Thread1_Callback(hipStream_t stream, hipError_t status,
                                      void* userData) {
@@ -128,7 +146,8 @@ TEST_CASE("Unit_hipStreamAddCallback_MultipleThreads") {
  constexpr unsigned threadsPerBlock = 256;
  constexpr unsigned blocks = (N + 255)/threadsPerBlock;

-  hipLaunchKernelGGL((device_function), dim3(blocks),
+  auto device_function_used = IsGfx11() ? device_function_gfx11 : device_function;
+  hipLaunchKernelGGL((device_function_used), dim3(blocks),
                     dim3(threadsPerBlock), 0,
                     mystream, C_d, A_d, N);
  HIP_CHECK(hipGetLastError());
@@ -94,6 +94,20 @@ __global__ void waitKernel(int clockRate, int seconds) {
  }
 }

+__global__ void waitKernel_gfx11(int clockRate, int seconds) {
+#if HT_AMD
+  auto start = wall_clock64();
+  auto ms = seconds * 1000;
+  long long waitTill = clockRate * (long long)ms;
+  while (1) {
+    auto end = wall_clock64();
+    if ((end - start) > waitTill) {
+      return;
+    }
+  }
+#endif
+}
+
 TEST_CASE("Unit_hipStreamWaitEvent_Default") {
  hipStream_t stream{nullptr};
  hipEvent_t waitEvent{nullptr};
@@ -111,7 +125,8 @@ TEST_CASE("Unit_hipStreamWaitEvent_Default") {
  HIP_CHECK(hipGetDeviceProperties(&prop, deviceId));
  auto clockRate = prop.clockRate;

-  waitKernel<<<1, 1, 0, stream>>>(clockRate, 2);  // Wait for 2 seconds
+  auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel;
+  waitKernel_used<<<1, 1, 0, stream>>>(clockRate, 2);  // Wait for 2 seconds

  HIP_CHECK(hipEventRecord(waitEvent, stream));

@@ -145,8 +160,8 @@ TEST_CASE("Unit_hipStreamWaitEvent_DifferentStreams") {
  hipDeviceProp_t prop{};
  HIP_CHECK(hipGetDeviceProperties(&prop, deviceId));
  auto clockRate = prop.clockRate;
-
-  waitKernel<<<1, 1, 0, blockedStreamA>>>(clockRate,
+  auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel;
+  waitKernel_used<<<1, 1, 0, blockedStreamA>>>(clockRate,
                                          3);  // wait for 3 seconds
  HIP_CHECK(hipEventRecord(waitEvent, blockedStreamA));

@@ -155,7 +170,7 @@ TEST_CASE("Unit_hipStreamWaitEvent_DifferentStreams") {

  HIP_CHECK(hipStreamWaitEvent(streamBlockedOnStreamA, waitEvent, 0));

-  waitKernel<<<1, 1, 0, streamBlockedOnStreamA>>>(clockRate, 2);  // Wait for 2 seconds
+  waitKernel_used<<<1, 1, 0, streamBlockedOnStreamA>>>(clockRate, 2);  // Wait for 2 seconds

  HIP_CHECK(hipStreamSynchronize(unblockingStream));

@@ -95,6 +95,39 @@ __global__ void StreamPerThrd1(int *A, int Pk_Clk) {
  *A = 1;
 }

+__global__ void StreamPerThrd_gfx11(int *Ad, int *Ad1, size_t n, int Pk_Clk,
+                              int Wait, int WaitEvnt  = 0) {
+#if HT_AMD
+  size_t index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < n) {
+    Ad[index] = Ad[index] + 10;
+  }
+  if (Wait) {
+    int64_t GpuFrq = (Pk_Clk * 1000);
+    int64_t StrtTck = wall_clock64();
+    if (index == 0) {
+      // The following while loop checks the value in ptr for around 4 seconds
+      while ((wall_clock64() - StrtTck) <= (6 * GpuFrq)) {
+      }
+      if (WaitEvnt == 1) {
+        *Ad1 = 1;
+      }
+    }
+  }
+#endif
+}
+
+__global__ void StreamPerThrd1_gfx11(int *A, int Pk_Clk) {
+#if HT_AMD
+  int64_t GpuFrq = (Pk_Clk * 1000);
+  int64_t StrtTck = wall_clock64();
+  // The following while loop checks the value in ptr for around 3-4 seconds
+  while ((wall_clock64() - StrtTck) <= (3 * GpuFrq)) {
+  }
+  *A = 1;
+#endif
+}
+
 __global__ void MiniKernel(int *A) {
  if (*A == 0) {
    *A = 2;  //  Fail condition
@@ -189,12 +222,18 @@ static void EventSync() {
  HIP_CHECK(hipEventCreate(&start));
  HIP_CHECK(hipEventCreate(&end));
  HIP_CHECK(hipMemcpy(Ad, Ah, NumElms * sizeof(int), hipMemcpyHostToDevice));
-  HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
  dim3 dimBlock(blockSize, 1, 1);
  dim3 dimGrid((NumElms + blockSize -1)/blockSize, 1, 1);
  HIP_CHECK(hipEventRecord(start, hipStreamPerThread));
-  StreamPerThrd<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL, NumElms,
+  if (IsGfx11()) {
+    HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeWallClockRate, 0));
+    StreamPerThrd_gfx11<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL, NumElms,
+                                                                peak_clk, 0);
+  } else {
+    HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
+    StreamPerThrd<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL, NumElms,
                                                              peak_clk, 0);
+  }
  HIP_CHECK(hipEventRecord(end, hipStreamPerThread));
  HIP_CHECK(hipEventSynchronize(end));
  HIP_CHECK(hipMemcpy(Ah, Ad, NumElms * sizeof(int), hipMemcpyDeviceToHost));
@@ -226,12 +265,18 @@ TEST_CASE("Unit_hipStreamPerThreadTst_StrmQuery") {
    Ah[i] = CONST_NUM;
  }
  HIP_CHECK(hipMemcpy(Ad, Ah, NumElms * sizeof(int), hipMemcpyHostToDevice));
-  HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
  dim3 dimBlock(blockSize, 1, 1);
  dim3 dimGrid((NumElms + blockSize -1)/blockSize, 1, 1);
  SECTION("Test working of hipStreamQuery") {
-    StreamPerThrd<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL,
+    if (IsGfx11()) {
+      HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeWallClockRate, 0));
+      StreamPerThrd_gfx11<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL,
+                          NumElms, peak_clk, 1);
+    } else {
+      HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
+      StreamPerThrd<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL,
                    NumElms, peak_clk, 1);
+    }
    err = hipStreamQuery(hipStreamPerThread);
    if (err != hipErrorNotReady) {
      WARN("hipStreamQuery on hipStreamPerThread didnt return expected error!");
@@ -245,7 +290,11 @@ TEST_CASE("Unit_hipStreamPerThreadTst_StrmQuery") {
    HIP_CHECK(hipHostMalloc(&Hptr, sizeof(int)));
    *Hptr = 0;
    HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&A_d), Hptr, 0));
-    StreamPerThrd1<<<1, 1, 0, hipStreamPerThread>>>(A_d, peak_clk);
+    if (IsGfx11()) {
+      StreamPerThrd1_gfx11<<<1, 1, 0, hipStreamPerThread>>>(A_d, peak_clk);
+    } else {
+      StreamPerThrd1<<<1, 1, 0, hipStreamPerThread>>>(A_d, peak_clk);
+    }
    HIP_CHECK(hipStreamAddCallback(hipStreamPerThread, CallBackFunctn, A_d, 0));
    HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));
    HIP_CHECK(hipHostFree(Hptr));
@@ -277,11 +326,17 @@ TEST_CASE("Unit_hipStreamPerThread_MangdMem") {
                hipStreamPerThread));
    }
    int peak_clk;
-    HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
    dim3 dimBlock(blockSize, 1, 1);
    dim3 dimGrid((NumElms + blockSize -1)/blockSize, 1, 1);
-    StreamPerThrd<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Hmm, NULL,
+    if (IsGfx11()) {
+      HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeWallClockRate, 0));
+      StreamPerThrd_gfx11<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Hmm, NULL,
+                      NumElms, peak_clk, 0);
+    } else {
+      HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
+      StreamPerThrd<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Hmm, NULL,
                    NumElms, peak_clk, 0);
+    }
    HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));
    // Validating the result
    int MisMatch = 0;
@@ -313,11 +368,17 @@ TEST_CASE("Unit_hipStreamPerThread_ChildProc") {
      Ah[i] = CONST_NUM;
    }
    HIP_CHECK(hipMemcpy(Ad, Ah, NumElms * sizeof(int), hipMemcpyHostToDevice));
-    HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
    dim3 dimBlock(blockSize, 1, 1);
    dim3 dimGrid((NumElms + blockSize -1)/blockSize, 1, 1);
-    StreamPerThrd<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL,
+    if (IsGfx11()) {
+      HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeWallClockRate, 0));
+      StreamPerThrd_gfx11<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL,
+                      NumElms, peak_clk, 0);
+    } else{
+      HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
+      StreamPerThrd<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL,
                    NumElms, peak_clk, 0);
+    }
    HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));
    HIP_CHECK(hipMemcpy(Ah, Ad, NumElms * sizeof(int), hipMemcpyDeviceToHost));
    int MisMatch = 0;
@@ -380,13 +441,17 @@ TEST_CASE("Unit_hipStreamPerThread_StrmWaitEvt") {
  HIP_CHECK(hipMalloc(&Ad1, sizeof(int)));
  HIP_CHECK(hipMemset(Ad1, 0, sizeof(int)));
  int peak_clk;
-  HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
  dim3 dimBlock(blockSize, 1, 1);
  dim3 dimGrid((NumElms + blockSize -1)/blockSize, 1, 1);
  hipEvent_t e1;
  HIPCHECK(hipEventCreate(&e1));
-  StreamPerThrd<<<dimGrid, dimBlock, 0, Strm>>>(Ad, Ad1, NumElms,
-  peak_clk, 1, 1);
+  if (IsGfx11()) {
+    HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeWallClockRate, 0));
+    StreamPerThrd_gfx11<<<dimGrid, dimBlock, 0, Strm>>>(Ad, Ad1, NumElms, peak_clk, 1, 1);
+  } else {
+    HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
+    StreamPerThrd<<<dimGrid, dimBlock, 0, Strm>>>(Ad, Ad1, NumElms, peak_clk, 1, 1);
+  }
  HIP_CHECK(hipEventRecord(e1, Strm));
  HIP_CHECK(hipStreamWaitEvent(hipStreamPerThread, e1, 0 /*flags*/));
  MiniKernel<<<1, 1, 0, hipStreamPerThread>>>(Ad1);