From d8b7cb28ff91e930fea1effd1003285eec326c9a Mon Sep 17 00:00:00 2001 From: ROCm CI Service Account <66695075+rocm-ci@users.noreply.github.com> Date: Sat, 7 Jan 2023 04:35:21 +0530 Subject: [PATCH] SWDEV-337452 - Changing Clock64 to WallClock64 in tests for gfx11. (#78) Change-Id: I484fe9ff7cd56c70a37a3ac5a4a55812f8557259 [ROCm/hip-tests commit: 87fac87657e661f44ddf5718ffbdaf61a452f21e] --- .../catch/include/hip_test_common.hh | 42 ++++++- projects/hip-tests/catch/include/utils.hh | 6 +- .../multiproc/hipMemCoherencyTstMProc.cc | 38 +++++- .../catch/unit/event/Unit_hipEventQuery.cc | 17 ++- .../hipGraphExecEventWaitNodeSetEvent.cc | 19 ++- .../catch/unit/memory/hipMemPoolApi.cc | 110 +++++++++++++++--- .../unit/memory/hipMemoryAllocateCoherent.cc | 24 +++- .../unit/stream/hipStreamACb_MultiThread.cc | 21 +++- .../catch/unit/stream/hipStreamWaitEvent.cc | 23 +++- .../streamperthread/hipStreamPerThrdTsts.cc | 89 ++++++++++++-- 10 files changed, 339 insertions(+), 50 deletions(-) diff --git a/projects/hip-tests/catch/include/hip_test_common.hh b/projects/hip-tests/catch/include/hip_test_common.hh index 5f7b197596..a7be3baca9 100644 --- a/projects/hip-tests/catch/include/hip_test_common.hh +++ b/projects/hip-tests/catch/include/hip_test_common.hh @@ -140,6 +140,31 @@ static void initHipCtx(hipCtx_t* pcontext) { #define HIP_ARRAY hipArray* #endif +static inline bool IsGfx11() { +#if defined(HT_NVIDIA) + return false; +#elif defined(HT_AMD) + int device = -1; + hipDeviceProp_t props{}; + HIP_CHECK(hipGetDevice(&device)); + HIP_CHECK(hipGetDeviceProperties(&props, device)); + + // Get GCN Arch Name and compare to check if it is gfx11 + std::string arch = std::string(props.gcnArchName); + auto pos = arch.find(":"); + if (pos != std::string::npos) + arch = arch.substr(0, pos); + + if(arch.size() >= 5) + arch = arch.substr(0,5); + + return (arch == std::string("gfx11")) ? true : false; +#else + std::cout<<"Have to be either Nvidia or AMD platform, asserting"< 0; --attempts) { HIP_CHECK(hipEventRecord(start)); - hipLaunchKernelGGL(waitKernel, dim3(1), dim3(1), 0, 0, clockTicksPerSecond); + hipLaunchKernelGGL(waitKernel_used, dim3(1), dim3(1), 0, 0, clockTicksPerSecond); HIP_CHECK(hipEventRecord(stop)); HIP_CHECK(hipGetLastError()); HIP_CHECK(hipEventSynchronize(stop)); @@ -396,7 +429,8 @@ static inline void runKernelForDuration(std::chrono::milliseconds duration, // precision so that's acceptable. static size_t ticksPerSecond = findTicksPerSecond(); const auto millis = duration.count(); - hipLaunchKernelGGL(waitKernel, dim3(1), dim3(1), 0, stream, ticksPerSecond * millis / 1000); + auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel; + hipLaunchKernelGGL(waitKernel_used, dim3(1), dim3(1), 0, stream, ticksPerSecond * millis / 1000); } } // namespace HipTest diff --git a/projects/hip-tests/catch/include/utils.hh b/projects/hip-tests/catch/include/utils.hh index f3d6debb26..bb7fce06e4 100644 --- a/projects/hip-tests/catch/include/utils.hh +++ b/projects/hip-tests/catch/include/utils.hh @@ -128,7 +128,11 @@ __global__ void Iota(T* const out, size_t pitch, size_t w, size_t h, size_t d) { inline void LaunchDelayKernel(const std::chrono::milliseconds interval, const hipStream_t stream) { int ticks_per_ms = 0; // Clock rate is in kHz => number of clock ticks in a millisecond - HIP_CHECK(hipDeviceGetAttribute(&ticks_per_ms, hipDeviceAttributeClockRate, 0)); + if (IsGfx11()) { + HIPCHECK(hipDeviceGetAttribute(&ticks_per_ms, hipDeviceAttributeWallClockRate, 0)); + } else { + HIPCHECK(hipDeviceGetAttribute(&ticks_per_ms, hipDeviceAttributeClockRate, 0)); + } Delay<<<1, 1, 0, stream>>>(interval.count(), ticks_per_ms); HIP_CHECK(hipGetLastError()); } diff --git a/projects/hip-tests/catch/multiproc/hipMemCoherencyTstMProc.cc b/projects/hip-tests/catch/multiproc/hipMemCoherencyTstMProc.cc index 863dfc7178..8579aabc27 100644 --- a/projects/hip-tests/catch/multiproc/hipMemCoherencyTstMProc.cc +++ b/projects/hip-tests/catch/multiproc/hipMemCoherencyTstMProc.cc @@ -49,6 +49,19 @@ __global__ void CoherentTst(int *ptr, int PeakClk) { } } +__global__ void CoherentTst_gfx11(int *ptr, int PeakClk) { +#if HT_AMD + // Incrementing the value by 1 + int64_t GpuFrq = int64_t(PeakClk) * 1000; + int64_t StrtTck = wall_clock64(); + atomicAdd(ptr, 1); + // The following while loop checks the value in ptr for around 3-4 seconds + while ((wall_clock64() - StrtTck) <= (3 * GpuFrq)) { + if (atomicCAS(ptr, 3, 4) == 3) break; + } +#endif +} + __global__ void SquareKrnl(int *ptr) { // ptr value squared here *ptr = (*ptr) * (*ptr); @@ -64,14 +77,27 @@ static void TstCoherency(int *Ptr, bool HmmMem) { HIP_CHECK(hipStreamCreate(&strm)); // storing value 1 in the memory created above *Ptr = 1; + // Getting gpu frequency - HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0)); - if (!HmmMem) { - HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast(&Dptr), Ptr, - 0)); - CoherentTst<<<1, 1, 0, strm>>>(Dptr, peak_clk); + if (IsGfx11()) { + HIPCHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeWallClockRate, 0)); } else { - CoherentTst<<<1, 1, 0, strm>>>(Ptr, peak_clk); + HIPCHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0)); + } + + if (!HmmMem) { + HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast(&Dptr), Ptr, 0)); + if (IsGfx11()) { + CoherentTst_gfx11<<<1, 1, 0, strm>>>(Dptr, peak_clk); + } else { + CoherentTst<<<1, 1, 0, strm>>>(Dptr, peak_clk); + } + } else { + if (IsGfx11()) { + CoherentTst_gfx11<<<1, 1, 0, strm>>>(Ptr, peak_clk); + } else { + CoherentTst<<<1, 1, 0, strm>>>(Ptr, peak_clk); + } } // looping until the value is 2 for 3 seconds std::chrono::steady_clock::time_point start = diff --git a/projects/hip-tests/catch/unit/event/Unit_hipEventQuery.cc b/projects/hip-tests/catch/unit/event/Unit_hipEventQuery.cc index d355a515e5..47d8995c18 100644 --- a/projects/hip-tests/catch/unit/event/Unit_hipEventQuery.cc +++ b/projects/hip-tests/catch/unit/event/Unit_hipEventQuery.cc @@ -34,6 +34,20 @@ __global__ void waitKernel(int clockRate, int seconds) { } } +__global__ void waitKernel_gfx11(int clockRate, int seconds) { +#if HT_AMD + auto start = wall_clock64(); + auto ms = seconds * 1000; + long long waitTill = clockRate * (long long)ms; + while (1) { + auto end = wall_clock64(); + if ((end - start) > waitTill) { + return; + } + } +#endif +} + TEST_CASE("Unit_hipEventQuery_DifferentDevice") { hipEvent_t event1{}, event2{}; HIP_CHECK(hipEventCreate(&event1)); @@ -54,9 +68,10 @@ TEST_CASE("Unit_hipEventQuery_DifferentDevice") { HIP_CHECK(hipSetDevice(0)); HIP_CHECK(hipEventRecord(event1, stream)); + auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel; // Start kernel and wait for 3 seconds // Make sure you increase this time if you add more tests here - waitKernel<<<1, 1, 0, stream>>>(clockRate, 3); + waitKernel_used<<<1, 1, 0, stream>>>(clockRate, 3); HIP_CHECK(hipEventRecord(event2, stream)); diff --git a/projects/hip-tests/catch/unit/graph/hipGraphExecEventWaitNodeSetEvent.cc b/projects/hip-tests/catch/unit/graph/hipGraphExecEventWaitNodeSetEvent.cc index 5a0d20f541..f8ee63d6f8 100644 --- a/projects/hip-tests/catch/unit/graph/hipGraphExecEventWaitNodeSetEvent.cc +++ b/projects/hip-tests/catch/unit/graph/hipGraphExecEventWaitNodeSetEvent.cc @@ -67,6 +67,16 @@ static __global__ void sqr_ker_func(int* a, int* b, int clockrate) { do { cur = clock64()/clockrate - start;}while (cur < wait_t); } +static __global__ void sqr_ker_func_gfx11(int* a, int* b, int clockrate) { +#if HT_AMD + int tx = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x; + if (tx < LEN) b[tx] = a[tx]*a[tx]; + uint64_t wait_t = DELAY_IN_MS, + start = wall_clock64()/clockrate, cur; + do { cur = wall_clock64()/clockrate - start;}while (cur < wait_t); +#endif +} + /** * Scenario 1: Test to validate setting different events in executable graph. */ @@ -106,10 +116,15 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") { inp_h, memsize, hipMemcpyHostToDevice)); // Get device clock rate int clkRate = 0; - HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0)); + if (IsGfx11()) { + HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0)); + } else { + HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0)); + } // kernel1 + auto sqr_ker_func_used = IsGfx11() ? sqr_ker_func_gfx11 : sqr_ker_func; void* kernelArgs[] = {&inp_d, &out_d, reinterpret_cast(&clkRate)}; - kernelNodeParams1.func = reinterpret_cast(sqr_ker_func); + kernelNodeParams1.func = reinterpret_cast(sqr_ker_func_used); kernelNodeParams1.gridDim = dim3(GRID_DIM); kernelNodeParams1.blockDim = dim3(BLK_DIM); kernelNodeParams1.sharedMemBytes = 0; diff --git a/projects/hip-tests/catch/unit/memory/hipMemPoolApi.cc b/projects/hip-tests/catch/unit/memory/hipMemPoolApi.cc index b09d2432f4..927899f1b7 100644 --- a/projects/hip-tests/catch/unit/memory/hipMemPoolApi.cc +++ b/projects/hip-tests/catch/unit/memory/hipMemPoolApi.cc @@ -123,6 +123,21 @@ __global__ void kernel500ms(float* hostRes, int clkRate) { } } +__global__ void kernel500ms_gfx11(float* hostRes, int clkRate) { +#if HT_AMD + int tid = threadIdx.x + blockIdx.x * blockDim.x; + hostRes[tid] = tid + 1; + __threadfence_system(); + // expecting that the data is getting flushed to host here! + uint64_t start = wall_clock64()/clkRate, cur; + if (clkRate > 1) { + do { cur = wall_clock64()/clkRate-start;}while (cur < wait_ms); + } else { + do { cur = wall_clock64()/start;}while (cur < wait_ms); + } +#endif +} + TEST_CASE("Unit_hipMemPoolApi_BasicAlloc") { int mem_pool_support = 0; HIP_CHECK(hipDeviceGetAttribute(&mem_pool_support, hipDeviceAttributeMemoryPoolsSupported, 0)); @@ -147,9 +162,14 @@ TEST_CASE("Unit_hipMemPoolApi_BasicAlloc") { int blocks = 1024; int clkRate; hipMemPoolAttr attr; - HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0)); + if (IsGfx11()) { + HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0)); + kernel500ms_gfx11<<<32, blocks, 0, stream>>>(B, clkRate); + } else { + HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0)); - kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate); + kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate); + } HIP_CHECK(hipFreeAsync(reinterpret_cast(B), stream)); @@ -229,9 +249,14 @@ TEST_CASE("Unit_hipMemPoolApi_BasicTrim") { int blocks = 2; int clkRate; - HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0)); + if (IsGfx11()) { + HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0)); + kernel500ms_gfx11<<<32, blocks, 0, stream>>>(B, clkRate); + } else { + HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0)); - kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate); + kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate); + } hipMemPoolAttr attr; attr = hipMemPoolAttrReleaseThreshold; @@ -312,9 +337,15 @@ TEST_CASE("Unit_hipMemPoolApi_BasicReuse") { int blocks = 2; int clkRate; - HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0)); - kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate); + if (IsGfx11()) { + HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0)); + kernel500ms_gfx11<<<32, blocks, 0, stream>>>(A, clkRate); + } else { + HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0)); + + kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate); + } hipMemPoolAttr attr; // Not a real free, since kernel isn't done @@ -329,7 +360,11 @@ TEST_CASE("Unit_hipMemPoolApi_BasicReuse") { HIP_CHECK(hipStreamSynchronize(stream)); // Second kernel launch with new memory - kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate); + if (IsGfx11()) { + kernel500ms_gfx11<<<32, blocks, 0, stream>>>(B, clkRate); + } else { + kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate); + } HIP_CHECK(hipStreamSynchronize(stream)); @@ -369,7 +404,11 @@ TEST_CASE("Unit_hipMemPoolApi_Opportunistic") { hipMemPoolAttr attr; int blocks = 2; int clkRate; - HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0)); + if (IsGfx11()) { + HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0)); + } else { + HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0)); + } float *A, *B, *C; hipStream_t stream, stream2; @@ -395,7 +434,11 @@ TEST_CASE("Unit_hipMemPoolApi_Opportunistic") { HIP_CHECK(hipMemPoolSetAttribute(mem_pool, attr, &value)); // Run kernel for 500 ms in the first stream - kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate); + if (IsGfx11()) { + kernel500ms_gfx11<<<32, blocks, 0, stream>>>(A, clkRate); + } else { + kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate); + } // Not a real free, since kernel isn't done HIP_CHECK(hipFreeAsync(reinterpret_cast(A), stream)); @@ -410,7 +453,11 @@ TEST_CASE("Unit_hipMemPoolApi_Opportunistic") { REQUIRE(A != B); // Run kernel with the new memory in the second stream - kernel500ms<<<32, blocks, 0, stream2>>>(B, clkRate); + if (IsGfx11()) { + kernel500ms_gfx11<<<32, blocks, 0, stream>>>(B, clkRate); + } else { + kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate); + } HIP_CHECK(hipStreamSynchronize(stream)); HIP_CHECK(hipStreamSynchronize(stream2)); @@ -428,7 +475,13 @@ TEST_CASE("Unit_hipMemPoolApi_Opportunistic") { HIP_CHECK(hipMemPoolSetAttribute(mem_pool, attr, &value)); // Run kernel for 500 ms in the first stream - kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate); + if (IsGfx11()) { + HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0)); + kernel500ms_gfx11<<<32, blocks, 0, stream>>>(A, clkRate); + } else { + HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0)); + kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate); + } // Not a real free, since kernel isn't done HIP_CHECK(hipFreeAsync(reinterpret_cast(A), stream)); @@ -443,7 +496,11 @@ TEST_CASE("Unit_hipMemPoolApi_Opportunistic") { REQUIRE(A == B); // Run kernel with the new memory in the second stream - kernel500ms<<<32, blocks, 0, stream2>>>(B, clkRate); + if (IsGfx11()) { + kernel500ms_gfx11<<<32, blocks, 0, stream>>>(B, clkRate); + } else { + kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate); + } HIP_CHECK(hipStreamSynchronize(stream)); HIP_CHECK(hipStreamSynchronize(stream2)); @@ -461,7 +518,12 @@ TEST_CASE("Unit_hipMemPoolApi_Opportunistic") { HIP_CHECK(hipMemPoolSetAttribute(mem_pool, attr, &value)); // Run kernel for 500 ms in the first stream - kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate); + + if (IsGfx11()) { + kernel500ms_gfx11<<<32, blocks, 0, stream>>>(A, clkRate); + } else { + kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate); + } // Not a real free, since kernel isn't done HIP_CHECK(hipFreeAsync(reinterpret_cast(A), stream)); @@ -473,7 +535,11 @@ TEST_CASE("Unit_hipMemPoolApi_Opportunistic") { REQUIRE(A != B); // Run kernel with the new memory in the second stream - kernel500ms<<<32, blocks, 0, stream2>>>(B, clkRate); + if (IsGfx11()) { + kernel500ms_gfx11<<<32, blocks, 0, stream>>>(B, clkRate); + } else { + kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate); + } HIP_CHECK(hipStreamSynchronize(stream)); HIP_CHECK(hipStreamSynchronize(stream2)); @@ -510,9 +576,15 @@ TEST_CASE("Unit_hipMemPoolApi_Default") { int blocks = 2; int clkRate; - HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0)); + + if (IsGfx11()) { + HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0)); + kernel500ms_gfx11<<<32, blocks, 0, stream>>>(A, clkRate); + } else { + HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0)); - kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate); + kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate); + } hipMemPoolAttr attr; // Not a real free, since kernel isn't done @@ -527,7 +599,11 @@ TEST_CASE("Unit_hipMemPoolApi_Default") { HIP_CHECK(hipStreamSynchronize(stream)); // Second kernel launch with new memory - kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate); + if (IsGfx11()) { + kernel500ms_gfx11<<<32, blocks, 0, stream>>>(B, clkRate); + } else { + kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate); + } HIP_CHECK(hipFreeAsync(reinterpret_cast(B), stream)); diff --git a/projects/hip-tests/catch/unit/memory/hipMemoryAllocateCoherent.cc b/projects/hip-tests/catch/unit/memory/hipMemoryAllocateCoherent.cc index 75c6c87b4e..e9fed1fc84 100644 --- a/projects/hip-tests/catch/unit/memory/hipMemoryAllocateCoherent.cc +++ b/projects/hip-tests/catch/unit/memory/hipMemoryAllocateCoherent.cc @@ -41,6 +41,21 @@ __global__ void Kernel(float* hostRes, int clkRate) { } } +__global__ void Kernel_gfx11(float* hostRes, int clkRate) { +#if HT_AMD + int tid = threadIdx.x + blockIdx.x * blockDim.x; + hostRes[tid] = tid + 1; + __threadfence_system(); + // expecting that the data is getting flushed to host here! + uint64_t start = wall_clock64()/clkRate, cur; + if (clkRate > 1) { + do { cur = wall_clock64()/clkRate-start;}while (cur < wait_sec); + } else { + do { cur = wall_clock64()/start;}while (cur < wait_sec); + } +#endif +} + TEST_CASE("Unit_hipHostMalloc_CoherentAccess") { int blocks = 2; float* hostRes; @@ -49,9 +64,14 @@ TEST_CASE("Unit_hipHostMalloc_CoherentAccess") { hostRes[0] = 0; hostRes[1] = 0; int clkRate; - HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0)); + if (IsGfx11()) { + HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0)); + } else { + HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0)); + } std::cout << clkRate << std::endl; - hipLaunchKernelGGL(HIP_KERNEL_NAME(Kernel), dim3(1), dim3(blocks), + auto Kernel_used = IsGfx11() ? Kernel_gfx11 : Kernel; + hipLaunchKernelGGL(HIP_KERNEL_NAME(Kernel_used), dim3(1), dim3(blocks), 0, 0, hostRes, clkRate); HIP_CHECK(hipGetLastError()); int eleCounter = 0; diff --git a/projects/hip-tests/catch/unit/stream/hipStreamACb_MultiThread.cc b/projects/hip-tests/catch/unit/stream/hipStreamACb_MultiThread.cc index 09946ced5d..2825914cba 100644 --- a/projects/hip-tests/catch/unit/stream/hipStreamACb_MultiThread.cc +++ b/projects/hip-tests/catch/unit/stream/hipStreamACb_MultiThread.cc @@ -53,6 +53,24 @@ static __global__ void device_function(float* C_d, float* A_d, size_t Num) { } } +static __global__ void device_function_gfx11(float* C_d, float* A_d, size_t Num) { +#if HT_AMD + size_t gputhread = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x; + + for (size_t i = gputhread; i < Num; i += stride) { + C_d[i] = A_d[i] * A_d[i]; + } + + // Delay thread 1 only in the GPU + if (gputhread == 1) { + uint64_t wait_t = 3200000000, start = wall_clock64(), cur; + do { + cur = wall_clock64() - start; + } while (cur < wait_t); + } +#endif +} static void HIPRT_CB Thread1_Callback(hipStream_t stream, hipError_t status, void* userData) { @@ -128,7 +146,8 @@ TEST_CASE("Unit_hipStreamAddCallback_MultipleThreads") { constexpr unsigned threadsPerBlock = 256; constexpr unsigned blocks = (N + 255)/threadsPerBlock; - hipLaunchKernelGGL((device_function), dim3(blocks), + auto device_function_used = IsGfx11() ? device_function_gfx11 : device_function; + hipLaunchKernelGGL((device_function_used), dim3(blocks), dim3(threadsPerBlock), 0, mystream, C_d, A_d, N); HIP_CHECK(hipGetLastError()); diff --git a/projects/hip-tests/catch/unit/stream/hipStreamWaitEvent.cc b/projects/hip-tests/catch/unit/stream/hipStreamWaitEvent.cc index d050efea2a..7965bc19ef 100644 --- a/projects/hip-tests/catch/unit/stream/hipStreamWaitEvent.cc +++ b/projects/hip-tests/catch/unit/stream/hipStreamWaitEvent.cc @@ -94,6 +94,20 @@ __global__ void waitKernel(int clockRate, int seconds) { } } +__global__ void waitKernel_gfx11(int clockRate, int seconds) { +#if HT_AMD + auto start = wall_clock64(); + auto ms = seconds * 1000; + long long waitTill = clockRate * (long long)ms; + while (1) { + auto end = wall_clock64(); + if ((end - start) > waitTill) { + return; + } + } +#endif +} + TEST_CASE("Unit_hipStreamWaitEvent_Default") { hipStream_t stream{nullptr}; hipEvent_t waitEvent{nullptr}; @@ -111,7 +125,8 @@ TEST_CASE("Unit_hipStreamWaitEvent_Default") { HIP_CHECK(hipGetDeviceProperties(&prop, deviceId)); auto clockRate = prop.clockRate; - waitKernel<<<1, 1, 0, stream>>>(clockRate, 2); // Wait for 2 seconds + auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel; + waitKernel_used<<<1, 1, 0, stream>>>(clockRate, 2); // Wait for 2 seconds HIP_CHECK(hipEventRecord(waitEvent, stream)); @@ -145,8 +160,8 @@ TEST_CASE("Unit_hipStreamWaitEvent_DifferentStreams") { hipDeviceProp_t prop{}; HIP_CHECK(hipGetDeviceProperties(&prop, deviceId)); auto clockRate = prop.clockRate; - - waitKernel<<<1, 1, 0, blockedStreamA>>>(clockRate, + auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel; + waitKernel_used<<<1, 1, 0, blockedStreamA>>>(clockRate, 3); // wait for 3 seconds HIP_CHECK(hipEventRecord(waitEvent, blockedStreamA)); @@ -155,7 +170,7 @@ TEST_CASE("Unit_hipStreamWaitEvent_DifferentStreams") { HIP_CHECK(hipStreamWaitEvent(streamBlockedOnStreamA, waitEvent, 0)); - waitKernel<<<1, 1, 0, streamBlockedOnStreamA>>>(clockRate, 2); // Wait for 2 seconds + waitKernel_used<<<1, 1, 0, streamBlockedOnStreamA>>>(clockRate, 2); // Wait for 2 seconds HIP_CHECK(hipStreamSynchronize(unblockingStream)); diff --git a/projects/hip-tests/catch/unit/streamperthread/hipStreamPerThrdTsts.cc b/projects/hip-tests/catch/unit/streamperthread/hipStreamPerThrdTsts.cc index dec35e100c..c14b38ce19 100644 --- a/projects/hip-tests/catch/unit/streamperthread/hipStreamPerThrdTsts.cc +++ b/projects/hip-tests/catch/unit/streamperthread/hipStreamPerThrdTsts.cc @@ -95,6 +95,39 @@ __global__ void StreamPerThrd1(int *A, int Pk_Clk) { *A = 1; } +__global__ void StreamPerThrd_gfx11(int *Ad, int *Ad1, size_t n, int Pk_Clk, + int Wait, int WaitEvnt = 0) { +#if HT_AMD + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < n) { + Ad[index] = Ad[index] + 10; + } + if (Wait) { + int64_t GpuFrq = (Pk_Clk * 1000); + int64_t StrtTck = wall_clock64(); + if (index == 0) { + // The following while loop checks the value in ptr for around 4 seconds + while ((wall_clock64() - StrtTck) <= (6 * GpuFrq)) { + } + if (WaitEvnt == 1) { + *Ad1 = 1; + } + } + } +#endif +} + +__global__ void StreamPerThrd1_gfx11(int *A, int Pk_Clk) { +#if HT_AMD + int64_t GpuFrq = (Pk_Clk * 1000); + int64_t StrtTck = wall_clock64(); + // The following while loop checks the value in ptr for around 3-4 seconds + while ((wall_clock64() - StrtTck) <= (3 * GpuFrq)) { + } + *A = 1; +#endif +} + __global__ void MiniKernel(int *A) { if (*A == 0) { *A = 2; // Fail condition @@ -189,12 +222,18 @@ static void EventSync() { HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&end)); HIP_CHECK(hipMemcpy(Ad, Ah, NumElms * sizeof(int), hipMemcpyHostToDevice)); - HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0)); dim3 dimBlock(blockSize, 1, 1); dim3 dimGrid((NumElms + blockSize -1)/blockSize, 1, 1); HIP_CHECK(hipEventRecord(start, hipStreamPerThread)); - StreamPerThrd<<>>(Ad, NULL, NumElms, + if (IsGfx11()) { + HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeWallClockRate, 0)); + StreamPerThrd_gfx11<<>>(Ad, NULL, NumElms, + peak_clk, 0); + } else { + HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0)); + StreamPerThrd<<>>(Ad, NULL, NumElms, peak_clk, 0); + } HIP_CHECK(hipEventRecord(end, hipStreamPerThread)); HIP_CHECK(hipEventSynchronize(end)); HIP_CHECK(hipMemcpy(Ah, Ad, NumElms * sizeof(int), hipMemcpyDeviceToHost)); @@ -226,12 +265,18 @@ TEST_CASE("Unit_hipStreamPerThreadTst_StrmQuery") { Ah[i] = CONST_NUM; } HIP_CHECK(hipMemcpy(Ad, Ah, NumElms * sizeof(int), hipMemcpyHostToDevice)); - HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0)); dim3 dimBlock(blockSize, 1, 1); dim3 dimGrid((NumElms + blockSize -1)/blockSize, 1, 1); SECTION("Test working of hipStreamQuery") { - StreamPerThrd<<>>(Ad, NULL, + if (IsGfx11()) { + HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeWallClockRate, 0)); + StreamPerThrd_gfx11<<>>(Ad, NULL, + NumElms, peak_clk, 1); + } else { + HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0)); + StreamPerThrd<<>>(Ad, NULL, NumElms, peak_clk, 1); + } err = hipStreamQuery(hipStreamPerThread); if (err != hipErrorNotReady) { WARN("hipStreamQuery on hipStreamPerThread didnt return expected error!"); @@ -245,7 +290,11 @@ TEST_CASE("Unit_hipStreamPerThreadTst_StrmQuery") { HIP_CHECK(hipHostMalloc(&Hptr, sizeof(int))); *Hptr = 0; HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast(&A_d), Hptr, 0)); - StreamPerThrd1<<<1, 1, 0, hipStreamPerThread>>>(A_d, peak_clk); + if (IsGfx11()) { + StreamPerThrd1_gfx11<<<1, 1, 0, hipStreamPerThread>>>(A_d, peak_clk); + } else { + StreamPerThrd1<<<1, 1, 0, hipStreamPerThread>>>(A_d, peak_clk); + } HIP_CHECK(hipStreamAddCallback(hipStreamPerThread, CallBackFunctn, A_d, 0)); HIP_CHECK(hipStreamSynchronize(hipStreamPerThread)); HIP_CHECK(hipHostFree(Hptr)); @@ -277,11 +326,17 @@ TEST_CASE("Unit_hipStreamPerThread_MangdMem") { hipStreamPerThread)); } int peak_clk; - HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0)); dim3 dimBlock(blockSize, 1, 1); dim3 dimGrid((NumElms + blockSize -1)/blockSize, 1, 1); - StreamPerThrd<<>>(Hmm, NULL, + if (IsGfx11()) { + HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeWallClockRate, 0)); + StreamPerThrd_gfx11<<>>(Hmm, NULL, + NumElms, peak_clk, 0); + } else { + HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0)); + StreamPerThrd<<>>(Hmm, NULL, NumElms, peak_clk, 0); + } HIP_CHECK(hipStreamSynchronize(hipStreamPerThread)); // Validating the result int MisMatch = 0; @@ -313,11 +368,17 @@ TEST_CASE("Unit_hipStreamPerThread_ChildProc") { Ah[i] = CONST_NUM; } HIP_CHECK(hipMemcpy(Ad, Ah, NumElms * sizeof(int), hipMemcpyHostToDevice)); - HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0)); dim3 dimBlock(blockSize, 1, 1); dim3 dimGrid((NumElms + blockSize -1)/blockSize, 1, 1); - StreamPerThrd<<>>(Ad, NULL, + if (IsGfx11()) { + HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeWallClockRate, 0)); + StreamPerThrd_gfx11<<>>(Ad, NULL, + NumElms, peak_clk, 0); + } else{ + HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0)); + StreamPerThrd<<>>(Ad, NULL, NumElms, peak_clk, 0); + } HIP_CHECK(hipStreamSynchronize(hipStreamPerThread)); HIP_CHECK(hipMemcpy(Ah, Ad, NumElms * sizeof(int), hipMemcpyDeviceToHost)); int MisMatch = 0; @@ -380,13 +441,17 @@ TEST_CASE("Unit_hipStreamPerThread_StrmWaitEvt") { HIP_CHECK(hipMalloc(&Ad1, sizeof(int))); HIP_CHECK(hipMemset(Ad1, 0, sizeof(int))); int peak_clk; - HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0)); dim3 dimBlock(blockSize, 1, 1); dim3 dimGrid((NumElms + blockSize -1)/blockSize, 1, 1); hipEvent_t e1; HIPCHECK(hipEventCreate(&e1)); - StreamPerThrd<<>>(Ad, Ad1, NumElms, - peak_clk, 1, 1); + if (IsGfx11()) { + HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeWallClockRate, 0)); + StreamPerThrd_gfx11<<>>(Ad, Ad1, NumElms, peak_clk, 1, 1); + } else { + HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0)); + StreamPerThrd<<>>(Ad, Ad1, NumElms, peak_clk, 1, 1); + } HIP_CHECK(hipEventRecord(e1, Strm)); HIP_CHECK(hipStreamWaitEvent(hipStreamPerThread, e1, 0 /*flags*/)); MiniKernel<<<1, 1, 0, hipStreamPerThread>>>(Ad1);