SWDEV-337452 - Changing Clock64 to WallClock64 in tests for gfx11. (#78)

Change-Id: I484fe9ff7cd56c70a37a3ac5a4a55812f8557259


[ROCm/hip-tests commit: 87fac87657]
Tento commit je obsažen v:
ROCm CI Service Account
2023-01-07 04:35:21 +05:30
odevzdal GitHub
rodič 9bc91a13b2
revize d8b7cb28ff
10 změnil soubory, kde provedl 339 přidání a 50 odebrání
+38 -4
Zobrazit soubor
@@ -140,6 +140,31 @@ static void initHipCtx(hipCtx_t* pcontext) {
#define HIP_ARRAY hipArray*
#endif
static inline bool IsGfx11() {
#if defined(HT_NVIDIA)
return false;
#elif defined(HT_AMD)
int device = -1;
hipDeviceProp_t props{};
HIP_CHECK(hipGetDevice(&device));
HIP_CHECK(hipGetDeviceProperties(&props, device));
// Get GCN Arch Name and compare to check if it is gfx11
std::string arch = std::string(props.gcnArchName);
auto pos = arch.find(":");
if (pos != std::string::npos)
arch = arch.substr(0, pos);
if(arch.size() >= 5)
arch = arch.substr(0,5);
return (arch == std::string("gfx11")) ? true : false;
#else
std::cout<<"Have to be either Nvidia or AMD platform, asserting"<<std::endl;
assert(false);
#endif
}
// Utility Functions
namespace HipTest {
@@ -335,6 +360,14 @@ static __global__ void waitKernel(clock_t offset) {
}
}
static __global__ void waitKernel_gfx11(clock_t offset) {
#if HT_AMD
auto start = wall_clock64();
while ((wall_clock64() - start) < offset) {
}
#endif
}
// helper function used to set the device frequency variable
// estimates the number of clock ticks in 1 second
static size_t findTicksPerSecond() {
@@ -350,9 +383,9 @@ static size_t findTicksPerSecond() {
hipEvent_t start, stop;
HIP_CHECK(hipEventCreate(&start));
HIP_CHECK(hipEventCreate(&stop));
auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel;
// Warmup
hipLaunchKernelGGL(waitKernel, dim3(1), dim3(1), 0, 0, clockTicksPerSecond);
hipLaunchKernelGGL(waitKernel_used, dim3(1), dim3(1), 0, 0, clockTicksPerSecond);
HIP_CHECK(hipGetLastError());
HIP_CHECK(hipDeviceSynchronize());
@@ -360,7 +393,7 @@ static size_t findTicksPerSecond() {
// after 10 attempts the result is likely good enough so just accept it
for (int attempts = 10; attempts > 0; --attempts) {
HIP_CHECK(hipEventRecord(start));
hipLaunchKernelGGL(waitKernel, dim3(1), dim3(1), 0, 0, clockTicksPerSecond);
hipLaunchKernelGGL(waitKernel_used, dim3(1), dim3(1), 0, 0, clockTicksPerSecond);
HIP_CHECK(hipEventRecord(stop));
HIP_CHECK(hipGetLastError());
HIP_CHECK(hipEventSynchronize(stop));
@@ -396,7 +429,8 @@ static inline void runKernelForDuration(std::chrono::milliseconds duration,
// precision so that's acceptable.
static size_t ticksPerSecond = findTicksPerSecond();
const auto millis = duration.count();
hipLaunchKernelGGL(waitKernel, dim3(1), dim3(1), 0, stream, ticksPerSecond * millis / 1000);
auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel;
hipLaunchKernelGGL(waitKernel_used, dim3(1), dim3(1), 0, stream, ticksPerSecond * millis / 1000);
}
} // namespace HipTest
+5 -1
Zobrazit soubor
@@ -128,7 +128,11 @@ __global__ void Iota(T* const out, size_t pitch, size_t w, size_t h, size_t d) {
inline void LaunchDelayKernel(const std::chrono::milliseconds interval, const hipStream_t stream) {
int ticks_per_ms = 0;
// Clock rate is in kHz => number of clock ticks in a millisecond
HIP_CHECK(hipDeviceGetAttribute(&ticks_per_ms, hipDeviceAttributeClockRate, 0));
if (IsGfx11()) {
HIPCHECK(hipDeviceGetAttribute(&ticks_per_ms, hipDeviceAttributeWallClockRate, 0));
} else {
HIPCHECK(hipDeviceGetAttribute(&ticks_per_ms, hipDeviceAttributeClockRate, 0));
}
Delay<<<1, 1, 0, stream>>>(interval.count(), ticks_per_ms);
HIP_CHECK(hipGetLastError());
}
+32 -6
Zobrazit soubor
@@ -49,6 +49,19 @@ __global__ void CoherentTst(int *ptr, int PeakClk) {
}
}
__global__ void CoherentTst_gfx11(int *ptr, int PeakClk) {
#if HT_AMD
// Incrementing the value by 1
int64_t GpuFrq = int64_t(PeakClk) * 1000;
int64_t StrtTck = wall_clock64();
atomicAdd(ptr, 1);
// The following while loop checks the value in ptr for around 3-4 seconds
while ((wall_clock64() - StrtTck) <= (3 * GpuFrq)) {
if (atomicCAS(ptr, 3, 4) == 3) break;
}
#endif
}
__global__ void SquareKrnl(int *ptr) {
// ptr value squared here
*ptr = (*ptr) * (*ptr);
@@ -64,14 +77,27 @@ static void TstCoherency(int *Ptr, bool HmmMem) {
HIP_CHECK(hipStreamCreate(&strm));
// storing value 1 in the memory created above
*Ptr = 1;
// Getting gpu frequency
HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
if (!HmmMem) {
HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void **>(&Dptr), Ptr,
0));
CoherentTst<<<1, 1, 0, strm>>>(Dptr, peak_clk);
if (IsGfx11()) {
HIPCHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeWallClockRate, 0));
} else {
CoherentTst<<<1, 1, 0, strm>>>(Ptr, peak_clk);
HIPCHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
}
if (!HmmMem) {
HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void **>(&Dptr), Ptr, 0));
if (IsGfx11()) {
CoherentTst_gfx11<<<1, 1, 0, strm>>>(Dptr, peak_clk);
} else {
CoherentTst<<<1, 1, 0, strm>>>(Dptr, peak_clk);
}
} else {
if (IsGfx11()) {
CoherentTst_gfx11<<<1, 1, 0, strm>>>(Ptr, peak_clk);
} else {
CoherentTst<<<1, 1, 0, strm>>>(Ptr, peak_clk);
}
}
// looping until the value is 2 for 3 seconds
std::chrono::steady_clock::time_point start =
+16 -1
Zobrazit soubor
@@ -34,6 +34,20 @@ __global__ void waitKernel(int clockRate, int seconds) {
}
}
__global__ void waitKernel_gfx11(int clockRate, int seconds) {
#if HT_AMD
auto start = wall_clock64();
auto ms = seconds * 1000;
long long waitTill = clockRate * (long long)ms;
while (1) {
auto end = wall_clock64();
if ((end - start) > waitTill) {
return;
}
}
#endif
}
TEST_CASE("Unit_hipEventQuery_DifferentDevice") {
hipEvent_t event1{}, event2{};
HIP_CHECK(hipEventCreate(&event1));
@@ -54,9 +68,10 @@ TEST_CASE("Unit_hipEventQuery_DifferentDevice") {
HIP_CHECK(hipSetDevice(0));
HIP_CHECK(hipEventRecord(event1, stream));
auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel;
// Start kernel and wait for 3 seconds
// Make sure you increase this time if you add more tests here
waitKernel<<<1, 1, 0, stream>>>(clockRate, 3);
waitKernel_used<<<1, 1, 0, stream>>>(clockRate, 3);
HIP_CHECK(hipEventRecord(event2, stream));
@@ -67,6 +67,16 @@ static __global__ void sqr_ker_func(int* a, int* b, int clockrate) {
do { cur = clock64()/clockrate - start;}while (cur < wait_t);
}
static __global__ void sqr_ker_func_gfx11(int* a, int* b, int clockrate) {
#if HT_AMD
int tx = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x;
if (tx < LEN) b[tx] = a[tx]*a[tx];
uint64_t wait_t = DELAY_IN_MS,
start = wall_clock64()/clockrate, cur;
do { cur = wall_clock64()/clockrate - start;}while (cur < wait_t);
#endif
}
/**
* Scenario 1: Test to validate setting different events in executable graph.
*/
@@ -106,10 +116,15 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") {
inp_h, memsize, hipMemcpyHostToDevice));
// Get device clock rate
int clkRate = 0;
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
if (IsGfx11()) {
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
} else {
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
}
// kernel1
auto sqr_ker_func_used = IsGfx11() ? sqr_ker_func_gfx11 : sqr_ker_func;
void* kernelArgs[] = {&inp_d, &out_d, reinterpret_cast<void *>(&clkRate)};
kernelNodeParams1.func = reinterpret_cast<void *>(sqr_ker_func);
kernelNodeParams1.func = reinterpret_cast<void *>(sqr_ker_func_used);
kernelNodeParams1.gridDim = dim3(GRID_DIM);
kernelNodeParams1.blockDim = dim3(BLK_DIM);
kernelNodeParams1.sharedMemBytes = 0;
+93 -17
Zobrazit soubor
@@ -123,6 +123,21 @@ __global__ void kernel500ms(float* hostRes, int clkRate) {
}
}
__global__ void kernel500ms_gfx11(float* hostRes, int clkRate) {
#if HT_AMD
int tid = threadIdx.x + blockIdx.x * blockDim.x;
hostRes[tid] = tid + 1;
__threadfence_system();
// expecting that the data is getting flushed to host here!
uint64_t start = wall_clock64()/clkRate, cur;
if (clkRate > 1) {
do { cur = wall_clock64()/clkRate-start;}while (cur < wait_ms);
} else {
do { cur = wall_clock64()/start;}while (cur < wait_ms);
}
#endif
}
TEST_CASE("Unit_hipMemPoolApi_BasicAlloc") {
int mem_pool_support = 0;
HIP_CHECK(hipDeviceGetAttribute(&mem_pool_support, hipDeviceAttributeMemoryPoolsSupported, 0));
@@ -147,9 +162,14 @@ TEST_CASE("Unit_hipMemPoolApi_BasicAlloc") {
int blocks = 1024;
int clkRate;
hipMemPoolAttr attr;
HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
if (IsGfx11()) {
HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
kernel500ms_gfx11<<<32, blocks, 0, stream>>>(B, clkRate);
} else {
HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
}
HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(B), stream));
@@ -229,9 +249,14 @@ TEST_CASE("Unit_hipMemPoolApi_BasicTrim") {
int blocks = 2;
int clkRate;
HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
if (IsGfx11()) {
HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
kernel500ms_gfx11<<<32, blocks, 0, stream>>>(B, clkRate);
} else {
HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
}
hipMemPoolAttr attr;
attr = hipMemPoolAttrReleaseThreshold;
@@ -312,9 +337,15 @@ TEST_CASE("Unit_hipMemPoolApi_BasicReuse") {
int blocks = 2;
int clkRate;
HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
if (IsGfx11()) {
HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
kernel500ms_gfx11<<<32, blocks, 0, stream>>>(A, clkRate);
} else {
HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
}
hipMemPoolAttr attr;
// Not a real free, since kernel isn't done
@@ -329,7 +360,11 @@ TEST_CASE("Unit_hipMemPoolApi_BasicReuse") {
HIP_CHECK(hipStreamSynchronize(stream));
// Second kernel launch with new memory
kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
if (IsGfx11()) {
kernel500ms_gfx11<<<32, blocks, 0, stream>>>(B, clkRate);
} else {
kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
}
HIP_CHECK(hipStreamSynchronize(stream));
@@ -369,7 +404,11 @@ TEST_CASE("Unit_hipMemPoolApi_Opportunistic") {
hipMemPoolAttr attr;
int blocks = 2;
int clkRate;
HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
if (IsGfx11()) {
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
} else {
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
}
float *A, *B, *C;
hipStream_t stream, stream2;
@@ -395,7 +434,11 @@ TEST_CASE("Unit_hipMemPoolApi_Opportunistic") {
HIP_CHECK(hipMemPoolSetAttribute(mem_pool, attr, &value));
// Run kernel for 500 ms in the first stream
kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
if (IsGfx11()) {
kernel500ms_gfx11<<<32, blocks, 0, stream>>>(A, clkRate);
} else {
kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
}
// Not a real free, since kernel isn't done
HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(A), stream));
@@ -410,7 +453,11 @@ TEST_CASE("Unit_hipMemPoolApi_Opportunistic") {
REQUIRE(A != B);
// Run kernel with the new memory in the second stream
kernel500ms<<<32, blocks, 0, stream2>>>(B, clkRate);
if (IsGfx11()) {
kernel500ms_gfx11<<<32, blocks, 0, stream>>>(B, clkRate);
} else {
kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
}
HIP_CHECK(hipStreamSynchronize(stream));
HIP_CHECK(hipStreamSynchronize(stream2));
@@ -428,7 +475,13 @@ TEST_CASE("Unit_hipMemPoolApi_Opportunistic") {
HIP_CHECK(hipMemPoolSetAttribute(mem_pool, attr, &value));
// Run kernel for 500 ms in the first stream
kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
if (IsGfx11()) {
HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
kernel500ms_gfx11<<<32, blocks, 0, stream>>>(A, clkRate);
} else {
HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
}
// Not a real free, since kernel isn't done
HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(A), stream));
@@ -443,7 +496,11 @@ TEST_CASE("Unit_hipMemPoolApi_Opportunistic") {
REQUIRE(A == B);
// Run kernel with the new memory in the second stream
kernel500ms<<<32, blocks, 0, stream2>>>(B, clkRate);
if (IsGfx11()) {
kernel500ms_gfx11<<<32, blocks, 0, stream>>>(B, clkRate);
} else {
kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
}
HIP_CHECK(hipStreamSynchronize(stream));
HIP_CHECK(hipStreamSynchronize(stream2));
@@ -461,7 +518,12 @@ TEST_CASE("Unit_hipMemPoolApi_Opportunistic") {
HIP_CHECK(hipMemPoolSetAttribute(mem_pool, attr, &value));
// Run kernel for 500 ms in the first stream
kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
if (IsGfx11()) {
kernel500ms_gfx11<<<32, blocks, 0, stream>>>(A, clkRate);
} else {
kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
}
// Not a real free, since kernel isn't done
HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(A), stream));
@@ -473,7 +535,11 @@ TEST_CASE("Unit_hipMemPoolApi_Opportunistic") {
REQUIRE(A != B);
// Run kernel with the new memory in the second stream
kernel500ms<<<32, blocks, 0, stream2>>>(B, clkRate);
if (IsGfx11()) {
kernel500ms_gfx11<<<32, blocks, 0, stream>>>(B, clkRate);
} else {
kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
}
HIP_CHECK(hipStreamSynchronize(stream));
HIP_CHECK(hipStreamSynchronize(stream2));
@@ -510,9 +576,15 @@ TEST_CASE("Unit_hipMemPoolApi_Default") {
int blocks = 2;
int clkRate;
HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
if (IsGfx11()) {
HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
kernel500ms_gfx11<<<32, blocks, 0, stream>>>(A, clkRate);
} else {
HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
}
hipMemPoolAttr attr;
// Not a real free, since kernel isn't done
@@ -527,7 +599,11 @@ TEST_CASE("Unit_hipMemPoolApi_Default") {
HIP_CHECK(hipStreamSynchronize(stream));
// Second kernel launch with new memory
kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
if (IsGfx11()) {
kernel500ms_gfx11<<<32, blocks, 0, stream>>>(B, clkRate);
} else {
kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
}
HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(B), stream));
+22 -2
Zobrazit soubor
@@ -41,6 +41,21 @@ __global__ void Kernel(float* hostRes, int clkRate) {
}
}
__global__ void Kernel_gfx11(float* hostRes, int clkRate) {
#if HT_AMD
int tid = threadIdx.x + blockIdx.x * blockDim.x;
hostRes[tid] = tid + 1;
__threadfence_system();
// expecting that the data is getting flushed to host here!
uint64_t start = wall_clock64()/clkRate, cur;
if (clkRate > 1) {
do { cur = wall_clock64()/clkRate-start;}while (cur < wait_sec);
} else {
do { cur = wall_clock64()/start;}while (cur < wait_sec);
}
#endif
}
TEST_CASE("Unit_hipHostMalloc_CoherentAccess") {
int blocks = 2;
float* hostRes;
@@ -49,9 +64,14 @@ TEST_CASE("Unit_hipHostMalloc_CoherentAccess") {
hostRes[0] = 0;
hostRes[1] = 0;
int clkRate;
HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
if (IsGfx11()) {
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
} else {
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
}
std::cout << clkRate << std::endl;
hipLaunchKernelGGL(HIP_KERNEL_NAME(Kernel), dim3(1), dim3(blocks),
auto Kernel_used = IsGfx11() ? Kernel_gfx11 : Kernel;
hipLaunchKernelGGL(HIP_KERNEL_NAME(Kernel_used), dim3(1), dim3(blocks),
0, 0, hostRes, clkRate);
HIP_CHECK(hipGetLastError());
int eleCounter = 0;
+20 -1
Zobrazit soubor
@@ -53,6 +53,24 @@ static __global__ void device_function(float* C_d, float* A_d, size_t Num) {
}
}
static __global__ void device_function_gfx11(float* C_d, float* A_d, size_t Num) {
#if HT_AMD
size_t gputhread = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x;
for (size_t i = gputhread; i < Num; i += stride) {
C_d[i] = A_d[i] * A_d[i];
}
// Delay thread 1 only in the GPU
if (gputhread == 1) {
uint64_t wait_t = 3200000000, start = wall_clock64(), cur;
do {
cur = wall_clock64() - start;
} while (cur < wait_t);
}
#endif
}
static void HIPRT_CB Thread1_Callback(hipStream_t stream, hipError_t status,
void* userData) {
@@ -128,7 +146,8 @@ TEST_CASE("Unit_hipStreamAddCallback_MultipleThreads") {
constexpr unsigned threadsPerBlock = 256;
constexpr unsigned blocks = (N + 255)/threadsPerBlock;
hipLaunchKernelGGL((device_function), dim3(blocks),
auto device_function_used = IsGfx11() ? device_function_gfx11 : device_function;
hipLaunchKernelGGL((device_function_used), dim3(blocks),
dim3(threadsPerBlock), 0,
mystream, C_d, A_d, N);
HIP_CHECK(hipGetLastError());
+19 -4
Zobrazit soubor
@@ -94,6 +94,20 @@ __global__ void waitKernel(int clockRate, int seconds) {
}
}
__global__ void waitKernel_gfx11(int clockRate, int seconds) {
#if HT_AMD
auto start = wall_clock64();
auto ms = seconds * 1000;
long long waitTill = clockRate * (long long)ms;
while (1) {
auto end = wall_clock64();
if ((end - start) > waitTill) {
return;
}
}
#endif
}
TEST_CASE("Unit_hipStreamWaitEvent_Default") {
hipStream_t stream{nullptr};
hipEvent_t waitEvent{nullptr};
@@ -111,7 +125,8 @@ TEST_CASE("Unit_hipStreamWaitEvent_Default") {
HIP_CHECK(hipGetDeviceProperties(&prop, deviceId));
auto clockRate = prop.clockRate;
waitKernel<<<1, 1, 0, stream>>>(clockRate, 2); // Wait for 2 seconds
auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel;
waitKernel_used<<<1, 1, 0, stream>>>(clockRate, 2); // Wait for 2 seconds
HIP_CHECK(hipEventRecord(waitEvent, stream));
@@ -145,8 +160,8 @@ TEST_CASE("Unit_hipStreamWaitEvent_DifferentStreams") {
hipDeviceProp_t prop{};
HIP_CHECK(hipGetDeviceProperties(&prop, deviceId));
auto clockRate = prop.clockRate;
waitKernel<<<1, 1, 0, blockedStreamA>>>(clockRate,
auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel;
waitKernel_used<<<1, 1, 0, blockedStreamA>>>(clockRate,
3); // wait for 3 seconds
HIP_CHECK(hipEventRecord(waitEvent, blockedStreamA));
@@ -155,7 +170,7 @@ TEST_CASE("Unit_hipStreamWaitEvent_DifferentStreams") {
HIP_CHECK(hipStreamWaitEvent(streamBlockedOnStreamA, waitEvent, 0));
waitKernel<<<1, 1, 0, streamBlockedOnStreamA>>>(clockRate, 2); // Wait for 2 seconds
waitKernel_used<<<1, 1, 0, streamBlockedOnStreamA>>>(clockRate, 2); // Wait for 2 seconds
HIP_CHECK(hipStreamSynchronize(unblockingStream));
+77 -12
Zobrazit soubor
@@ -95,6 +95,39 @@ __global__ void StreamPerThrd1(int *A, int Pk_Clk) {
*A = 1;
}
__global__ void StreamPerThrd_gfx11(int *Ad, int *Ad1, size_t n, int Pk_Clk,
int Wait, int WaitEvnt = 0) {
#if HT_AMD
size_t index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < n) {
Ad[index] = Ad[index] + 10;
}
if (Wait) {
int64_t GpuFrq = (Pk_Clk * 1000);
int64_t StrtTck = wall_clock64();
if (index == 0) {
// The following while loop checks the value in ptr for around 4 seconds
while ((wall_clock64() - StrtTck) <= (6 * GpuFrq)) {
}
if (WaitEvnt == 1) {
*Ad1 = 1;
}
}
}
#endif
}
__global__ void StreamPerThrd1_gfx11(int *A, int Pk_Clk) {
#if HT_AMD
int64_t GpuFrq = (Pk_Clk * 1000);
int64_t StrtTck = wall_clock64();
// The following while loop checks the value in ptr for around 3-4 seconds
while ((wall_clock64() - StrtTck) <= (3 * GpuFrq)) {
}
*A = 1;
#endif
}
__global__ void MiniKernel(int *A) {
if (*A == 0) {
*A = 2; // Fail condition
@@ -189,12 +222,18 @@ static void EventSync() {
HIP_CHECK(hipEventCreate(&start));
HIP_CHECK(hipEventCreate(&end));
HIP_CHECK(hipMemcpy(Ad, Ah, NumElms * sizeof(int), hipMemcpyHostToDevice));
HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
dim3 dimBlock(blockSize, 1, 1);
dim3 dimGrid((NumElms + blockSize -1)/blockSize, 1, 1);
HIP_CHECK(hipEventRecord(start, hipStreamPerThread));
StreamPerThrd<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL, NumElms,
if (IsGfx11()) {
HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeWallClockRate, 0));
StreamPerThrd_gfx11<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL, NumElms,
peak_clk, 0);
} else {
HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
StreamPerThrd<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL, NumElms,
peak_clk, 0);
}
HIP_CHECK(hipEventRecord(end, hipStreamPerThread));
HIP_CHECK(hipEventSynchronize(end));
HIP_CHECK(hipMemcpy(Ah, Ad, NumElms * sizeof(int), hipMemcpyDeviceToHost));
@@ -226,12 +265,18 @@ TEST_CASE("Unit_hipStreamPerThreadTst_StrmQuery") {
Ah[i] = CONST_NUM;
}
HIP_CHECK(hipMemcpy(Ad, Ah, NumElms * sizeof(int), hipMemcpyHostToDevice));
HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
dim3 dimBlock(blockSize, 1, 1);
dim3 dimGrid((NumElms + blockSize -1)/blockSize, 1, 1);
SECTION("Test working of hipStreamQuery") {
StreamPerThrd<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL,
if (IsGfx11()) {
HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeWallClockRate, 0));
StreamPerThrd_gfx11<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL,
NumElms, peak_clk, 1);
} else {
HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
StreamPerThrd<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL,
NumElms, peak_clk, 1);
}
err = hipStreamQuery(hipStreamPerThread);
if (err != hipErrorNotReady) {
WARN("hipStreamQuery on hipStreamPerThread didnt return expected error!");
@@ -245,7 +290,11 @@ TEST_CASE("Unit_hipStreamPerThreadTst_StrmQuery") {
HIP_CHECK(hipHostMalloc(&Hptr, sizeof(int)));
*Hptr = 0;
HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&A_d), Hptr, 0));
StreamPerThrd1<<<1, 1, 0, hipStreamPerThread>>>(A_d, peak_clk);
if (IsGfx11()) {
StreamPerThrd1_gfx11<<<1, 1, 0, hipStreamPerThread>>>(A_d, peak_clk);
} else {
StreamPerThrd1<<<1, 1, 0, hipStreamPerThread>>>(A_d, peak_clk);
}
HIP_CHECK(hipStreamAddCallback(hipStreamPerThread, CallBackFunctn, A_d, 0));
HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));
HIP_CHECK(hipHostFree(Hptr));
@@ -277,11 +326,17 @@ TEST_CASE("Unit_hipStreamPerThread_MangdMem") {
hipStreamPerThread));
}
int peak_clk;
HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
dim3 dimBlock(blockSize, 1, 1);
dim3 dimGrid((NumElms + blockSize -1)/blockSize, 1, 1);
StreamPerThrd<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Hmm, NULL,
if (IsGfx11()) {
HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeWallClockRate, 0));
StreamPerThrd_gfx11<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Hmm, NULL,
NumElms, peak_clk, 0);
} else {
HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
StreamPerThrd<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Hmm, NULL,
NumElms, peak_clk, 0);
}
HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));
// Validating the result
int MisMatch = 0;
@@ -313,11 +368,17 @@ TEST_CASE("Unit_hipStreamPerThread_ChildProc") {
Ah[i] = CONST_NUM;
}
HIP_CHECK(hipMemcpy(Ad, Ah, NumElms * sizeof(int), hipMemcpyHostToDevice));
HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
dim3 dimBlock(blockSize, 1, 1);
dim3 dimGrid((NumElms + blockSize -1)/blockSize, 1, 1);
StreamPerThrd<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL,
if (IsGfx11()) {
HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeWallClockRate, 0));
StreamPerThrd_gfx11<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL,
NumElms, peak_clk, 0);
} else{
HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
StreamPerThrd<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL,
NumElms, peak_clk, 0);
}
HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));
HIP_CHECK(hipMemcpy(Ah, Ad, NumElms * sizeof(int), hipMemcpyDeviceToHost));
int MisMatch = 0;
@@ -380,13 +441,17 @@ TEST_CASE("Unit_hipStreamPerThread_StrmWaitEvt") {
HIP_CHECK(hipMalloc(&Ad1, sizeof(int)));
HIP_CHECK(hipMemset(Ad1, 0, sizeof(int)));
int peak_clk;
HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
dim3 dimBlock(blockSize, 1, 1);
dim3 dimGrid((NumElms + blockSize -1)/blockSize, 1, 1);
hipEvent_t e1;
HIPCHECK(hipEventCreate(&e1));
StreamPerThrd<<<dimGrid, dimBlock, 0, Strm>>>(Ad, Ad1, NumElms,
peak_clk, 1, 1);
if (IsGfx11()) {
HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeWallClockRate, 0));
StreamPerThrd_gfx11<<<dimGrid, dimBlock, 0, Strm>>>(Ad, Ad1, NumElms, peak_clk, 1, 1);
} else {
HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
StreamPerThrd<<<dimGrid, dimBlock, 0, Strm>>>(Ad, Ad1, NumElms, peak_clk, 1, 1);
}
HIP_CHECK(hipEventRecord(e1, Strm));
HIP_CHECK(hipStreamWaitEvent(hipStreamPerThread, e1, 0 /*flags*/));
MiniKernel<<<1, 1, 0, hipStreamPerThread>>>(Ad1);