SWDEV-337452 - Changing Clock64 to WallClock64 in tests for gfx11. (#78)
Change-Id: I484fe9ff7cd56c70a37a3ac5a4a55812f8557259
[ROCm/hip-tests commit: 87fac87657]
Tento commit je obsažen v:
odevzdal
GitHub
rodič
9bc91a13b2
revize
d8b7cb28ff
@@ -140,6 +140,31 @@ static void initHipCtx(hipCtx_t* pcontext) {
|
||||
#define HIP_ARRAY hipArray*
|
||||
#endif
|
||||
|
||||
static inline bool IsGfx11() {
|
||||
#if defined(HT_NVIDIA)
|
||||
return false;
|
||||
#elif defined(HT_AMD)
|
||||
int device = -1;
|
||||
hipDeviceProp_t props{};
|
||||
HIP_CHECK(hipGetDevice(&device));
|
||||
HIP_CHECK(hipGetDeviceProperties(&props, device));
|
||||
|
||||
// Get GCN Arch Name and compare to check if it is gfx11
|
||||
std::string arch = std::string(props.gcnArchName);
|
||||
auto pos = arch.find(":");
|
||||
if (pos != std::string::npos)
|
||||
arch = arch.substr(0, pos);
|
||||
|
||||
if(arch.size() >= 5)
|
||||
arch = arch.substr(0,5);
|
||||
|
||||
return (arch == std::string("gfx11")) ? true : false;
|
||||
#else
|
||||
std::cout<<"Have to be either Nvidia or AMD platform, asserting"<<std::endl;
|
||||
assert(false);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
// Utility Functions
|
||||
namespace HipTest {
|
||||
@@ -335,6 +360,14 @@ static __global__ void waitKernel(clock_t offset) {
|
||||
}
|
||||
}
|
||||
|
||||
static __global__ void waitKernel_gfx11(clock_t offset) {
|
||||
#if HT_AMD
|
||||
auto start = wall_clock64();
|
||||
while ((wall_clock64() - start) < offset) {
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// helper function used to set the device frequency variable
|
||||
// estimates the number of clock ticks in 1 second
|
||||
static size_t findTicksPerSecond() {
|
||||
@@ -350,9 +383,9 @@ static size_t findTicksPerSecond() {
|
||||
hipEvent_t start, stop;
|
||||
HIP_CHECK(hipEventCreate(&start));
|
||||
HIP_CHECK(hipEventCreate(&stop));
|
||||
|
||||
auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel;
|
||||
// Warmup
|
||||
hipLaunchKernelGGL(waitKernel, dim3(1), dim3(1), 0, 0, clockTicksPerSecond);
|
||||
hipLaunchKernelGGL(waitKernel_used, dim3(1), dim3(1), 0, 0, clockTicksPerSecond);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
|
||||
@@ -360,7 +393,7 @@ static size_t findTicksPerSecond() {
|
||||
// after 10 attempts the result is likely good enough so just accept it
|
||||
for (int attempts = 10; attempts > 0; --attempts) {
|
||||
HIP_CHECK(hipEventRecord(start));
|
||||
hipLaunchKernelGGL(waitKernel, dim3(1), dim3(1), 0, 0, clockTicksPerSecond);
|
||||
hipLaunchKernelGGL(waitKernel_used, dim3(1), dim3(1), 0, 0, clockTicksPerSecond);
|
||||
HIP_CHECK(hipEventRecord(stop));
|
||||
HIP_CHECK(hipGetLastError());
|
||||
HIP_CHECK(hipEventSynchronize(stop));
|
||||
@@ -396,7 +429,8 @@ static inline void runKernelForDuration(std::chrono::milliseconds duration,
|
||||
// precision so that's acceptable.
|
||||
static size_t ticksPerSecond = findTicksPerSecond();
|
||||
const auto millis = duration.count();
|
||||
hipLaunchKernelGGL(waitKernel, dim3(1), dim3(1), 0, stream, ticksPerSecond * millis / 1000);
|
||||
auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel;
|
||||
hipLaunchKernelGGL(waitKernel_used, dim3(1), dim3(1), 0, stream, ticksPerSecond * millis / 1000);
|
||||
}
|
||||
|
||||
} // namespace HipTest
|
||||
|
||||
@@ -128,7 +128,11 @@ __global__ void Iota(T* const out, size_t pitch, size_t w, size_t h, size_t d) {
|
||||
inline void LaunchDelayKernel(const std::chrono::milliseconds interval, const hipStream_t stream) {
|
||||
int ticks_per_ms = 0;
|
||||
// Clock rate is in kHz => number of clock ticks in a millisecond
|
||||
HIP_CHECK(hipDeviceGetAttribute(&ticks_per_ms, hipDeviceAttributeClockRate, 0));
|
||||
if (IsGfx11()) {
|
||||
HIPCHECK(hipDeviceGetAttribute(&ticks_per_ms, hipDeviceAttributeWallClockRate, 0));
|
||||
} else {
|
||||
HIPCHECK(hipDeviceGetAttribute(&ticks_per_ms, hipDeviceAttributeClockRate, 0));
|
||||
}
|
||||
Delay<<<1, 1, 0, stream>>>(interval.count(), ticks_per_ms);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
}
|
||||
|
||||
@@ -49,6 +49,19 @@ __global__ void CoherentTst(int *ptr, int PeakClk) {
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void CoherentTst_gfx11(int *ptr, int PeakClk) {
|
||||
#if HT_AMD
|
||||
// Incrementing the value by 1
|
||||
int64_t GpuFrq = int64_t(PeakClk) * 1000;
|
||||
int64_t StrtTck = wall_clock64();
|
||||
atomicAdd(ptr, 1);
|
||||
// The following while loop checks the value in ptr for around 3-4 seconds
|
||||
while ((wall_clock64() - StrtTck) <= (3 * GpuFrq)) {
|
||||
if (atomicCAS(ptr, 3, 4) == 3) break;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
__global__ void SquareKrnl(int *ptr) {
|
||||
// ptr value squared here
|
||||
*ptr = (*ptr) * (*ptr);
|
||||
@@ -64,14 +77,27 @@ static void TstCoherency(int *Ptr, bool HmmMem) {
|
||||
HIP_CHECK(hipStreamCreate(&strm));
|
||||
// storing value 1 in the memory created above
|
||||
*Ptr = 1;
|
||||
|
||||
// Getting gpu frequency
|
||||
HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
|
||||
if (!HmmMem) {
|
||||
HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void **>(&Dptr), Ptr,
|
||||
0));
|
||||
CoherentTst<<<1, 1, 0, strm>>>(Dptr, peak_clk);
|
||||
if (IsGfx11()) {
|
||||
HIPCHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeWallClockRate, 0));
|
||||
} else {
|
||||
CoherentTst<<<1, 1, 0, strm>>>(Ptr, peak_clk);
|
||||
HIPCHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
|
||||
}
|
||||
|
||||
if (!HmmMem) {
|
||||
HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void **>(&Dptr), Ptr, 0));
|
||||
if (IsGfx11()) {
|
||||
CoherentTst_gfx11<<<1, 1, 0, strm>>>(Dptr, peak_clk);
|
||||
} else {
|
||||
CoherentTst<<<1, 1, 0, strm>>>(Dptr, peak_clk);
|
||||
}
|
||||
} else {
|
||||
if (IsGfx11()) {
|
||||
CoherentTst_gfx11<<<1, 1, 0, strm>>>(Ptr, peak_clk);
|
||||
} else {
|
||||
CoherentTst<<<1, 1, 0, strm>>>(Ptr, peak_clk);
|
||||
}
|
||||
}
|
||||
// looping until the value is 2 for 3 seconds
|
||||
std::chrono::steady_clock::time_point start =
|
||||
|
||||
@@ -34,6 +34,20 @@ __global__ void waitKernel(int clockRate, int seconds) {
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void waitKernel_gfx11(int clockRate, int seconds) {
|
||||
#if HT_AMD
|
||||
auto start = wall_clock64();
|
||||
auto ms = seconds * 1000;
|
||||
long long waitTill = clockRate * (long long)ms;
|
||||
while (1) {
|
||||
auto end = wall_clock64();
|
||||
if ((end - start) > waitTill) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipEventQuery_DifferentDevice") {
|
||||
hipEvent_t event1{}, event2{};
|
||||
HIP_CHECK(hipEventCreate(&event1));
|
||||
@@ -54,9 +68,10 @@ TEST_CASE("Unit_hipEventQuery_DifferentDevice") {
|
||||
HIP_CHECK(hipSetDevice(0));
|
||||
HIP_CHECK(hipEventRecord(event1, stream));
|
||||
|
||||
auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel;
|
||||
// Start kernel and wait for 3 seconds
|
||||
// Make sure you increase this time if you add more tests here
|
||||
waitKernel<<<1, 1, 0, stream>>>(clockRate, 3);
|
||||
waitKernel_used<<<1, 1, 0, stream>>>(clockRate, 3);
|
||||
|
||||
HIP_CHECK(hipEventRecord(event2, stream));
|
||||
|
||||
|
||||
@@ -67,6 +67,16 @@ static __global__ void sqr_ker_func(int* a, int* b, int clockrate) {
|
||||
do { cur = clock64()/clockrate - start;}while (cur < wait_t);
|
||||
}
|
||||
|
||||
static __global__ void sqr_ker_func_gfx11(int* a, int* b, int clockrate) {
|
||||
#if HT_AMD
|
||||
int tx = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x;
|
||||
if (tx < LEN) b[tx] = a[tx]*a[tx];
|
||||
uint64_t wait_t = DELAY_IN_MS,
|
||||
start = wall_clock64()/clockrate, cur;
|
||||
do { cur = wall_clock64()/clockrate - start;}while (cur < wait_t);
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
* Scenario 1: Test to validate setting different events in executable graph.
|
||||
*/
|
||||
@@ -106,10 +116,15 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") {
|
||||
inp_h, memsize, hipMemcpyHostToDevice));
|
||||
// Get device clock rate
|
||||
int clkRate = 0;
|
||||
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
|
||||
if (IsGfx11()) {
|
||||
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
|
||||
} else {
|
||||
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
|
||||
}
|
||||
// kernel1
|
||||
auto sqr_ker_func_used = IsGfx11() ? sqr_ker_func_gfx11 : sqr_ker_func;
|
||||
void* kernelArgs[] = {&inp_d, &out_d, reinterpret_cast<void *>(&clkRate)};
|
||||
kernelNodeParams1.func = reinterpret_cast<void *>(sqr_ker_func);
|
||||
kernelNodeParams1.func = reinterpret_cast<void *>(sqr_ker_func_used);
|
||||
kernelNodeParams1.gridDim = dim3(GRID_DIM);
|
||||
kernelNodeParams1.blockDim = dim3(BLK_DIM);
|
||||
kernelNodeParams1.sharedMemBytes = 0;
|
||||
|
||||
@@ -123,6 +123,21 @@ __global__ void kernel500ms(float* hostRes, int clkRate) {
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void kernel500ms_gfx11(float* hostRes, int clkRate) {
|
||||
#if HT_AMD
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
hostRes[tid] = tid + 1;
|
||||
__threadfence_system();
|
||||
// expecting that the data is getting flushed to host here!
|
||||
uint64_t start = wall_clock64()/clkRate, cur;
|
||||
if (clkRate > 1) {
|
||||
do { cur = wall_clock64()/clkRate-start;}while (cur < wait_ms);
|
||||
} else {
|
||||
do { cur = wall_clock64()/start;}while (cur < wait_ms);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipMemPoolApi_BasicAlloc") {
|
||||
int mem_pool_support = 0;
|
||||
HIP_CHECK(hipDeviceGetAttribute(&mem_pool_support, hipDeviceAttributeMemoryPoolsSupported, 0));
|
||||
@@ -147,9 +162,14 @@ TEST_CASE("Unit_hipMemPoolApi_BasicAlloc") {
|
||||
int blocks = 1024;
|
||||
int clkRate;
|
||||
hipMemPoolAttr attr;
|
||||
HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
|
||||
if (IsGfx11()) {
|
||||
HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
|
||||
kernel500ms_gfx11<<<32, blocks, 0, stream>>>(B, clkRate);
|
||||
} else {
|
||||
HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
|
||||
|
||||
kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
|
||||
kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
|
||||
}
|
||||
|
||||
HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(B), stream));
|
||||
|
||||
@@ -229,9 +249,14 @@ TEST_CASE("Unit_hipMemPoolApi_BasicTrim") {
|
||||
|
||||
int blocks = 2;
|
||||
int clkRate;
|
||||
HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
|
||||
if (IsGfx11()) {
|
||||
HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
|
||||
kernel500ms_gfx11<<<32, blocks, 0, stream>>>(B, clkRate);
|
||||
} else {
|
||||
HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
|
||||
|
||||
kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
|
||||
kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
|
||||
}
|
||||
|
||||
hipMemPoolAttr attr;
|
||||
attr = hipMemPoolAttrReleaseThreshold;
|
||||
@@ -312,9 +337,15 @@ TEST_CASE("Unit_hipMemPoolApi_BasicReuse") {
|
||||
|
||||
int blocks = 2;
|
||||
int clkRate;
|
||||
HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
|
||||
|
||||
kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
|
||||
if (IsGfx11()) {
|
||||
HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
|
||||
kernel500ms_gfx11<<<32, blocks, 0, stream>>>(A, clkRate);
|
||||
} else {
|
||||
HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
|
||||
|
||||
kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
|
||||
}
|
||||
|
||||
hipMemPoolAttr attr;
|
||||
// Not a real free, since kernel isn't done
|
||||
@@ -329,7 +360,11 @@ TEST_CASE("Unit_hipMemPoolApi_BasicReuse") {
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
|
||||
// Second kernel launch with new memory
|
||||
kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
|
||||
if (IsGfx11()) {
|
||||
kernel500ms_gfx11<<<32, blocks, 0, stream>>>(B, clkRate);
|
||||
} else {
|
||||
kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
|
||||
}
|
||||
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
|
||||
@@ -369,7 +404,11 @@ TEST_CASE("Unit_hipMemPoolApi_Opportunistic") {
|
||||
hipMemPoolAttr attr;
|
||||
int blocks = 2;
|
||||
int clkRate;
|
||||
HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
|
||||
if (IsGfx11()) {
|
||||
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
|
||||
} else {
|
||||
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
|
||||
}
|
||||
|
||||
float *A, *B, *C;
|
||||
hipStream_t stream, stream2;
|
||||
@@ -395,7 +434,11 @@ TEST_CASE("Unit_hipMemPoolApi_Opportunistic") {
|
||||
HIP_CHECK(hipMemPoolSetAttribute(mem_pool, attr, &value));
|
||||
|
||||
// Run kernel for 500 ms in the first stream
|
||||
kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
|
||||
if (IsGfx11()) {
|
||||
kernel500ms_gfx11<<<32, blocks, 0, stream>>>(A, clkRate);
|
||||
} else {
|
||||
kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
|
||||
}
|
||||
|
||||
// Not a real free, since kernel isn't done
|
||||
HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(A), stream));
|
||||
@@ -410,7 +453,11 @@ TEST_CASE("Unit_hipMemPoolApi_Opportunistic") {
|
||||
REQUIRE(A != B);
|
||||
|
||||
// Run kernel with the new memory in the second stream
|
||||
kernel500ms<<<32, blocks, 0, stream2>>>(B, clkRate);
|
||||
if (IsGfx11()) {
|
||||
kernel500ms_gfx11<<<32, blocks, 0, stream>>>(B, clkRate);
|
||||
} else {
|
||||
kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
|
||||
}
|
||||
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
HIP_CHECK(hipStreamSynchronize(stream2));
|
||||
@@ -428,7 +475,13 @@ TEST_CASE("Unit_hipMemPoolApi_Opportunistic") {
|
||||
HIP_CHECK(hipMemPoolSetAttribute(mem_pool, attr, &value));
|
||||
|
||||
// Run kernel for 500 ms in the first stream
|
||||
kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
|
||||
if (IsGfx11()) {
|
||||
HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
|
||||
kernel500ms_gfx11<<<32, blocks, 0, stream>>>(A, clkRate);
|
||||
} else {
|
||||
HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
|
||||
kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
|
||||
}
|
||||
|
||||
// Not a real free, since kernel isn't done
|
||||
HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(A), stream));
|
||||
@@ -443,7 +496,11 @@ TEST_CASE("Unit_hipMemPoolApi_Opportunistic") {
|
||||
REQUIRE(A == B);
|
||||
|
||||
// Run kernel with the new memory in the second stream
|
||||
kernel500ms<<<32, blocks, 0, stream2>>>(B, clkRate);
|
||||
if (IsGfx11()) {
|
||||
kernel500ms_gfx11<<<32, blocks, 0, stream>>>(B, clkRate);
|
||||
} else {
|
||||
kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
|
||||
}
|
||||
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
HIP_CHECK(hipStreamSynchronize(stream2));
|
||||
@@ -461,7 +518,12 @@ TEST_CASE("Unit_hipMemPoolApi_Opportunistic") {
|
||||
HIP_CHECK(hipMemPoolSetAttribute(mem_pool, attr, &value));
|
||||
|
||||
// Run kernel for 500 ms in the first stream
|
||||
kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
|
||||
|
||||
if (IsGfx11()) {
|
||||
kernel500ms_gfx11<<<32, blocks, 0, stream>>>(A, clkRate);
|
||||
} else {
|
||||
kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
|
||||
}
|
||||
|
||||
// Not a real free, since kernel isn't done
|
||||
HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(A), stream));
|
||||
@@ -473,7 +535,11 @@ TEST_CASE("Unit_hipMemPoolApi_Opportunistic") {
|
||||
REQUIRE(A != B);
|
||||
|
||||
// Run kernel with the new memory in the second stream
|
||||
kernel500ms<<<32, blocks, 0, stream2>>>(B, clkRate);
|
||||
if (IsGfx11()) {
|
||||
kernel500ms_gfx11<<<32, blocks, 0, stream>>>(B, clkRate);
|
||||
} else {
|
||||
kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
|
||||
}
|
||||
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
HIP_CHECK(hipStreamSynchronize(stream2));
|
||||
@@ -510,9 +576,15 @@ TEST_CASE("Unit_hipMemPoolApi_Default") {
|
||||
|
||||
int blocks = 2;
|
||||
int clkRate;
|
||||
HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
|
||||
|
||||
if (IsGfx11()) {
|
||||
HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
|
||||
kernel500ms_gfx11<<<32, blocks, 0, stream>>>(A, clkRate);
|
||||
} else {
|
||||
HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
|
||||
|
||||
kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
|
||||
kernel500ms<<<32, blocks, 0, stream>>>(A, clkRate);
|
||||
}
|
||||
|
||||
hipMemPoolAttr attr;
|
||||
// Not a real free, since kernel isn't done
|
||||
@@ -527,7 +599,11 @@ TEST_CASE("Unit_hipMemPoolApi_Default") {
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
|
||||
// Second kernel launch with new memory
|
||||
kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
|
||||
if (IsGfx11()) {
|
||||
kernel500ms_gfx11<<<32, blocks, 0, stream>>>(B, clkRate);
|
||||
} else {
|
||||
kernel500ms<<<32, blocks, 0, stream>>>(B, clkRate);
|
||||
}
|
||||
|
||||
HIP_CHECK(hipFreeAsync(reinterpret_cast<void*>(B), stream));
|
||||
|
||||
|
||||
@@ -41,6 +41,21 @@ __global__ void Kernel(float* hostRes, int clkRate) {
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void Kernel_gfx11(float* hostRes, int clkRate) {
|
||||
#if HT_AMD
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
hostRes[tid] = tid + 1;
|
||||
__threadfence_system();
|
||||
// expecting that the data is getting flushed to host here!
|
||||
uint64_t start = wall_clock64()/clkRate, cur;
|
||||
if (clkRate > 1) {
|
||||
do { cur = wall_clock64()/clkRate-start;}while (cur < wait_sec);
|
||||
} else {
|
||||
do { cur = wall_clock64()/start;}while (cur < wait_sec);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipHostMalloc_CoherentAccess") {
|
||||
int blocks = 2;
|
||||
float* hostRes;
|
||||
@@ -49,9 +64,14 @@ TEST_CASE("Unit_hipHostMalloc_CoherentAccess") {
|
||||
hostRes[0] = 0;
|
||||
hostRes[1] = 0;
|
||||
int clkRate;
|
||||
HIP_CHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
|
||||
if (IsGfx11()) {
|
||||
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
|
||||
} else {
|
||||
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
|
||||
}
|
||||
std::cout << clkRate << std::endl;
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(Kernel), dim3(1), dim3(blocks),
|
||||
auto Kernel_used = IsGfx11() ? Kernel_gfx11 : Kernel;
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(Kernel_used), dim3(1), dim3(blocks),
|
||||
0, 0, hostRes, clkRate);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
int eleCounter = 0;
|
||||
|
||||
@@ -53,6 +53,24 @@ static __global__ void device_function(float* C_d, float* A_d, size_t Num) {
|
||||
}
|
||||
}
|
||||
|
||||
static __global__ void device_function_gfx11(float* C_d, float* A_d, size_t Num) {
|
||||
#if HT_AMD
|
||||
size_t gputhread = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
size_t stride = blockDim.x * gridDim.x;
|
||||
|
||||
for (size_t i = gputhread; i < Num; i += stride) {
|
||||
C_d[i] = A_d[i] * A_d[i];
|
||||
}
|
||||
|
||||
// Delay thread 1 only in the GPU
|
||||
if (gputhread == 1) {
|
||||
uint64_t wait_t = 3200000000, start = wall_clock64(), cur;
|
||||
do {
|
||||
cur = wall_clock64() - start;
|
||||
} while (cur < wait_t);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static void HIPRT_CB Thread1_Callback(hipStream_t stream, hipError_t status,
|
||||
void* userData) {
|
||||
@@ -128,7 +146,8 @@ TEST_CASE("Unit_hipStreamAddCallback_MultipleThreads") {
|
||||
constexpr unsigned threadsPerBlock = 256;
|
||||
constexpr unsigned blocks = (N + 255)/threadsPerBlock;
|
||||
|
||||
hipLaunchKernelGGL((device_function), dim3(blocks),
|
||||
auto device_function_used = IsGfx11() ? device_function_gfx11 : device_function;
|
||||
hipLaunchKernelGGL((device_function_used), dim3(blocks),
|
||||
dim3(threadsPerBlock), 0,
|
||||
mystream, C_d, A_d, N);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
|
||||
@@ -94,6 +94,20 @@ __global__ void waitKernel(int clockRate, int seconds) {
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void waitKernel_gfx11(int clockRate, int seconds) {
|
||||
#if HT_AMD
|
||||
auto start = wall_clock64();
|
||||
auto ms = seconds * 1000;
|
||||
long long waitTill = clockRate * (long long)ms;
|
||||
while (1) {
|
||||
auto end = wall_clock64();
|
||||
if ((end - start) > waitTill) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipStreamWaitEvent_Default") {
|
||||
hipStream_t stream{nullptr};
|
||||
hipEvent_t waitEvent{nullptr};
|
||||
@@ -111,7 +125,8 @@ TEST_CASE("Unit_hipStreamWaitEvent_Default") {
|
||||
HIP_CHECK(hipGetDeviceProperties(&prop, deviceId));
|
||||
auto clockRate = prop.clockRate;
|
||||
|
||||
waitKernel<<<1, 1, 0, stream>>>(clockRate, 2); // Wait for 2 seconds
|
||||
auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel;
|
||||
waitKernel_used<<<1, 1, 0, stream>>>(clockRate, 2); // Wait for 2 seconds
|
||||
|
||||
HIP_CHECK(hipEventRecord(waitEvent, stream));
|
||||
|
||||
@@ -145,8 +160,8 @@ TEST_CASE("Unit_hipStreamWaitEvent_DifferentStreams") {
|
||||
hipDeviceProp_t prop{};
|
||||
HIP_CHECK(hipGetDeviceProperties(&prop, deviceId));
|
||||
auto clockRate = prop.clockRate;
|
||||
|
||||
waitKernel<<<1, 1, 0, blockedStreamA>>>(clockRate,
|
||||
auto waitKernel_used = IsGfx11() ? waitKernel_gfx11 : waitKernel;
|
||||
waitKernel_used<<<1, 1, 0, blockedStreamA>>>(clockRate,
|
||||
3); // wait for 3 seconds
|
||||
HIP_CHECK(hipEventRecord(waitEvent, blockedStreamA));
|
||||
|
||||
@@ -155,7 +170,7 @@ TEST_CASE("Unit_hipStreamWaitEvent_DifferentStreams") {
|
||||
|
||||
HIP_CHECK(hipStreamWaitEvent(streamBlockedOnStreamA, waitEvent, 0));
|
||||
|
||||
waitKernel<<<1, 1, 0, streamBlockedOnStreamA>>>(clockRate, 2); // Wait for 2 seconds
|
||||
waitKernel_used<<<1, 1, 0, streamBlockedOnStreamA>>>(clockRate, 2); // Wait for 2 seconds
|
||||
|
||||
HIP_CHECK(hipStreamSynchronize(unblockingStream));
|
||||
|
||||
|
||||
@@ -95,6 +95,39 @@ __global__ void StreamPerThrd1(int *A, int Pk_Clk) {
|
||||
*A = 1;
|
||||
}
|
||||
|
||||
__global__ void StreamPerThrd_gfx11(int *Ad, int *Ad1, size_t n, int Pk_Clk,
|
||||
int Wait, int WaitEvnt = 0) {
|
||||
#if HT_AMD
|
||||
size_t index = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (index < n) {
|
||||
Ad[index] = Ad[index] + 10;
|
||||
}
|
||||
if (Wait) {
|
||||
int64_t GpuFrq = (Pk_Clk * 1000);
|
||||
int64_t StrtTck = wall_clock64();
|
||||
if (index == 0) {
|
||||
// The following while loop checks the value in ptr for around 4 seconds
|
||||
while ((wall_clock64() - StrtTck) <= (6 * GpuFrq)) {
|
||||
}
|
||||
if (WaitEvnt == 1) {
|
||||
*Ad1 = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
__global__ void StreamPerThrd1_gfx11(int *A, int Pk_Clk) {
|
||||
#if HT_AMD
|
||||
int64_t GpuFrq = (Pk_Clk * 1000);
|
||||
int64_t StrtTck = wall_clock64();
|
||||
// The following while loop checks the value in ptr for around 3-4 seconds
|
||||
while ((wall_clock64() - StrtTck) <= (3 * GpuFrq)) {
|
||||
}
|
||||
*A = 1;
|
||||
#endif
|
||||
}
|
||||
|
||||
__global__ void MiniKernel(int *A) {
|
||||
if (*A == 0) {
|
||||
*A = 2; // Fail condition
|
||||
@@ -189,12 +222,18 @@ static void EventSync() {
|
||||
HIP_CHECK(hipEventCreate(&start));
|
||||
HIP_CHECK(hipEventCreate(&end));
|
||||
HIP_CHECK(hipMemcpy(Ad, Ah, NumElms * sizeof(int), hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
|
||||
dim3 dimBlock(blockSize, 1, 1);
|
||||
dim3 dimGrid((NumElms + blockSize -1)/blockSize, 1, 1);
|
||||
HIP_CHECK(hipEventRecord(start, hipStreamPerThread));
|
||||
StreamPerThrd<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL, NumElms,
|
||||
if (IsGfx11()) {
|
||||
HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeWallClockRate, 0));
|
||||
StreamPerThrd_gfx11<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL, NumElms,
|
||||
peak_clk, 0);
|
||||
} else {
|
||||
HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
|
||||
StreamPerThrd<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL, NumElms,
|
||||
peak_clk, 0);
|
||||
}
|
||||
HIP_CHECK(hipEventRecord(end, hipStreamPerThread));
|
||||
HIP_CHECK(hipEventSynchronize(end));
|
||||
HIP_CHECK(hipMemcpy(Ah, Ad, NumElms * sizeof(int), hipMemcpyDeviceToHost));
|
||||
@@ -226,12 +265,18 @@ TEST_CASE("Unit_hipStreamPerThreadTst_StrmQuery") {
|
||||
Ah[i] = CONST_NUM;
|
||||
}
|
||||
HIP_CHECK(hipMemcpy(Ad, Ah, NumElms * sizeof(int), hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
|
||||
dim3 dimBlock(blockSize, 1, 1);
|
||||
dim3 dimGrid((NumElms + blockSize -1)/blockSize, 1, 1);
|
||||
SECTION("Test working of hipStreamQuery") {
|
||||
StreamPerThrd<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL,
|
||||
if (IsGfx11()) {
|
||||
HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeWallClockRate, 0));
|
||||
StreamPerThrd_gfx11<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL,
|
||||
NumElms, peak_clk, 1);
|
||||
} else {
|
||||
HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
|
||||
StreamPerThrd<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL,
|
||||
NumElms, peak_clk, 1);
|
||||
}
|
||||
err = hipStreamQuery(hipStreamPerThread);
|
||||
if (err != hipErrorNotReady) {
|
||||
WARN("hipStreamQuery on hipStreamPerThread didnt return expected error!");
|
||||
@@ -245,7 +290,11 @@ TEST_CASE("Unit_hipStreamPerThreadTst_StrmQuery") {
|
||||
HIP_CHECK(hipHostMalloc(&Hptr, sizeof(int)));
|
||||
*Hptr = 0;
|
||||
HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&A_d), Hptr, 0));
|
||||
StreamPerThrd1<<<1, 1, 0, hipStreamPerThread>>>(A_d, peak_clk);
|
||||
if (IsGfx11()) {
|
||||
StreamPerThrd1_gfx11<<<1, 1, 0, hipStreamPerThread>>>(A_d, peak_clk);
|
||||
} else {
|
||||
StreamPerThrd1<<<1, 1, 0, hipStreamPerThread>>>(A_d, peak_clk);
|
||||
}
|
||||
HIP_CHECK(hipStreamAddCallback(hipStreamPerThread, CallBackFunctn, A_d, 0));
|
||||
HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));
|
||||
HIP_CHECK(hipHostFree(Hptr));
|
||||
@@ -277,11 +326,17 @@ TEST_CASE("Unit_hipStreamPerThread_MangdMem") {
|
||||
hipStreamPerThread));
|
||||
}
|
||||
int peak_clk;
|
||||
HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
|
||||
dim3 dimBlock(blockSize, 1, 1);
|
||||
dim3 dimGrid((NumElms + blockSize -1)/blockSize, 1, 1);
|
||||
StreamPerThrd<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Hmm, NULL,
|
||||
if (IsGfx11()) {
|
||||
HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeWallClockRate, 0));
|
||||
StreamPerThrd_gfx11<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Hmm, NULL,
|
||||
NumElms, peak_clk, 0);
|
||||
} else {
|
||||
HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
|
||||
StreamPerThrd<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Hmm, NULL,
|
||||
NumElms, peak_clk, 0);
|
||||
}
|
||||
HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));
|
||||
// Validating the result
|
||||
int MisMatch = 0;
|
||||
@@ -313,11 +368,17 @@ TEST_CASE("Unit_hipStreamPerThread_ChildProc") {
|
||||
Ah[i] = CONST_NUM;
|
||||
}
|
||||
HIP_CHECK(hipMemcpy(Ad, Ah, NumElms * sizeof(int), hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
|
||||
dim3 dimBlock(blockSize, 1, 1);
|
||||
dim3 dimGrid((NumElms + blockSize -1)/blockSize, 1, 1);
|
||||
StreamPerThrd<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL,
|
||||
if (IsGfx11()) {
|
||||
HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeWallClockRate, 0));
|
||||
StreamPerThrd_gfx11<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL,
|
||||
NumElms, peak_clk, 0);
|
||||
} else{
|
||||
HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
|
||||
StreamPerThrd<<<dimGrid, dimBlock, 0, hipStreamPerThread>>>(Ad, NULL,
|
||||
NumElms, peak_clk, 0);
|
||||
}
|
||||
HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));
|
||||
HIP_CHECK(hipMemcpy(Ah, Ad, NumElms * sizeof(int), hipMemcpyDeviceToHost));
|
||||
int MisMatch = 0;
|
||||
@@ -380,13 +441,17 @@ TEST_CASE("Unit_hipStreamPerThread_StrmWaitEvt") {
|
||||
HIP_CHECK(hipMalloc(&Ad1, sizeof(int)));
|
||||
HIP_CHECK(hipMemset(Ad1, 0, sizeof(int)));
|
||||
int peak_clk;
|
||||
HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
|
||||
dim3 dimBlock(blockSize, 1, 1);
|
||||
dim3 dimGrid((NumElms + blockSize -1)/blockSize, 1, 1);
|
||||
hipEvent_t e1;
|
||||
HIPCHECK(hipEventCreate(&e1));
|
||||
StreamPerThrd<<<dimGrid, dimBlock, 0, Strm>>>(Ad, Ad1, NumElms,
|
||||
peak_clk, 1, 1);
|
||||
if (IsGfx11()) {
|
||||
HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeWallClockRate, 0));
|
||||
StreamPerThrd_gfx11<<<dimGrid, dimBlock, 0, Strm>>>(Ad, Ad1, NumElms, peak_clk, 1, 1);
|
||||
} else {
|
||||
HIP_CHECK(hipDeviceGetAttribute(&peak_clk, hipDeviceAttributeClockRate, 0));
|
||||
StreamPerThrd<<<dimGrid, dimBlock, 0, Strm>>>(Ad, Ad1, NumElms, peak_clk, 1, 1);
|
||||
}
|
||||
HIP_CHECK(hipEventRecord(e1, Strm));
|
||||
HIP_CHECK(hipStreamWaitEvent(hipStreamPerThread, e1, 0 /*flags*/));
|
||||
MiniKernel<<<1, 1, 0, hipStreamPerThread>>>(Ad1);
|
||||
|
||||
Odkázat v novém úkolu
Zablokovat Uživatele