Files
Giovanni LB e4e1e4857e SWDEV-470113: Fixing hang on navi33
Change-Id: I5c1d30547039a6f23ab5974f43ea63f971573108


[ROCm/rocprofiler commit: 7e199e6b7a]
2024-07-05 20:39:56 -04:00

169 lines
4.4 KiB
C++

#include <hip/hip_runtime.h>
#include <cassert>
#include <vector>
#ifdef NDEBUG
#define HIP_ASSERT(x) x
#else
#define HIP_ASSERT(x) (assert((x) == hipSuccess))
#endif
#define WIDTH 1024
#define HEIGHT 1024
#define NUM (WIDTH * HEIGHT)
#define THREADS_PER_BLOCK_X 16
#define THREADS_PER_BLOCK_Y 16
#define THREADS_PER_BLOCK_Z 1
__device__ int counter = 0;
// empty kernel
__global__ void kernel() {}
// vector add kernel
__global__ void vectoradd_float(float* __restrict__ a, const float* __restrict__ b,
const float* __restrict__ c, int width, int height)
{
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int i = y * width + x;
if (i < (width * height)) {
a[i] = b[i] + c[i];
}
}
__global__ void add(int n, float* x, float* y) {
if(__hip_atomic_load(&counter, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_AGENT) != 0){
abort();
}
__hip_atomic_fetch_add(&counter, 1, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_SYSTEM);
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < n; i += stride) y[i] = x[i] + y[i];
__hip_atomic_fetch_add(&counter, -1, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_SYSTEM);
}
// launches an empty kernel in profiler context
void KernelLaunch() {
// run empty kernel
kernel<<<1, 1>>>();
hipDeviceSynchronize();
}
void LaunchMultiStreamKernels() {
int N = 1 << 4;
float* x = new float[N];
float* y = new float[N];
float* d_x;
float* d_y;
// Allocate Unified Memory -- accessible from CPU or GPU
HIP_ASSERT(hipMallocManaged(&d_x, N * sizeof(float)));
HIP_ASSERT(hipMallocManaged(&d_y, N * sizeof(float)));
// initialize x and y arrays on the host
for (int i = 0; i < N; i++) {
x[i] = 1.0f;
y[i] = 2.0f;
}
std::vector< hipStream_t> hip_streams;
for(int i = 0; i < 100; i++) {
hipStream_t stream;
hipStreamCreate (&stream);
hip_streams.push_back(stream);
}
HIP_ASSERT(hipMemcpy(d_x, x, N * sizeof(float), hipMemcpyHostToDevice));
HIP_ASSERT(hipMemcpy(d_y, y, N * sizeof(float), hipMemcpyHostToDevice));
// Launch kernel on 1M elements on the GPU
int blockSize = 64;
// This Kernel will always be launched with one wave
int numBlocks = 1;
for(int i = 0; i < 100; i++) {
for(int j = 0; j < hip_streams.size(); j++)
hipLaunchKernelGGL(add, numBlocks, blockSize, 0, hip_streams[j], N, d_x, d_y);
HIP_ASSERT(hipDeviceSynchronize());
}
//Wait for GPU to finish before accessing on host
HIP_ASSERT(hipDeviceSynchronize());
HIP_ASSERT(hipMemcpy(x, d_x, N * sizeof(float), hipMemcpyDeviceToHost));
HIP_ASSERT(hipMemcpy(y, d_y, N * sizeof(float), hipMemcpyDeviceToHost));
// Free memory
HIP_ASSERT(hipFree(d_x));
HIP_ASSERT(hipFree(d_y));
delete[] x;
delete[] y;
}
int LaunchVectorAddKernel() {
float* hostA;
float* hostB;
float* hostC;
float* deviceA;
float* deviceB;
float* deviceC;
hipDeviceProp_t devProp;
hipGetDeviceProperties(&devProp, 0);
int i;
int errors;
hostA = (float*)malloc(NUM * sizeof(float));
hostB = (float*)malloc(NUM * sizeof(float));
hostC = (float*)malloc(NUM * sizeof(float));
// initialize the input data
for (i = 0; i < NUM; i++) {
hostB[i] = (float)i;
hostC[i] = (float)i * 100.0f;
}
HIP_ASSERT(hipMalloc((void**)&deviceA, NUM * sizeof(float)));
HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(float)));
HIP_ASSERT(hipMalloc((void**)&deviceC, NUM * sizeof(float)));
HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM * sizeof(float), hipMemcpyHostToDevice));
HIP_ASSERT(hipMemcpy(deviceC, hostC, NUM * sizeof(float), hipMemcpyHostToDevice));
hipLaunchKernelGGL(vectoradd_float,
dim3(WIDTH / THREADS_PER_BLOCK_X, HEIGHT / THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, deviceA, deviceB,
deviceC, WIDTH, HEIGHT);
HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM * sizeof(float), hipMemcpyDeviceToHost));
// verify the results
errors = 0;
for (i = 0; i < NUM; i++) {
if (hostA[i] != (hostB[i] + hostC[i])) {
errors++;
}
}
if (errors != 0) {
printf("FAILED: %d errors\n", errors);
}
HIP_ASSERT(hipFree(deviceA));
HIP_ASSERT(hipFree(deviceB));
HIP_ASSERT(hipFree(deviceC));
free(hostA);
free(hostB);
free(hostC);
// hipResetDefaultAccelerator();
return errors;
}