SWDEV-337452 - Changing Clock64 to WallClock64 in directed tests. (#3140)

Change-Id: I511ab4dcc61daee4fdfbd2a248b5fe74e52174b2
Dieser Commit ist enthalten in:
ROCm CI Service Account
2023-03-21 01:48:32 +05:30
committet von GitHub
Ursprung 3fb0920a55
Commit 0ea181501c
18 geänderte Dateien mit 604 neuen und 58 gelöschten Zeilen
+22 -2
Datei anzeigen
@@ -43,6 +43,20 @@ THE SOFTWARE.
Ad[tid] = clock() + clock64() + __clock() + __clock64() - Ad[tid];
}
static __global__ void kernel1_gfx11(long long* Ad) {
#ifdef __HIP_PLATFORM_AMD__
int tid = threadIdx.x + blockIdx.x * blockDim.x;
Ad[tid] = clock() + wall_clock64() + __clock() + __clock64();
#endif
}
static __global__ void kernel2_gfx11(long long* Ad) {
#ifdef __HIP_PLATFORM_AMD__
int tid = threadIdx.x + blockIdx.x * blockDim.x;
Ad[tid] = clock() + wall_clock64() + __clock() + __clock64() - Ad[tid];
#endif
}
void run() {
long long *A, *Ad;
A = new long long[LEN];
@@ -50,9 +64,15 @@ THE SOFTWARE.
A[i] = 0;
}
auto kernel1_used = IsGfx11() ? kernel1_gfx11 : kernel1;
auto kernel2_used = IsGfx11() ? kernel2_gfx11 : kernel2;
HIP_ASSERT(hipMalloc((void**)&Ad, SIZE));
hipLaunchKernelGGL(kernel1, dim3(1, 1, 1), dim3(LEN, 1, 1), 0, 0, Ad);
hipLaunchKernelGGL(kernel2, dim3(1, 1, 1), dim3(LEN, 1, 1), 0, 0, Ad);
hipLaunchKernelGGL(kernel1_used, dim3(1, 1, 1),
dim3(LEN, 1, 1), 0, 0, Ad);
hipLaunchKernelGGL(kernel2_used, dim3(1, 1, 1),
dim3(LEN, 1, 1), 0, 0, Ad);
HIP_ASSERT(hipMemcpy(A, Ad, SIZE, hipMemcpyDeviceToHost));
for (unsigned i = 0; i < LEN; i++) {
@@ -124,6 +124,13 @@ __global__ void test_kernel(long long *array) {
array[rank] += clock64();
}
__global__ void test_kernel_gfx11(long long *array) {
#ifdef __HIP_PLATFORM_AMD__
unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
array[rank] += wall_clock64();
#endif
}
int main(int argc, char** argv) {
hipError_t err;
int device_num, FailFlag = 0;
@@ -155,6 +162,7 @@ int main(int argc, char** argv) {
int num_sms = device_properties.multiProcessorCount;
int max_num_threads = device_properties.maxThreadsPerBlock;
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
// Check single-thread block, all numbers of warps, then too-large block
for (int block_size = 0; block_size <= (max_num_threads + warp_size);
block_size += warp_size) {
@@ -163,9 +171,8 @@ int main(int argc, char** argv) {
}
int max_blocks_per_sm;
// Calculate the device occupancy to know how many blocks can be run.
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
&max_blocks_per_sm, test_kernel, block_size, 0,
hipOccupancyDefault));
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(&max_blocks_per_sm,
test_kernel_used, block_size, 0, hipOccupancyDefault));
if ((block_size > max_num_threads) && (max_blocks_per_sm != 0)) {
std::cerr << "ERROR! Occupancy API indicated that we can have >0 ";
@@ -212,7 +219,7 @@ int main(int argc, char** argv) {
coop_params[i][0] = reinterpret_cast<void*>(&dev_array[i]);
}
err = hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
err = hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used),
2 * desired_blocks, block_size,
coop_params[0], 0, streams[0]);
@@ -235,9 +242,8 @@ int main(int argc, char** argv) {
}
HIPCHECK(hipDeviceSynchronize());
err = hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
desired_blocks, block_size,
coop_params[1], 0, streams[1]);
err = hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used), desired_blocks,
block_size, coop_params[1], 0, streams[1]);
if (expect_fail) {
expect_to_see = hipErrorInvalidConfiguration;
@@ -106,6 +106,29 @@ __global__ void test_kernel(uint32_t loops, unsigned long long *array, long long
}
}
__global__ void test_kernel_gfx11(uint32_t loops, unsigned long long *array, long long totalTicks) {
#ifdef __HIP_PLATFORM_AMD__
cooperative_groups::thread_block tb = cooperative_groups::this_thread_block();
unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = 0; i < loops; i++) {
long long time_diff = 0;
long long last_clock = wall_clock64();
do {
long long cur_clock = wall_clock64();
if (cur_clock > last_clock) {
time_diff += (cur_clock - last_clock);
}
// If it rolls over, we don't know how much to add to catch up.
// So just ignore those slipped cycles.
last_clock = cur_clock;
} while(time_diff < totalTicks);
tb.sync();
array[rank] += wall_clock64();
}
#endif
}
template<typename T>
bool verifyLeastCapacity(T& single_kernel_time, T& double_kernel_time, T& triple_kernel_time)
{
@@ -256,8 +279,8 @@ int main(int argc, char** argv) {
long long totalTicks = device_properties.clockRate ;
int max_blocks_per_sm = 0;
// Calculate the device occupancy to know how many blocks can be run.
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm,
test_kernel,
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, test_kernel_used,
warp_size, 0));
int max_active_blocks = max_blocks_per_sm * num_sms;
int coop_blocks = 0;
@@ -325,9 +348,10 @@ int main(int argc, char** argv) {
}
// Verify over capacity
HIPCHECK_API(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
HIPCHECK_API(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used),
max_active_blocks + 1, warp_size,
coop_params[0], 0, streams[0]), hipErrorCooperativeLaunchTooLarge);
coop_params[0], 0, streams[0]),
hipErrorCooperativeLaunchTooLarge);
std::cout << "Launching an initial single cooperative kernel..." << std::endl;
// We need exclude the the initial launching as it will need time to load code obj.
@@ -192,6 +192,20 @@ __global__ void second_test_kernel(long long *array) {
array[rank] += clock64();
}
__global__ void test_kernel_gfx11(long long *array) {
#ifdef __HIP_PLATFORM_AMD__
unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
array[rank] += wall_clock64();
#endif
}
__global__ void second_test_kernel_gfx11(long long *array) {
#ifdef __HIP_PLATFORM_AMD__
unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
array[rank] += wall_clock64();
#endif
}
int main(int argc, char** argv) {
hipError_t err;
/*************************************************************************/
@@ -245,8 +259,9 @@ int main(int argc, char** argv) {
int max_blocks_per_sm = INT_MAX;
for (int i = 0; i < 2; i++) {
HIPCHECK(hipSetDevice((dev + i)));
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
&max_blocks_per_sm_arr[i], test_kernel, warp_size, 0));
&max_blocks_per_sm_arr[i], test_kernel_used, warp_size, 0));
if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
max_blocks_per_sm = max_blocks_per_sm_arr[i];
}
@@ -293,7 +308,8 @@ int main(int argc, char** argv) {
for (int i = 0; i < 2; i++) {
dev_params[i][0] = reinterpret_cast<void*>(&bad_dev_array[i]);
md_params[i].func = reinterpret_cast<void*>(test_kernel);
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
md_params[i].func = reinterpret_cast<void*>(test_kernel_used);
md_params[i].gridDim = 2 * desired_blocks;
md_params[i].blockDim = warp_size;
md_params[i].sharedMem = 0;
@@ -370,7 +386,8 @@ int main(int argc, char** argv) {
supports_sep_kernels = false;
}
}
md_params[1].func = reinterpret_cast<void*>(second_test_kernel);
auto second_test_kernel_used = IsGfx11() ? second_test_kernel_gfx11 : second_test_kernel;
md_params[1].func = reinterpret_cast<void*>(second_test_kernel_used);
err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
if ((supports_sep_kernels && err != hipSuccess) ||
(!supports_sep_kernels && err != hipErrorInvalidValue)) {
@@ -405,7 +422,8 @@ int main(int argc, char** argv) {
std::cout << "different grid sizes." << std::endl;
bool supports_sep_sizes = true;
for (int i = 0; i < 2; i++) {
md_params[i].func = reinterpret_cast<void*>(test_kernel);
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
md_params[i].func = reinterpret_cast<void*>(test_kernel_used);
md_params[i].gridDim = i+1;
if (!support_for_separate_grid_sizes((dev + i))) {
supports_sep_sizes = false;
@@ -195,6 +195,55 @@ __global__ void test_kernel(uint32_t loops, unsigned long long *array) {
}
}
__global__ void test_coop_kernel_gfx11(unsigned int loops, long long *array,
int fast_gpu) {
#ifdef __HIP_PLATFORM_AMD__
cooperative_groups::multi_grid_group mgrid =
cooperative_groups::this_multi_grid();
unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
if (mgrid.grid_rank() == fast_gpu) {
return;
}
for (int i = 0; i < loops; i++) {
long long time_diff = 0;
long long last_clock = wall_clock64();
do {
long long cur_clock = wall_clock64();
if (cur_clock > last_clock) {
time_diff += (cur_clock - last_clock);
}
// If it rolls over, we don't know how much to add to catch up.
// So just ignore those slipped cycles.
last_clock = cur_clock;
} while(time_diff < 1000000);
array[rank] += wall_clock64();
}
#endif
}
__global__ void test_kernel_gfx11(uint32_t loops, unsigned long long *array) {
#ifdef __HIP_PLATFORM_AMD__
unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = 0; i < loops; i++) {
long long time_diff = 0;
long long last_clock = wall_clock64();
do {
long long cur_clock = wall_clock64();
if (cur_clock > last_clock) {
time_diff += (cur_clock - last_clock);
}
// If it rolls over, we don't know how much to add to catch up.
// So just ignore those slipped cycles.
last_clock = cur_clock;
} while(time_diff < 1000000);
array[rank] += wall_clock64();
}
#endif
}
int main(int argc, char** argv) {
hipError_t err;
int device_num, FailFlag = 0;
@@ -249,8 +298,9 @@ int main(int argc, char** argv) {
int max_blocks_per_sm = INT_MAX;
for (int i = 0; i < 2; i++) {
HIPCHECK(hipSetDevice(dev + i));
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
&max_blocks_per_sm_arr[i], test_kernel, warp_size, 0));
&max_blocks_per_sm_arr[i], test_kernel_used, warp_size, 0));
if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
max_blocks_per_sm = max_blocks_per_sm_arr[i];
}
@@ -302,11 +352,12 @@ int main(int argc, char** argv) {
std::cout << "GPU " << dev << ": Long Coop Kernel" << std::endl;
std::cout << "GPU " << (dev + 1) << ": Long Coop Kernel" << std::endl;
auto test_coop_kernel_used = IsGfx11() ? test_coop_kernel_gfx11 : test_coop_kernel;
for (int i = 0; i < 2; i++) {
dev_params[i][0] = reinterpret_cast<void*>(&loops);
dev_params[i][1] = reinterpret_cast<void*>(&dev_array[i]);
dev_params[i][2] = reinterpret_cast<void*>(&fast_gpu);
md_params[i].func = reinterpret_cast<void*>(test_coop_kernel);
md_params[i].func = reinterpret_cast<void*>(test_coop_kernel_used);
md_params[i].gridDim = desired_blocks;
md_params[i].blockDim = warp_size;
md_params[i].sharedMem = 0;
@@ -331,12 +382,14 @@ int main(int argc, char** argv) {
fast_gpu = 1;
start_time[1] = std::chrono::system_clock::now();
HIPCHECK(hipSetDevice(dev));
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
streams[0], loops, dev_array[0]);
HIPCHECK(hipGetLastError());
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
HIPCHECK(hipSetDevice(dev + 1));
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
streams[1], loops, dev_array[1]);
HIPCHECK(hipGetLastError());
for (int i = 0; i < 2; i++) {
@@ -356,12 +409,14 @@ int main(int argc, char** argv) {
fast_gpu = 0;
start_time[2] = std::chrono::system_clock::now();
HIPCHECK(hipSetDevice(dev));
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
streams[0], loops, dev_array[0]);
HIPCHECK(hipGetLastError());
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
HIPCHECK(hipSetDevice(dev + 1));
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
streams[1], loops, dev_array[1]);
HIPCHECK(hipGetLastError());
for (int i = 0; i < 2; i++) {
@@ -382,13 +437,15 @@ int main(int argc, char** argv) {
fast_gpu = 0;
start_time[3] = std::chrono::system_clock::now();
HIPCHECK(hipSetDevice(dev));
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
streams[0], loops, dev_array[0]);
HIPCHECK(hipGetLastError());
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2,
hipCooperativeLaunchMultiDeviceNoPreSync));
HIPCHECK(hipSetDevice(dev + 1));
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
streams[1], loops, dev_array[1]);
HIPCHECK(hipGetLastError());
for (int i = 0; i < 2; i++) {
@@ -409,13 +466,15 @@ int main(int argc, char** argv) {
fast_gpu = 1;
start_time[4] = std::chrono::system_clock::now();
HIPCHECK(hipSetDevice(dev));
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
streams[0], loops, dev_array[0]);
HIPCHECK(hipGetLastError());
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2,
hipCooperativeLaunchMultiDeviceNoPostSync));
HIPCHECK(hipSetDevice(dev + 1));
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
streams[1], loops, dev_array[1]);
for (int i = 0; i < 2; i++) {
HIPCHECK(hipSetDevice(dev + i));
@@ -434,14 +493,16 @@ int main(int argc, char** argv) {
std::cout << " Kernel\n";
start_time[5] = std::chrono::system_clock::now();
HIPCHECK(hipSetDevice(dev));
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
streams[0], loops, dev_array[0]);
HIPCHECK(hipGetLastError());
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2,
hipCooperativeLaunchMultiDeviceNoPreSync |
hipCooperativeLaunchMultiDeviceNoPostSync));
HIPCHECK(hipSetDevice(dev + 1));
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
streams[1], loops, dev_array[1]);
HIPCHECK(hipGetLastError());
for (int i = 0; i < 2; i++) {
@@ -204,6 +204,81 @@ test_kernel(unsigned int *atomic_val, unsigned int *global_array,
}
}
__global__ void
test_kernel_gfx11(unsigned int *atomic_val, unsigned int *global_array,
unsigned int *array, uint32_t loops) {
#ifdef __HIP_PLATFORM_AMD__
cooperative_groups::grid_group grid = cooperative_groups::this_grid();
cooperative_groups::multi_grid_group mgrid =
cooperative_groups::this_multi_grid();
unsigned rank = grid.thread_rank();
unsigned global_rank = mgrid.thread_rank();
int offset = blockIdx.x;
for (int i = 0; i < loops; i++) {
// Make the last thread run way behind everyone else.
// If the grid barrier below fails, then the other threads may hit the
// atomicInc instruction many times before the last thread ever gets
// to it.
// As such, without the barrier, the last array entry will eventually
// contain a very large value, defined by however many times the other
// wavefronts make it through this loop.
// If the barrier works, then it will likely contain some number
// near "total number of blocks". It will be the last wavefront to
// reach the atomicInc, but everyone will have only hit the atomic once.
if (rank == (grid.size() - 1)) {
long long time_diff = 0;
long long last_clock = wall_clock64();
do {
long long cur_clock = wall_clock64();
if (cur_clock > last_clock) {
time_diff += (cur_clock - last_clock);
}
// If it rolls over, we don't know how much to add to catch up.
// So just ignore those slipped cycles.
last_clock = cur_clock;
} while(time_diff < 1000000);
}
if (threadIdx.x == 0) {
array[offset] = atomicInc(atomic_val, UINT_MAX);
}
grid.sync();
// Make the last thread in the entire multi-grid run way behind
// everyone else.
// If the mgrid barrier below fails, then the two global_array entries
// will end up being out of sync, because the intermingling of adds
// and multiplies will not be aligned between to the two GPUs.
if (global_rank == (mgrid.size() - 1)) {
long long time_diff = 0;
long long last_clock = wall_clock64();
do {
long long cur_clock = wall_clock64();
if (cur_clock > last_clock) {
time_diff += (cur_clock - last_clock);
}
// If it rolls over, we don't know how much to add to catch up.
// So just ignore those slipped cycles.
last_clock = cur_clock;
} while(time_diff < 1000000);
}
// During even iterations, add into your own array entry
// During odd iterations, add into your partner's array entry
unsigned grid_rank = mgrid.grid_rank();
unsigned inter_gpu_offset = (grid_rank + i) % mgrid.num_grids();
if (rank == (grid.size() - 1)) {
if (i % mgrid.num_grids() == 0) {
global_array[grid_rank] += 2;
} else {
global_array[inter_gpu_offset] *= 2;
}
}
mgrid.sync();
offset += gridDim.x;
}
#endif
}
int main(int argc, char** argv) {
hipError_t err;
int num_devices = 0;
@@ -265,8 +340,9 @@ int main(int argc, char** argv) {
int max_blocks_per_sm = INT_MAX;
for (int i = 0; i < num_devices; i++) {
HIPCHECK(hipSetDevice(device_num[i]));
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
&max_blocks_per_sm_arr[i], test_kernel, num_threads_in_block, 0));
&max_blocks_per_sm_arr[i], test_kernel_used, num_threads_in_block, 0));
if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
max_blocks_per_sm = max_blocks_per_sm_arr[i];
}
@@ -320,12 +396,13 @@ int main(int argc, char** argv) {
void *dev_params[num_devices][4];
hipLaunchParams md_params[num_devices];
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
for (int i = 0; i < num_devices; i++) {
dev_params[i][0] = reinterpret_cast<void*>(&kernel_atomic[i]);
dev_params[i][1] = reinterpret_cast<void*>(&global_array);
dev_params[i][2] = reinterpret_cast<void*>(&kernel_buffer[i]);
dev_params[i][3] = reinterpret_cast<void*>(&loops);
md_params[i].func = reinterpret_cast<void*>(test_kernel);
md_params[i].func = reinterpret_cast<void*>(test_kernel_used);
md_params[i].gridDim = requested_blocks;
md_params[i].blockDim = num_threads_in_block;
md_params[i].sharedMem = 0;
@@ -131,6 +131,48 @@ test_kernel(unsigned int *atomic_val, unsigned int *array,
}
}
__global__ void
test_kernel_gfx11(unsigned int *atomic_val, unsigned int *array,
unsigned int loops) {
#ifdef __HIP_PLATFORM_AMD__
cooperative_groups::grid_group grid = cooperative_groups::this_grid();
unsigned rank = grid.thread_rank();
int offset = blockIdx.x;
for (int i = 0; i < loops; i++) {
// Make the last thread run way behind everyone else.
// If the barrier below fails, then the other threads may hit the
// atomicInc instruction many times before the last thread ever gets
// to it.
// As such, without the barrier, the last array entry will eventually
// contain a very large value, defined by however many times the other
// wavefronts make it through this loop.
// If the barrier works, then it will likely contain some number
// near "total number of blocks". It will be the last wavefront to
// reach the atomicInc, but everyone will have only hit the atomic once.
if (rank == (grid.size() - 1)) {
long long time_diff = 0;
long long last_clock = wall_clock64();
do {
long long cur_clock = wall_clock64();
if (cur_clock > last_clock) {
time_diff += (cur_clock - last_clock);
}
// If it rolls over, we don't know how much to add to catch up.
// So just ignore those slipped cycles.
last_clock = cur_clock;
} while(time_diff < 1000000);
}
if (threadIdx.x == 0) {
array[offset] = atomicInc(&atomic_val[0], UINT_MAX);
}
grid.sync();
offset += gridDim.x;
}
#endif
}
int main(int argc, char** argv) {
hipError_t err;
int device_num;
@@ -167,9 +209,10 @@ int main(int argc, char** argv) {
int num_threads_in_block = block_size * warp_size;
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
// Calculate the device occupancy to know how many blocks can be run.
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm,
test_kernel, num_threads_in_block, 0));
test_kernel_used, num_threads_in_block, 0));
int requested_blocks = warps / block_size;
if (requested_blocks > max_blocks_per_sm * num_sms) {
@@ -211,7 +254,8 @@ int main(int argc, char** argv) {
params[0] = reinterpret_cast<void*>(&kernel_atomic);
params[1] = reinterpret_cast<void*>(&kernel_buffer);
params[2] = reinterpret_cast<void*>(&loops);
HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used),
requested_blocks,
num_threads_in_block, params, 0, NULL));
@@ -204,6 +204,81 @@ test_kernel(unsigned int *atomic_val, unsigned int *global_array,
}
}
__global__ void
test_kernel_gfx11(unsigned int *atomic_val, unsigned int *global_array,
unsigned int *array, uint32_t loops) {
#ifdef __HIP_PLATFORM_AMD__
cooperative_groups::grid_group grid = cooperative_groups::this_grid();
cooperative_groups::multi_grid_group mgrid =
cooperative_groups::this_multi_grid();
unsigned rank = grid.thread_rank();
unsigned global_rank = mgrid.thread_rank();
int offset = blockIdx.x;
for (int i = 0; i < loops; i++) {
// Make the last thread run way behind everyone else.
// If the grid barrier below fails, then the other threads may hit the
// atomicInc instruction many times before the last thread ever gets
// to it.
// As such, without the barrier, the last array entry will eventually
// contain a very large value, defined by however many times the other
// wavefronts make it through this loop.
// If the barrier works, then it will likely contain some number
// near "total number of blocks". It will be the last wavefront to
// reach the atomicInc, but everyone will have only hit the atomic once.
if (rank == (grid.size() - 1)) {
long long time_diff = 0;
long long last_clock = wall_clock64();
do {
long long cur_clock = wall_clock64();
if (cur_clock > last_clock) {
time_diff += (cur_clock - last_clock);
}
// If it rolls over, we don't know how much to add to catch up.
// So just ignore those slipped cycles.
last_clock = cur_clock;
} while(time_diff < 1000000);
}
if (threadIdx.x == 0) {
array[offset] = atomicInc(atomic_val, UINT_MAX);
}
grid.sync();
// Make the last thread in the entire multi-grid run way behind
// everyone else.
// If the mgrid barrier below fails, then the two global_array entries
// will end up being out of sync, because the intermingling of adds
// and multiplies will not be aligned between to the two GPUs.
if (global_rank == (mgrid.size() - 1)) {
long long time_diff = 0;
long long last_clock = wall_clock64();
do {
long long cur_clock = wall_clock64();
if (cur_clock > last_clock) {
time_diff += (cur_clock - last_clock);
}
// If it rolls over, we don't know how much to add to catch up.
// So just ignore those slipped cycles.
last_clock = cur_clock;
} while(time_diff < 1000000);
}
// During even iterations, add into your own array entry
// During odd iterations, add into your partner's array entry
unsigned grid_rank = mgrid.grid_rank();
unsigned inter_gpu_offset = (grid_rank + i) % mgrid.num_grids();
if (rank == (grid.size() - 1)) {
if (i % mgrid.num_grids() == 0) {
global_array[grid_rank] += 2;
} else {
global_array[inter_gpu_offset] *= 2;
}
}
mgrid.sync();
offset += gridDim.x;
}
#endif
}
int main(int argc, char** argv) {
hipError_t err;
int device_num = 0, flag = 0;
@@ -263,8 +338,9 @@ int main(int argc, char** argv) {
int max_blocks_per_sm = INT_MAX;
for (int i = 0; i < 2; i++) {
HIPCHECK(hipSetDevice((d + i)));
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
&max_blocks_per_sm_arr[i], test_kernel, num_threads_in_block,
&max_blocks_per_sm_arr[i], test_kernel_used, num_threads_in_block,
0));
if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
max_blocks_per_sm = max_blocks_per_sm_arr[i];
@@ -319,11 +395,12 @@ int main(int argc, char** argv) {
void *dev_params[2][4];
hipLaunchParams md_params[2];
for (int i = 0; i < 2; i++) {
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
dev_params[i][0] = reinterpret_cast<void*>(&kernel_atomic[i]);
dev_params[i][1] = reinterpret_cast<void*>(&global_array);
dev_params[i][2] = reinterpret_cast<void*>(&kernel_buffer[i]);
dev_params[i][3] = reinterpret_cast<void*>(&loops);
md_params[i].func = reinterpret_cast<void*>(test_kernel);
md_params[i].func = reinterpret_cast<void*>(test_kernel_used);
md_params[i].gridDim = requested_blocks;
md_params[i].blockDim = num_threads_in_block;
md_params[i].sharedMem = 0;
@@ -44,10 +44,20 @@ extern "C" __global__ void WaitKernel(int *Ad, int clockrate) {
*Ad = 1;
}
extern "C" __global__ void WaitKernel_gfx11(int *Ad, int clockrate) {
#ifdef __HIP_PLATFORM_AMD__
uint64_t wait_t = 500,
start = wall_clock64()/clockrate, cycles;
do { cycles = wall_clock64()/clockrate-start;} while (cycles < wait_t);
*Ad = 1;
#endif
}
void t1(hipEvent_t start, hipStream_t stream1, int clkRate, int *A, int *Ad) {
*A = 0;
hipLaunchKernelGGL(HIP_KERNEL_NAME(WaitKernel), dim3(1), dim3(1), 0, stream1, Ad, clkRate);
auto WaitKernel_used = IsGfx11() ? WaitKernel_gfx11 : WaitKernel;
hipLaunchKernelGGL(HIP_KERNEL_NAME(WaitKernel_used), dim3(1), dim3(1), 0, stream1, Ad, clkRate);
HIPCHECK(hipEventRecord(start, stream1));
@@ -69,7 +79,11 @@ int main(int argc, char* argv[]) {
HIPCHECK(hipHostGetDevicePointer((void**)&Ad[i], A[i], 0));
}
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
if (IsGfx11()) {
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
} else {
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
}
hipStream_t stream1;
hipStreamCreate(&stream1);
hipEvent_t start;
@@ -62,6 +62,34 @@ __global__ void FourSecKernel(int clockrate) {
}
}
__global__ void TwoSecKernel_gfx11(int clockrate) {
#ifdef __HIP_PLATFORM_AMD__
if (globalvar == 0x2222) {
globalvar = 0x3333;
}
uint64_t wait_t = 2000,
start = wall_clock64()/clockrate, cur;
do { cur = (wall_clock64()/clockrate)-start;}while (cur < wait_t);
if (globalvar != 0x3333) {
globalvar = 0x5555;
}
#endif
}
__global__ void FourSecKernel_gfx11(int clockrate) {
#ifdef __HIP_PLATFORM_AMD__
if (globalvar == 1) {
globalvar = 0x2222;
}
uint64_t wait_t = 4000,
start = wall_clock64()/clockrate, cur;
do { cur = (wall_clock64()/clockrate)-start;}while (cur < wait_t);
if (globalvar == 0x2222) {
globalvar = 0x4444;
}
#endif
}
/*
* In this Scenario, we create events by disabling the timing flag
* We then Launch the kernel using hipExtModuleLaunchKernel by passing
@@ -76,13 +104,20 @@ bool DisableTimeFlag() {
float time_4sec, time_2sec;
hipEvent_t start_event1, end_event1;
int clkRate = 0;
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
if (IsGfx11()) {
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
} else {
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
}
HIPCHECK(hipEventCreateWithFlags(&start_event1,
hipEventDisableTiming));
HIPCHECK(hipEventCreateWithFlags(&end_event1,
hipEventDisableTiming));
HIPCHECK(hipStreamCreate(&stream1));
hipExtLaunchKernelGGL((TwoSecKernel), dim3(1), dim3(1), 0,
auto TwoSecKernel_used = IsGfx11() ? TwoSecKernel_gfx11 : TwoSecKernel;
hipExtLaunchKernelGGL((TwoSecKernel_used), dim3(1), dim3(1), 0,
stream1, start_event1, end_event1, 0, clkRate);
HIPCHECK(hipStreamSynchronize(stream1));
e = hipEventElapsedTime(&time_2sec, start_event1, end_event1);
@@ -113,11 +148,20 @@ bool ConcurencyCheck_GlobalVar(int conc_flag) {
int deviceGlobal_h = 0;
HIPCHECK(hipSetDevice(0));
int clkRate = 0;
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
if (IsGfx11()) {
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
} else {
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
}
HIPCHECK(hipStreamCreate(&stream1));
hipExtLaunchKernelGGL((FourSecKernel), dim3(1), dim3(1), 0,
auto TwoSecKernel_used = IsGfx11() ? TwoSecKernel_gfx11 : TwoSecKernel;
auto FourSecKernel_used = IsGfx11() ? FourSecKernel_gfx11 : FourSecKernel;
hipExtLaunchKernelGGL((FourSecKernel_used), dim3(1), dim3(1), 0,
stream1, nullptr, nullptr, conc_flag, clkRate);
hipExtLaunchKernelGGL((TwoSecKernel), dim3(1), dim3(1), 0,
hipExtLaunchKernelGGL((TwoSecKernel_used), dim3(1), dim3(1), 0,
stream1, nullptr, nullptr, conc_flag, clkRate);
HIPCHECK(hipStreamSynchronize(stream1));
HIPCHECK(hipMemcpyFromSymbol(&deviceGlobal_h, globalvar,
@@ -148,16 +192,24 @@ bool KernelTimeExecution() {
hipEvent_t start_event1, end_event1, start_event2, end_event2;
float time_4sec, time_2sec;
int clkRate = 0;
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
if (IsGfx11()) {
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
} else {
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
}
auto TwoSecKernel_used = IsGfx11() ? TwoSecKernel_gfx11 : TwoSecKernel;
auto FourSecKernel_used = IsGfx11() ? FourSecKernel_gfx11 : FourSecKernel;
HIPCHECK(hipEventCreate(&start_event1));
HIPCHECK(hipEventCreate(&end_event1));
HIPCHECK(hipEventCreate(&start_event2));
HIPCHECK(hipEventCreate(&end_event2));
HIPCHECK(hipStreamCreate(&stream1));
hipExtLaunchKernelGGL((FourSecKernel), dim3(1), dim3(1), 0,
hipExtLaunchKernelGGL((FourSecKernel_used), dim3(1), dim3(1), 0,
stream1, start_event1, end_event1, 0, clkRate);
hipExtLaunchKernelGGL((TwoSecKernel), dim3(1), dim3(1), 0,
hipExtLaunchKernelGGL((TwoSecKernel_used), dim3(1), dim3(1), 0,
stream1, start_event2, end_event2, 0, clkRate);
HIPCHECK(hipStreamSynchronize(stream1));
e = hipEventElapsedTime(&time_4sec, start_event1, end_event1);
@@ -49,10 +49,7 @@
#define fileName "matmul.code"
#define matmulK "matmulK"
#define SixteenSec "SixteenSecKernel"
#define KernelandExtra "KernelandExtraParams"
#define FourSec "FourSecKernel"
#define TwoSec "TwoSecKernel"
#define globalDevVar "deviceGlobal"
#define dummyKernel "dummyKernel"
#define FOURSEC_KERNEL 4999
@@ -124,7 +121,12 @@ void ModuleLaunchKernel::AllocateMemory() {
HIPCHECK(hipMemcpy(Ad, A, SIZE*sizeof(int), hipMemcpyHostToDevice));
HIPCHECK(hipMemcpy(Bd, B, SIZE*sizeof(int), hipMemcpyHostToDevice));
int clkRate = 0;
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
if (IsGfx11()) {
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
} else {
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
}
args1._Ad = Ad;
args1._Bd = Bd;
args1._Cd = C;
@@ -149,13 +151,21 @@ void ModuleLaunchKernel::AllocateMemory() {
}
void ModuleLaunchKernel::ModuleLoad() {
std::string TwoSecStr = IsGfx11() ? std::string("TwoSecKernel_gfx11")
: std::string("TwoSecKernel");
std::string FourSecStr = IsGfx11() ? std::string("FourSecKernel_gfx11")
: std::string("FourSecKernel");
std::string SixteenSecStr = IsGfx11() ? std::string("SixteenSecKernel_gfx11")
: std::string("SixteenSecKernel");
HIPCHECK(hipModuleLoad(&Module, fileName));
HIPCHECK(hipModuleGetFunction(&MultKernel, Module, matmulK));
HIPCHECK(hipModuleGetFunction(&SixteenSecKernel, Module, SixteenSec));
HIPCHECK(hipModuleGetFunction(&SixteenSecKernel, Module, SixteenSecStr.c_str()));
HIPCHECK(hipModuleGetFunction(&KernelandExtraParamKernel,
Module, KernelandExtra));
HIPCHECK(hipModuleGetFunction(&FourSecKernel, Module, FourSec));
HIPCHECK(hipModuleGetFunction(&TwoSecKernel, Module, TwoSec));
HIPCHECK(hipModuleGetFunction(&FourSecKernel, Module, FourSecStr.c_str()));
HIPCHECK(hipModuleGetFunction(&TwoSecKernel, Module, TwoSecStr.c_str()));
HIPCHECK(hipModuleGetFunction(&DummyKernel, Module, dummyKernel));
HIPCHECK(hipModuleGetGlobal(&deviceGlobal, &deviceGlobalSize,
Module, globalDevVar));
+36
Datei anzeigen
@@ -79,5 +79,41 @@ extern "C" __global__ void FourSecKernel(int clockrate) {
}
}
extern "C" __global__ void SixteenSecKernel_gfx11(int clockrate) {
#ifdef __HIP_PLATFORM_AMD__
uint64_t wait_t = 16000,
start = wall_clock64()/clockrate, cur;
do { cur = wall_clock64()/clockrate-start;}while (cur < wait_t);
#endif
}
extern "C" __global__ void TwoSecKernel_gfx11(int clockrate) {
#ifdef __HIP_PLATFORM_AMD__
if (deviceGlobal == 0x2222) {
deviceGlobal = 0x3333;
}
uint64_t wait_t = 2000,
start = wall_clock64()/clockrate, cur;
do { cur = wall_clock64()/clockrate-start;}while (cur < wait_t);
if (deviceGlobal != 0x3333) {
deviceGlobal = 0x5555;
}
#endif
}
extern "C" __global__ void FourSecKernel_gfx11(int clockrate) {
#ifdef __HIP_PLATFORM_AMD__
if (deviceGlobal == 1) {
deviceGlobal = 0x2222;
}
uint64_t wait_t = 4000,
start = wall_clock64()/clockrate, cur;
do { cur = wall_clock64()/clockrate-start;}while (cur < wait_t);
if (deviceGlobal == 0x2222) {
deviceGlobal = 0x4444;
}
#endif
}
extern "C" __global__ void dummyKernel() {
}
@@ -64,6 +64,25 @@ __global__ void vector_square(float* C_d, float* A_d, size_t Num) {
}
}
__global__ void vector_square_gfx11(float* C_d, float* A_d, size_t Num) {
#ifdef __HIP_PLATFORM_AMD__
size_t gputhread = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x;
for (size_t i = gputhread; i < Num; i += stride) {
C_d[i] = A_d[i] * A_d[i];
}
// Delay thread 1 only in the GPU
if (gputhread == 1) {
unsigned long long int wait_t = 3200000000, start = wall_clock64(), cur;
do {
cur = wall_clock64() - start;
} while (cur < wait_t);
}
#endif
}
float *A_h, *C_h, *A_h1, *C_h1;
static void HIPRT_CB Callback_Stream1(hipStream_t stream, hipError_t status,
@@ -129,8 +148,9 @@ int main(int argc, char* argv[]) {
int *ptr = NULL;
int *ptr1 = NULL;
// Queing jobs in both mystream1/2 followed by hipStreamAddCallback
auto vector_square_used = IsGfx11() ? vector_square_gfx11 : vector_square;
for (int i = 1; i < 5; ++i) {
hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock),
hipLaunchKernelGGL((vector_square_used), dim3(blocks), dim3(threadsPerBlock),
0, mystream1, C_d, A_d, Num);
HIPCHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost,
mystream1));
@@ -139,7 +159,7 @@ int main(int argc, char* argv[]) {
HIPCHECK(hipStreamAddCallback(mystream1, Callback_Stream1,
reinterpret_cast<void*>(ptr), 0));
hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock),
hipLaunchKernelGGL((vector_square_used), dim3(blocks), dim3(threadsPerBlock),
0, mystream2, C_d, A_d, Num);
HIPCHECK(hipMemcpyAsync(C_h1, C_d, Nbytes,
hipMemcpyDeviceToHost, mystream2));
@@ -64,6 +64,25 @@ __global__ void vector_square(float* C_d, float* A_d, size_t Num) {
}
}
__global__ void vector_square_gfx11(float* C_d, float* A_d, size_t Num) {
#ifdef __HIP_PLATFORM_AMD__
size_t gputhread = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x;
for (size_t i = gputhread; i < Num; i += stride) {
C_d[i] = A_d[i] * A_d[i];
}
// Delay thread 1 only in the GPU
if (gputhread == 1) {
unsigned long long int wait_t = 3200000000, start = wall_clock64(), cur;
do {
cur = wall_clock64() - start;
} while (cur < wait_t);
}
#endif
}
static void HIPRT_CB Stream_Callback(hipStream_t stream, hipError_t status,
void* userData) {
for (size_t i = 0; i < Num; i++) {
@@ -100,7 +119,8 @@ int main(int argc, char* argv[]) {
const unsigned threadsPerBlock = 256;
const unsigned blocks = (Num+255)/threadsPerBlock;
hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock), 0,
auto vector_square_used = IsGfx11() ? vector_square_gfx11 : vector_square;
hipLaunchKernelGGL((vector_square_used), dim3(blocks), dim3(threadsPerBlock), 0,
mystream, C_d, A_d, Num);
HIPCHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, mystream));
@@ -64,6 +64,24 @@ __global__ void vector_square(float* C_d, float* A_d, size_t Num) {
}
}
__global__ void vector_square_gfx11(float* C_d, float* A_d, size_t Num) {
#ifdef __HIP_PLATFORM_AMD__
size_t gputhread = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x;
for (size_t i = gputhread; i < Num; i += stride) {
C_d[i] = A_d[i] * A_d[i];
}
// Delay thread 1 only in the GPU
if (gputhread == 1) {
unsigned long long int wait_t = 3200000000, start = wall_clock64(), cur;
do {
cur = wall_clock64() - start;
} while (cur < wait_t);
}
#endif
}
static void HIPRT_CB Thread1_Callback(hipStream_t stream, hipError_t status,
void* userData) {
@@ -124,7 +142,8 @@ int main(int argc, char* argv[]) {
const unsigned threadsPerBlock = 256;
const unsigned blocks = (Num+255)/threadsPerBlock;
hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock), 0,
auto vector_square_used = IsGfx11() ? vector_square_gfx11 : vector_square;
hipLaunchKernelGGL((vector_square_used), dim3(blocks), dim3(threadsPerBlock), 0,
mystream, C_d, A_d, Num);
HIPCHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, mystream));
@@ -61,6 +61,25 @@ __global__ void vector_square(float* C_d, float* A_d, size_t N_elmts) {
}
}
__global__ void vector_square_gfx11(float* C_d, float* A_d, size_t N_elmts) {
#ifdef __HIP_PLATFORM_AMD__
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x;
for (size_t i = offset; i < N_elmts; i += stride) {
C_d[i] = A_d[i] * A_d[i];
}
// Delay the thread 1
if (offset == 1) {
unsigned long long int wait_t = 3200000000, start = wall_clock64(), cur;
do {
cur = wall_clock64() - start;
} while (cur < wait_t);
}
#endif
}
float *A_h, *C_h;
static void HIPRT_CB Callback1(hipStream_t stream, hipError_t status,
@@ -102,7 +121,8 @@ int main(int argc, char* argv[]) {
const unsigned threadsPerBlock = 256;
const unsigned blocks = (N_elmts + 255)/threadsPerBlock;
hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock), 0,
auto vector_square_used = IsGfx11() ? vector_square_gfx11 : vector_square;
hipLaunchKernelGGL((vector_square_used), dim3(blocks), dim3(threadsPerBlock), 0,
mystream, C_d, A_d, N_elmts);
HIPCHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, mystream));
HIPCHECK(hipStreamAddCallback(mystream, Callback1, NULL, 0));
+26 -1
Datei anzeigen
@@ -92,6 +92,32 @@ size_t getHostThreadCount(const size_t memPerThread, const size_t maxThreads) {
return thread_count;
}
// Function to determine if the device is of gfx11 architecture
bool IsGfx11() {
#if defined(__HIP_PLATFORM_NVIDIA__)
return false;
#elif defined(__HIP_PLATFORM_AMD__)
int device = -1;
hipDeviceProp_t props{};
HIPCHECK(hipGetDevice(&device));
HIPCHECK(hipGetDeviceProperties(&props, device));
// Get GCN Arch Name and compare to check if it is gfx11
std::string arch = std::string(props.gcnArchName);
auto pos = arch.find(":");
if (pos != std::string::npos)
arch = arch.substr(0, pos);
if(arch.size() >= 5)
arch = arch.substr(0,5);
return (arch == std::string("gfx11")) ? true : false;
#else
std::cout<<"Have to be either Nvidia or AMD platform, asserting"<<std::endl;
assert(false);
#endif
}
namespace HipTest {
@@ -232,5 +258,4 @@ unsigned setNumBlocks(unsigned blocksPerCU, unsigned threadsPerBlock, size_t N)
return blocks;
}
} // namespace HipTest
+3
Datei anzeigen
@@ -581,3 +581,6 @@ inline bool isImageSupported() {
if (!HipTest::isImageSupported()) \
{ printf("Texture is not support on the device. Skipped.\n"); passed(); }
#endif //__cplusplus
// Function to determine if the device is of gfx11 architecture
bool IsGfx11();