SWDEV-337452 - Changing Clock64 to WallClock64 in directed tests. (#3140)
Change-Id: I511ab4dcc61daee4fdfbd2a248b5fe74e52174b2
Dieser Commit ist enthalten in:
committet von
GitHub
Ursprung
3fb0920a55
Commit
0ea181501c
@@ -43,6 +43,20 @@ THE SOFTWARE.
|
||||
Ad[tid] = clock() + clock64() + __clock() + __clock64() - Ad[tid];
|
||||
}
|
||||
|
||||
static __global__ void kernel1_gfx11(long long* Ad) {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
Ad[tid] = clock() + wall_clock64() + __clock() + __clock64();
|
||||
#endif
|
||||
}
|
||||
|
||||
static __global__ void kernel2_gfx11(long long* Ad) {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
Ad[tid] = clock() + wall_clock64() + __clock() + __clock64() - Ad[tid];
|
||||
#endif
|
||||
}
|
||||
|
||||
void run() {
|
||||
long long *A, *Ad;
|
||||
A = new long long[LEN];
|
||||
@@ -50,9 +64,15 @@ THE SOFTWARE.
|
||||
A[i] = 0;
|
||||
}
|
||||
|
||||
auto kernel1_used = IsGfx11() ? kernel1_gfx11 : kernel1;
|
||||
auto kernel2_used = IsGfx11() ? kernel2_gfx11 : kernel2;
|
||||
|
||||
HIP_ASSERT(hipMalloc((void**)&Ad, SIZE));
|
||||
hipLaunchKernelGGL(kernel1, dim3(1, 1, 1), dim3(LEN, 1, 1), 0, 0, Ad);
|
||||
hipLaunchKernelGGL(kernel2, dim3(1, 1, 1), dim3(LEN, 1, 1), 0, 0, Ad);
|
||||
|
||||
hipLaunchKernelGGL(kernel1_used, dim3(1, 1, 1),
|
||||
dim3(LEN, 1, 1), 0, 0, Ad);
|
||||
hipLaunchKernelGGL(kernel2_used, dim3(1, 1, 1),
|
||||
dim3(LEN, 1, 1), 0, 0, Ad);
|
||||
HIP_ASSERT(hipMemcpy(A, Ad, SIZE, hipMemcpyDeviceToHost));
|
||||
|
||||
for (unsigned i = 0; i < LEN; i++) {
|
||||
|
||||
@@ -124,6 +124,13 @@ __global__ void test_kernel(long long *array) {
|
||||
array[rank] += clock64();
|
||||
}
|
||||
|
||||
__global__ void test_kernel_gfx11(long long *array) {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
array[rank] += wall_clock64();
|
||||
#endif
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
hipError_t err;
|
||||
int device_num, FailFlag = 0;
|
||||
@@ -155,6 +162,7 @@ int main(int argc, char** argv) {
|
||||
int num_sms = device_properties.multiProcessorCount;
|
||||
int max_num_threads = device_properties.maxThreadsPerBlock;
|
||||
|
||||
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
// Check single-thread block, all numbers of warps, then too-large block
|
||||
for (int block_size = 0; block_size <= (max_num_threads + warp_size);
|
||||
block_size += warp_size) {
|
||||
@@ -163,9 +171,8 @@ int main(int argc, char** argv) {
|
||||
}
|
||||
int max_blocks_per_sm;
|
||||
// Calculate the device occupancy to know how many blocks can be run.
|
||||
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
|
||||
&max_blocks_per_sm, test_kernel, block_size, 0,
|
||||
hipOccupancyDefault));
|
||||
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(&max_blocks_per_sm,
|
||||
test_kernel_used, block_size, 0, hipOccupancyDefault));
|
||||
|
||||
if ((block_size > max_num_threads) && (max_blocks_per_sm != 0)) {
|
||||
std::cerr << "ERROR! Occupancy API indicated that we can have >0 ";
|
||||
@@ -212,7 +219,7 @@ int main(int argc, char** argv) {
|
||||
coop_params[i][0] = reinterpret_cast<void*>(&dev_array[i]);
|
||||
}
|
||||
|
||||
err = hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
|
||||
err = hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used),
|
||||
2 * desired_blocks, block_size,
|
||||
coop_params[0], 0, streams[0]);
|
||||
|
||||
@@ -235,9 +242,8 @@ int main(int argc, char** argv) {
|
||||
}
|
||||
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
err = hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
|
||||
desired_blocks, block_size,
|
||||
coop_params[1], 0, streams[1]);
|
||||
err = hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used), desired_blocks,
|
||||
block_size, coop_params[1], 0, streams[1]);
|
||||
|
||||
if (expect_fail) {
|
||||
expect_to_see = hipErrorInvalidConfiguration;
|
||||
|
||||
@@ -106,6 +106,29 @@ __global__ void test_kernel(uint32_t loops, unsigned long long *array, long long
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void test_kernel_gfx11(uint32_t loops, unsigned long long *array, long long totalTicks) {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
cooperative_groups::thread_block tb = cooperative_groups::this_thread_block();
|
||||
unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
for (int i = 0; i < loops; i++) {
|
||||
long long time_diff = 0;
|
||||
long long last_clock = wall_clock64();
|
||||
do {
|
||||
long long cur_clock = wall_clock64();
|
||||
if (cur_clock > last_clock) {
|
||||
time_diff += (cur_clock - last_clock);
|
||||
}
|
||||
// If it rolls over, we don't know how much to add to catch up.
|
||||
// So just ignore those slipped cycles.
|
||||
last_clock = cur_clock;
|
||||
} while(time_diff < totalTicks);
|
||||
tb.sync();
|
||||
array[rank] += wall_clock64();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
bool verifyLeastCapacity(T& single_kernel_time, T& double_kernel_time, T& triple_kernel_time)
|
||||
{
|
||||
@@ -256,8 +279,8 @@ int main(int argc, char** argv) {
|
||||
long long totalTicks = device_properties.clockRate ;
|
||||
int max_blocks_per_sm = 0;
|
||||
// Calculate the device occupancy to know how many blocks can be run.
|
||||
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm,
|
||||
test_kernel,
|
||||
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, test_kernel_used,
|
||||
warp_size, 0));
|
||||
int max_active_blocks = max_blocks_per_sm * num_sms;
|
||||
int coop_blocks = 0;
|
||||
@@ -325,9 +348,10 @@ int main(int argc, char** argv) {
|
||||
}
|
||||
|
||||
// Verify over capacity
|
||||
HIPCHECK_API(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
|
||||
HIPCHECK_API(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used),
|
||||
max_active_blocks + 1, warp_size,
|
||||
coop_params[0], 0, streams[0]), hipErrorCooperativeLaunchTooLarge);
|
||||
coop_params[0], 0, streams[0]),
|
||||
hipErrorCooperativeLaunchTooLarge);
|
||||
|
||||
std::cout << "Launching an initial single cooperative kernel..." << std::endl;
|
||||
// We need exclude the the initial launching as it will need time to load code obj.
|
||||
|
||||
@@ -192,6 +192,20 @@ __global__ void second_test_kernel(long long *array) {
|
||||
array[rank] += clock64();
|
||||
}
|
||||
|
||||
__global__ void test_kernel_gfx11(long long *array) {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
array[rank] += wall_clock64();
|
||||
#endif
|
||||
}
|
||||
|
||||
__global__ void second_test_kernel_gfx11(long long *array) {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
array[rank] += wall_clock64();
|
||||
#endif
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
hipError_t err;
|
||||
/*************************************************************************/
|
||||
@@ -245,8 +259,9 @@ int main(int argc, char** argv) {
|
||||
int max_blocks_per_sm = INT_MAX;
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIPCHECK(hipSetDevice((dev + i)));
|
||||
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_blocks_per_sm_arr[i], test_kernel, warp_size, 0));
|
||||
&max_blocks_per_sm_arr[i], test_kernel_used, warp_size, 0));
|
||||
if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
|
||||
max_blocks_per_sm = max_blocks_per_sm_arr[i];
|
||||
}
|
||||
@@ -293,7 +308,8 @@ int main(int argc, char** argv) {
|
||||
for (int i = 0; i < 2; i++) {
|
||||
dev_params[i][0] = reinterpret_cast<void*>(&bad_dev_array[i]);
|
||||
|
||||
md_params[i].func = reinterpret_cast<void*>(test_kernel);
|
||||
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
md_params[i].func = reinterpret_cast<void*>(test_kernel_used);
|
||||
md_params[i].gridDim = 2 * desired_blocks;
|
||||
md_params[i].blockDim = warp_size;
|
||||
md_params[i].sharedMem = 0;
|
||||
@@ -370,7 +386,8 @@ int main(int argc, char** argv) {
|
||||
supports_sep_kernels = false;
|
||||
}
|
||||
}
|
||||
md_params[1].func = reinterpret_cast<void*>(second_test_kernel);
|
||||
auto second_test_kernel_used = IsGfx11() ? second_test_kernel_gfx11 : second_test_kernel;
|
||||
md_params[1].func = reinterpret_cast<void*>(second_test_kernel_used);
|
||||
err = hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0);
|
||||
if ((supports_sep_kernels && err != hipSuccess) ||
|
||||
(!supports_sep_kernels && err != hipErrorInvalidValue)) {
|
||||
@@ -405,7 +422,8 @@ int main(int argc, char** argv) {
|
||||
std::cout << "different grid sizes." << std::endl;
|
||||
bool supports_sep_sizes = true;
|
||||
for (int i = 0; i < 2; i++) {
|
||||
md_params[i].func = reinterpret_cast<void*>(test_kernel);
|
||||
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
md_params[i].func = reinterpret_cast<void*>(test_kernel_used);
|
||||
md_params[i].gridDim = i+1;
|
||||
if (!support_for_separate_grid_sizes((dev + i))) {
|
||||
supports_sep_sizes = false;
|
||||
|
||||
@@ -195,6 +195,55 @@ __global__ void test_kernel(uint32_t loops, unsigned long long *array) {
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void test_coop_kernel_gfx11(unsigned int loops, long long *array,
|
||||
int fast_gpu) {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
cooperative_groups::multi_grid_group mgrid =
|
||||
cooperative_groups::this_multi_grid();
|
||||
unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
if (mgrid.grid_rank() == fast_gpu) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (int i = 0; i < loops; i++) {
|
||||
long long time_diff = 0;
|
||||
long long last_clock = wall_clock64();
|
||||
do {
|
||||
long long cur_clock = wall_clock64();
|
||||
if (cur_clock > last_clock) {
|
||||
time_diff += (cur_clock - last_clock);
|
||||
}
|
||||
// If it rolls over, we don't know how much to add to catch up.
|
||||
// So just ignore those slipped cycles.
|
||||
last_clock = cur_clock;
|
||||
} while(time_diff < 1000000);
|
||||
array[rank] += wall_clock64();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
__global__ void test_kernel_gfx11(uint32_t loops, unsigned long long *array) {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
for (int i = 0; i < loops; i++) {
|
||||
long long time_diff = 0;
|
||||
long long last_clock = wall_clock64();
|
||||
do {
|
||||
long long cur_clock = wall_clock64();
|
||||
if (cur_clock > last_clock) {
|
||||
time_diff += (cur_clock - last_clock);
|
||||
}
|
||||
// If it rolls over, we don't know how much to add to catch up.
|
||||
// So just ignore those slipped cycles.
|
||||
last_clock = cur_clock;
|
||||
} while(time_diff < 1000000);
|
||||
array[rank] += wall_clock64();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
hipError_t err;
|
||||
int device_num, FailFlag = 0;
|
||||
@@ -249,8 +298,9 @@ int main(int argc, char** argv) {
|
||||
int max_blocks_per_sm = INT_MAX;
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIPCHECK(hipSetDevice(dev + i));
|
||||
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_blocks_per_sm_arr[i], test_kernel, warp_size, 0));
|
||||
&max_blocks_per_sm_arr[i], test_kernel_used, warp_size, 0));
|
||||
if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
|
||||
max_blocks_per_sm = max_blocks_per_sm_arr[i];
|
||||
}
|
||||
@@ -302,11 +352,12 @@ int main(int argc, char** argv) {
|
||||
std::cout << "GPU " << dev << ": Long Coop Kernel" << std::endl;
|
||||
std::cout << "GPU " << (dev + 1) << ": Long Coop Kernel" << std::endl;
|
||||
|
||||
auto test_coop_kernel_used = IsGfx11() ? test_coop_kernel_gfx11 : test_coop_kernel;
|
||||
for (int i = 0; i < 2; i++) {
|
||||
dev_params[i][0] = reinterpret_cast<void*>(&loops);
|
||||
dev_params[i][1] = reinterpret_cast<void*>(&dev_array[i]);
|
||||
dev_params[i][2] = reinterpret_cast<void*>(&fast_gpu);
|
||||
md_params[i].func = reinterpret_cast<void*>(test_coop_kernel);
|
||||
md_params[i].func = reinterpret_cast<void*>(test_coop_kernel_used);
|
||||
md_params[i].gridDim = desired_blocks;
|
||||
md_params[i].blockDim = warp_size;
|
||||
md_params[i].sharedMem = 0;
|
||||
@@ -331,12 +382,14 @@ int main(int argc, char** argv) {
|
||||
fast_gpu = 1;
|
||||
start_time[1] = std::chrono::system_clock::now();
|
||||
HIPCHECK(hipSetDevice(dev));
|
||||
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
|
||||
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
|
||||
streams[0], loops, dev_array[0]);
|
||||
HIPCHECK(hipGetLastError());
|
||||
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
|
||||
HIPCHECK(hipSetDevice(dev + 1));
|
||||
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
|
||||
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
|
||||
streams[1], loops, dev_array[1]);
|
||||
HIPCHECK(hipGetLastError());
|
||||
for (int i = 0; i < 2; i++) {
|
||||
@@ -356,12 +409,14 @@ int main(int argc, char** argv) {
|
||||
fast_gpu = 0;
|
||||
start_time[2] = std::chrono::system_clock::now();
|
||||
HIPCHECK(hipSetDevice(dev));
|
||||
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
|
||||
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
|
||||
streams[0], loops, dev_array[0]);
|
||||
HIPCHECK(hipGetLastError());
|
||||
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
|
||||
HIPCHECK(hipSetDevice(dev + 1));
|
||||
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
|
||||
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
|
||||
streams[1], loops, dev_array[1]);
|
||||
HIPCHECK(hipGetLastError());
|
||||
for (int i = 0; i < 2; i++) {
|
||||
@@ -382,13 +437,15 @@ int main(int argc, char** argv) {
|
||||
fast_gpu = 0;
|
||||
start_time[3] = std::chrono::system_clock::now();
|
||||
HIPCHECK(hipSetDevice(dev));
|
||||
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
|
||||
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
|
||||
streams[0], loops, dev_array[0]);
|
||||
HIPCHECK(hipGetLastError());
|
||||
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2,
|
||||
hipCooperativeLaunchMultiDeviceNoPreSync));
|
||||
HIPCHECK(hipSetDevice(dev + 1));
|
||||
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
|
||||
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
|
||||
streams[1], loops, dev_array[1]);
|
||||
HIPCHECK(hipGetLastError());
|
||||
for (int i = 0; i < 2; i++) {
|
||||
@@ -409,13 +466,15 @@ int main(int argc, char** argv) {
|
||||
fast_gpu = 1;
|
||||
start_time[4] = std::chrono::system_clock::now();
|
||||
HIPCHECK(hipSetDevice(dev));
|
||||
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
|
||||
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
|
||||
streams[0], loops, dev_array[0]);
|
||||
HIPCHECK(hipGetLastError());
|
||||
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2,
|
||||
hipCooperativeLaunchMultiDeviceNoPostSync));
|
||||
HIPCHECK(hipSetDevice(dev + 1));
|
||||
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
|
||||
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
|
||||
streams[1], loops, dev_array[1]);
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIPCHECK(hipSetDevice(dev + i));
|
||||
@@ -434,14 +493,16 @@ int main(int argc, char** argv) {
|
||||
std::cout << " Kernel\n";
|
||||
start_time[5] = std::chrono::system_clock::now();
|
||||
HIPCHECK(hipSetDevice(dev));
|
||||
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
|
||||
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
|
||||
streams[0], loops, dev_array[0]);
|
||||
HIPCHECK(hipGetLastError());
|
||||
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2,
|
||||
hipCooperativeLaunchMultiDeviceNoPreSync |
|
||||
hipCooperativeLaunchMultiDeviceNoPostSync));
|
||||
HIPCHECK(hipSetDevice(dev + 1));
|
||||
hipLaunchKernelGGL(test_kernel, dim3(desired_blocks), dim3(warp_size), 0,
|
||||
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0,
|
||||
streams[1], loops, dev_array[1]);
|
||||
HIPCHECK(hipGetLastError());
|
||||
for (int i = 0; i < 2; i++) {
|
||||
|
||||
@@ -204,6 +204,81 @@ test_kernel(unsigned int *atomic_val, unsigned int *global_array,
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void
|
||||
test_kernel_gfx11(unsigned int *atomic_val, unsigned int *global_array,
|
||||
unsigned int *array, uint32_t loops) {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
cooperative_groups::grid_group grid = cooperative_groups::this_grid();
|
||||
cooperative_groups::multi_grid_group mgrid =
|
||||
cooperative_groups::this_multi_grid();
|
||||
unsigned rank = grid.thread_rank();
|
||||
unsigned global_rank = mgrid.thread_rank();
|
||||
|
||||
int offset = blockIdx.x;
|
||||
for (int i = 0; i < loops; i++) {
|
||||
// Make the last thread run way behind everyone else.
|
||||
// If the grid barrier below fails, then the other threads may hit the
|
||||
// atomicInc instruction many times before the last thread ever gets
|
||||
// to it.
|
||||
// As such, without the barrier, the last array entry will eventually
|
||||
// contain a very large value, defined by however many times the other
|
||||
// wavefronts make it through this loop.
|
||||
// If the barrier works, then it will likely contain some number
|
||||
// near "total number of blocks". It will be the last wavefront to
|
||||
// reach the atomicInc, but everyone will have only hit the atomic once.
|
||||
if (rank == (grid.size() - 1)) {
|
||||
long long time_diff = 0;
|
||||
long long last_clock = wall_clock64();
|
||||
do {
|
||||
long long cur_clock = wall_clock64();
|
||||
if (cur_clock > last_clock) {
|
||||
time_diff += (cur_clock - last_clock);
|
||||
}
|
||||
// If it rolls over, we don't know how much to add to catch up.
|
||||
// So just ignore those slipped cycles.
|
||||
last_clock = cur_clock;
|
||||
} while(time_diff < 1000000);
|
||||
}
|
||||
if (threadIdx.x == 0) {
|
||||
array[offset] = atomicInc(atomic_val, UINT_MAX);
|
||||
}
|
||||
grid.sync();
|
||||
|
||||
// Make the last thread in the entire multi-grid run way behind
|
||||
// everyone else.
|
||||
// If the mgrid barrier below fails, then the two global_array entries
|
||||
// will end up being out of sync, because the intermingling of adds
|
||||
// and multiplies will not be aligned between to the two GPUs.
|
||||
if (global_rank == (mgrid.size() - 1)) {
|
||||
long long time_diff = 0;
|
||||
long long last_clock = wall_clock64();
|
||||
do {
|
||||
long long cur_clock = wall_clock64();
|
||||
if (cur_clock > last_clock) {
|
||||
time_diff += (cur_clock - last_clock);
|
||||
}
|
||||
// If it rolls over, we don't know how much to add to catch up.
|
||||
// So just ignore those slipped cycles.
|
||||
last_clock = cur_clock;
|
||||
} while(time_diff < 1000000);
|
||||
}
|
||||
// During even iterations, add into your own array entry
|
||||
// During odd iterations, add into your partner's array entry
|
||||
unsigned grid_rank = mgrid.grid_rank();
|
||||
unsigned inter_gpu_offset = (grid_rank + i) % mgrid.num_grids();
|
||||
if (rank == (grid.size() - 1)) {
|
||||
if (i % mgrid.num_grids() == 0) {
|
||||
global_array[grid_rank] += 2;
|
||||
} else {
|
||||
global_array[inter_gpu_offset] *= 2;
|
||||
}
|
||||
}
|
||||
mgrid.sync();
|
||||
offset += gridDim.x;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
hipError_t err;
|
||||
int num_devices = 0;
|
||||
@@ -265,8 +340,9 @@ int main(int argc, char** argv) {
|
||||
int max_blocks_per_sm = INT_MAX;
|
||||
for (int i = 0; i < num_devices; i++) {
|
||||
HIPCHECK(hipSetDevice(device_num[i]));
|
||||
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_blocks_per_sm_arr[i], test_kernel, num_threads_in_block, 0));
|
||||
&max_blocks_per_sm_arr[i], test_kernel_used, num_threads_in_block, 0));
|
||||
if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
|
||||
max_blocks_per_sm = max_blocks_per_sm_arr[i];
|
||||
}
|
||||
@@ -320,12 +396,13 @@ int main(int argc, char** argv) {
|
||||
|
||||
void *dev_params[num_devices][4];
|
||||
hipLaunchParams md_params[num_devices];
|
||||
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
for (int i = 0; i < num_devices; i++) {
|
||||
dev_params[i][0] = reinterpret_cast<void*>(&kernel_atomic[i]);
|
||||
dev_params[i][1] = reinterpret_cast<void*>(&global_array);
|
||||
dev_params[i][2] = reinterpret_cast<void*>(&kernel_buffer[i]);
|
||||
dev_params[i][3] = reinterpret_cast<void*>(&loops);
|
||||
md_params[i].func = reinterpret_cast<void*>(test_kernel);
|
||||
md_params[i].func = reinterpret_cast<void*>(test_kernel_used);
|
||||
md_params[i].gridDim = requested_blocks;
|
||||
md_params[i].blockDim = num_threads_in_block;
|
||||
md_params[i].sharedMem = 0;
|
||||
|
||||
@@ -131,6 +131,48 @@ test_kernel(unsigned int *atomic_val, unsigned int *array,
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void
|
||||
test_kernel_gfx11(unsigned int *atomic_val, unsigned int *array,
|
||||
unsigned int loops) {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
cooperative_groups::grid_group grid = cooperative_groups::this_grid();
|
||||
unsigned rank = grid.thread_rank();
|
||||
|
||||
int offset = blockIdx.x;
|
||||
for (int i = 0; i < loops; i++) {
|
||||
// Make the last thread run way behind everyone else.
|
||||
// If the barrier below fails, then the other threads may hit the
|
||||
// atomicInc instruction many times before the last thread ever gets
|
||||
// to it.
|
||||
// As such, without the barrier, the last array entry will eventually
|
||||
// contain a very large value, defined by however many times the other
|
||||
// wavefronts make it through this loop.
|
||||
// If the barrier works, then it will likely contain some number
|
||||
// near "total number of blocks". It will be the last wavefront to
|
||||
// reach the atomicInc, but everyone will have only hit the atomic once.
|
||||
if (rank == (grid.size() - 1)) {
|
||||
long long time_diff = 0;
|
||||
long long last_clock = wall_clock64();
|
||||
do {
|
||||
long long cur_clock = wall_clock64();
|
||||
if (cur_clock > last_clock) {
|
||||
time_diff += (cur_clock - last_clock);
|
||||
}
|
||||
// If it rolls over, we don't know how much to add to catch up.
|
||||
// So just ignore those slipped cycles.
|
||||
last_clock = cur_clock;
|
||||
} while(time_diff < 1000000);
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
array[offset] = atomicInc(&atomic_val[0], UINT_MAX);
|
||||
}
|
||||
grid.sync();
|
||||
offset += gridDim.x;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
hipError_t err;
|
||||
int device_num;
|
||||
@@ -167,9 +209,10 @@ int main(int argc, char** argv) {
|
||||
|
||||
int num_threads_in_block = block_size * warp_size;
|
||||
|
||||
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
// Calculate the device occupancy to know how many blocks can be run.
|
||||
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm,
|
||||
test_kernel, num_threads_in_block, 0));
|
||||
test_kernel_used, num_threads_in_block, 0));
|
||||
|
||||
int requested_blocks = warps / block_size;
|
||||
if (requested_blocks > max_blocks_per_sm * num_sms) {
|
||||
@@ -211,7 +254,8 @@ int main(int argc, char** argv) {
|
||||
params[0] = reinterpret_cast<void*>(&kernel_atomic);
|
||||
params[1] = reinterpret_cast<void*>(&kernel_buffer);
|
||||
params[2] = reinterpret_cast<void*>(&loops);
|
||||
HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel),
|
||||
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
HIPCHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used),
|
||||
requested_blocks,
|
||||
num_threads_in_block, params, 0, NULL));
|
||||
|
||||
|
||||
@@ -204,6 +204,81 @@ test_kernel(unsigned int *atomic_val, unsigned int *global_array,
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void
|
||||
test_kernel_gfx11(unsigned int *atomic_val, unsigned int *global_array,
|
||||
unsigned int *array, uint32_t loops) {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
cooperative_groups::grid_group grid = cooperative_groups::this_grid();
|
||||
cooperative_groups::multi_grid_group mgrid =
|
||||
cooperative_groups::this_multi_grid();
|
||||
unsigned rank = grid.thread_rank();
|
||||
unsigned global_rank = mgrid.thread_rank();
|
||||
|
||||
int offset = blockIdx.x;
|
||||
for (int i = 0; i < loops; i++) {
|
||||
// Make the last thread run way behind everyone else.
|
||||
// If the grid barrier below fails, then the other threads may hit the
|
||||
// atomicInc instruction many times before the last thread ever gets
|
||||
// to it.
|
||||
// As such, without the barrier, the last array entry will eventually
|
||||
// contain a very large value, defined by however many times the other
|
||||
// wavefronts make it through this loop.
|
||||
// If the barrier works, then it will likely contain some number
|
||||
// near "total number of blocks". It will be the last wavefront to
|
||||
// reach the atomicInc, but everyone will have only hit the atomic once.
|
||||
if (rank == (grid.size() - 1)) {
|
||||
long long time_diff = 0;
|
||||
long long last_clock = wall_clock64();
|
||||
do {
|
||||
long long cur_clock = wall_clock64();
|
||||
if (cur_clock > last_clock) {
|
||||
time_diff += (cur_clock - last_clock);
|
||||
}
|
||||
// If it rolls over, we don't know how much to add to catch up.
|
||||
// So just ignore those slipped cycles.
|
||||
last_clock = cur_clock;
|
||||
} while(time_diff < 1000000);
|
||||
}
|
||||
if (threadIdx.x == 0) {
|
||||
array[offset] = atomicInc(atomic_val, UINT_MAX);
|
||||
}
|
||||
grid.sync();
|
||||
|
||||
// Make the last thread in the entire multi-grid run way behind
|
||||
// everyone else.
|
||||
// If the mgrid barrier below fails, then the two global_array entries
|
||||
// will end up being out of sync, because the intermingling of adds
|
||||
// and multiplies will not be aligned between to the two GPUs.
|
||||
if (global_rank == (mgrid.size() - 1)) {
|
||||
long long time_diff = 0;
|
||||
long long last_clock = wall_clock64();
|
||||
do {
|
||||
long long cur_clock = wall_clock64();
|
||||
if (cur_clock > last_clock) {
|
||||
time_diff += (cur_clock - last_clock);
|
||||
}
|
||||
// If it rolls over, we don't know how much to add to catch up.
|
||||
// So just ignore those slipped cycles.
|
||||
last_clock = cur_clock;
|
||||
} while(time_diff < 1000000);
|
||||
}
|
||||
// During even iterations, add into your own array entry
|
||||
// During odd iterations, add into your partner's array entry
|
||||
unsigned grid_rank = mgrid.grid_rank();
|
||||
unsigned inter_gpu_offset = (grid_rank + i) % mgrid.num_grids();
|
||||
if (rank == (grid.size() - 1)) {
|
||||
if (i % mgrid.num_grids() == 0) {
|
||||
global_array[grid_rank] += 2;
|
||||
} else {
|
||||
global_array[inter_gpu_offset] *= 2;
|
||||
}
|
||||
}
|
||||
mgrid.sync();
|
||||
offset += gridDim.x;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
hipError_t err;
|
||||
int device_num = 0, flag = 0;
|
||||
@@ -263,8 +338,9 @@ int main(int argc, char** argv) {
|
||||
int max_blocks_per_sm = INT_MAX;
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIPCHECK(hipSetDevice((d + i)));
|
||||
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_blocks_per_sm_arr[i], test_kernel, num_threads_in_block,
|
||||
&max_blocks_per_sm_arr[i], test_kernel_used, num_threads_in_block,
|
||||
0));
|
||||
if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
|
||||
max_blocks_per_sm = max_blocks_per_sm_arr[i];
|
||||
@@ -319,11 +395,12 @@ int main(int argc, char** argv) {
|
||||
void *dev_params[2][4];
|
||||
hipLaunchParams md_params[2];
|
||||
for (int i = 0; i < 2; i++) {
|
||||
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
dev_params[i][0] = reinterpret_cast<void*>(&kernel_atomic[i]);
|
||||
dev_params[i][1] = reinterpret_cast<void*>(&global_array);
|
||||
dev_params[i][2] = reinterpret_cast<void*>(&kernel_buffer[i]);
|
||||
dev_params[i][3] = reinterpret_cast<void*>(&loops);
|
||||
md_params[i].func = reinterpret_cast<void*>(test_kernel);
|
||||
md_params[i].func = reinterpret_cast<void*>(test_kernel_used);
|
||||
md_params[i].gridDim = requested_blocks;
|
||||
md_params[i].blockDim = num_threads_in_block;
|
||||
md_params[i].sharedMem = 0;
|
||||
|
||||
@@ -44,10 +44,20 @@ extern "C" __global__ void WaitKernel(int *Ad, int clockrate) {
|
||||
*Ad = 1;
|
||||
}
|
||||
|
||||
extern "C" __global__ void WaitKernel_gfx11(int *Ad, int clockrate) {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
uint64_t wait_t = 500,
|
||||
start = wall_clock64()/clockrate, cycles;
|
||||
do { cycles = wall_clock64()/clockrate-start;} while (cycles < wait_t);
|
||||
*Ad = 1;
|
||||
#endif
|
||||
}
|
||||
|
||||
void t1(hipEvent_t start, hipStream_t stream1, int clkRate, int *A, int *Ad) {
|
||||
*A = 0;
|
||||
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(WaitKernel), dim3(1), dim3(1), 0, stream1, Ad, clkRate);
|
||||
auto WaitKernel_used = IsGfx11() ? WaitKernel_gfx11 : WaitKernel;
|
||||
hipLaunchKernelGGL(HIP_KERNEL_NAME(WaitKernel_used), dim3(1), dim3(1), 0, stream1, Ad, clkRate);
|
||||
|
||||
HIPCHECK(hipEventRecord(start, stream1));
|
||||
|
||||
@@ -69,7 +79,11 @@ int main(int argc, char* argv[]) {
|
||||
HIPCHECK(hipHostGetDevicePointer((void**)&Ad[i], A[i], 0));
|
||||
}
|
||||
|
||||
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
|
||||
if (IsGfx11()) {
|
||||
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
|
||||
} else {
|
||||
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
|
||||
}
|
||||
hipStream_t stream1;
|
||||
hipStreamCreate(&stream1);
|
||||
hipEvent_t start;
|
||||
|
||||
@@ -62,6 +62,34 @@ __global__ void FourSecKernel(int clockrate) {
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void TwoSecKernel_gfx11(int clockrate) {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
if (globalvar == 0x2222) {
|
||||
globalvar = 0x3333;
|
||||
}
|
||||
uint64_t wait_t = 2000,
|
||||
start = wall_clock64()/clockrate, cur;
|
||||
do { cur = (wall_clock64()/clockrate)-start;}while (cur < wait_t);
|
||||
if (globalvar != 0x3333) {
|
||||
globalvar = 0x5555;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
__global__ void FourSecKernel_gfx11(int clockrate) {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
if (globalvar == 1) {
|
||||
globalvar = 0x2222;
|
||||
}
|
||||
uint64_t wait_t = 4000,
|
||||
start = wall_clock64()/clockrate, cur;
|
||||
do { cur = (wall_clock64()/clockrate)-start;}while (cur < wait_t);
|
||||
if (globalvar == 0x2222) {
|
||||
globalvar = 0x4444;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* In this Scenario, we create events by disabling the timing flag
|
||||
* We then Launch the kernel using hipExtModuleLaunchKernel by passing
|
||||
@@ -76,13 +104,20 @@ bool DisableTimeFlag() {
|
||||
float time_4sec, time_2sec;
|
||||
hipEvent_t start_event1, end_event1;
|
||||
int clkRate = 0;
|
||||
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
|
||||
|
||||
if (IsGfx11()) {
|
||||
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
|
||||
} else {
|
||||
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
|
||||
}
|
||||
|
||||
HIPCHECK(hipEventCreateWithFlags(&start_event1,
|
||||
hipEventDisableTiming));
|
||||
HIPCHECK(hipEventCreateWithFlags(&end_event1,
|
||||
hipEventDisableTiming));
|
||||
HIPCHECK(hipStreamCreate(&stream1));
|
||||
hipExtLaunchKernelGGL((TwoSecKernel), dim3(1), dim3(1), 0,
|
||||
auto TwoSecKernel_used = IsGfx11() ? TwoSecKernel_gfx11 : TwoSecKernel;
|
||||
hipExtLaunchKernelGGL((TwoSecKernel_used), dim3(1), dim3(1), 0,
|
||||
stream1, start_event1, end_event1, 0, clkRate);
|
||||
HIPCHECK(hipStreamSynchronize(stream1));
|
||||
e = hipEventElapsedTime(&time_2sec, start_event1, end_event1);
|
||||
@@ -113,11 +148,20 @@ bool ConcurencyCheck_GlobalVar(int conc_flag) {
|
||||
int deviceGlobal_h = 0;
|
||||
HIPCHECK(hipSetDevice(0));
|
||||
int clkRate = 0;
|
||||
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
|
||||
|
||||
if (IsGfx11()) {
|
||||
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
|
||||
} else {
|
||||
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
|
||||
}
|
||||
|
||||
HIPCHECK(hipStreamCreate(&stream1));
|
||||
hipExtLaunchKernelGGL((FourSecKernel), dim3(1), dim3(1), 0,
|
||||
auto TwoSecKernel_used = IsGfx11() ? TwoSecKernel_gfx11 : TwoSecKernel;
|
||||
auto FourSecKernel_used = IsGfx11() ? FourSecKernel_gfx11 : FourSecKernel;
|
||||
|
||||
hipExtLaunchKernelGGL((FourSecKernel_used), dim3(1), dim3(1), 0,
|
||||
stream1, nullptr, nullptr, conc_flag, clkRate);
|
||||
hipExtLaunchKernelGGL((TwoSecKernel), dim3(1), dim3(1), 0,
|
||||
hipExtLaunchKernelGGL((TwoSecKernel_used), dim3(1), dim3(1), 0,
|
||||
stream1, nullptr, nullptr, conc_flag, clkRate);
|
||||
HIPCHECK(hipStreamSynchronize(stream1));
|
||||
HIPCHECK(hipMemcpyFromSymbol(&deviceGlobal_h, globalvar,
|
||||
@@ -148,16 +192,24 @@ bool KernelTimeExecution() {
|
||||
hipEvent_t start_event1, end_event1, start_event2, end_event2;
|
||||
float time_4sec, time_2sec;
|
||||
int clkRate = 0;
|
||||
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
|
||||
|
||||
if (IsGfx11()) {
|
||||
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
|
||||
} else {
|
||||
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
|
||||
}
|
||||
|
||||
auto TwoSecKernel_used = IsGfx11() ? TwoSecKernel_gfx11 : TwoSecKernel;
|
||||
auto FourSecKernel_used = IsGfx11() ? FourSecKernel_gfx11 : FourSecKernel;
|
||||
|
||||
HIPCHECK(hipEventCreate(&start_event1));
|
||||
HIPCHECK(hipEventCreate(&end_event1));
|
||||
HIPCHECK(hipEventCreate(&start_event2));
|
||||
HIPCHECK(hipEventCreate(&end_event2));
|
||||
HIPCHECK(hipStreamCreate(&stream1));
|
||||
hipExtLaunchKernelGGL((FourSecKernel), dim3(1), dim3(1), 0,
|
||||
hipExtLaunchKernelGGL((FourSecKernel_used), dim3(1), dim3(1), 0,
|
||||
stream1, start_event1, end_event1, 0, clkRate);
|
||||
hipExtLaunchKernelGGL((TwoSecKernel), dim3(1), dim3(1), 0,
|
||||
hipExtLaunchKernelGGL((TwoSecKernel_used), dim3(1), dim3(1), 0,
|
||||
stream1, start_event2, end_event2, 0, clkRate);
|
||||
HIPCHECK(hipStreamSynchronize(stream1));
|
||||
e = hipEventElapsedTime(&time_4sec, start_event1, end_event1);
|
||||
|
||||
@@ -49,10 +49,7 @@
|
||||
|
||||
#define fileName "matmul.code"
|
||||
#define matmulK "matmulK"
|
||||
#define SixteenSec "SixteenSecKernel"
|
||||
#define KernelandExtra "KernelandExtraParams"
|
||||
#define FourSec "FourSecKernel"
|
||||
#define TwoSec "TwoSecKernel"
|
||||
#define globalDevVar "deviceGlobal"
|
||||
#define dummyKernel "dummyKernel"
|
||||
#define FOURSEC_KERNEL 4999
|
||||
@@ -124,7 +121,12 @@ void ModuleLaunchKernel::AllocateMemory() {
|
||||
HIPCHECK(hipMemcpy(Ad, A, SIZE*sizeof(int), hipMemcpyHostToDevice));
|
||||
HIPCHECK(hipMemcpy(Bd, B, SIZE*sizeof(int), hipMemcpyHostToDevice));
|
||||
int clkRate = 0;
|
||||
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
|
||||
if (IsGfx11()) {
|
||||
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
|
||||
} else {
|
||||
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeClockRate, 0));
|
||||
}
|
||||
|
||||
args1._Ad = Ad;
|
||||
args1._Bd = Bd;
|
||||
args1._Cd = C;
|
||||
@@ -149,13 +151,21 @@ void ModuleLaunchKernel::AllocateMemory() {
|
||||
}
|
||||
|
||||
void ModuleLaunchKernel::ModuleLoad() {
|
||||
|
||||
std::string TwoSecStr = IsGfx11() ? std::string("TwoSecKernel_gfx11")
|
||||
: std::string("TwoSecKernel");
|
||||
std::string FourSecStr = IsGfx11() ? std::string("FourSecKernel_gfx11")
|
||||
: std::string("FourSecKernel");
|
||||
std::string SixteenSecStr = IsGfx11() ? std::string("SixteenSecKernel_gfx11")
|
||||
: std::string("SixteenSecKernel");
|
||||
|
||||
HIPCHECK(hipModuleLoad(&Module, fileName));
|
||||
HIPCHECK(hipModuleGetFunction(&MultKernel, Module, matmulK));
|
||||
HIPCHECK(hipModuleGetFunction(&SixteenSecKernel, Module, SixteenSec));
|
||||
HIPCHECK(hipModuleGetFunction(&SixteenSecKernel, Module, SixteenSecStr.c_str()));
|
||||
HIPCHECK(hipModuleGetFunction(&KernelandExtraParamKernel,
|
||||
Module, KernelandExtra));
|
||||
HIPCHECK(hipModuleGetFunction(&FourSecKernel, Module, FourSec));
|
||||
HIPCHECK(hipModuleGetFunction(&TwoSecKernel, Module, TwoSec));
|
||||
HIPCHECK(hipModuleGetFunction(&FourSecKernel, Module, FourSecStr.c_str()));
|
||||
HIPCHECK(hipModuleGetFunction(&TwoSecKernel, Module, TwoSecStr.c_str()));
|
||||
HIPCHECK(hipModuleGetFunction(&DummyKernel, Module, dummyKernel));
|
||||
HIPCHECK(hipModuleGetGlobal(&deviceGlobal, &deviceGlobalSize,
|
||||
Module, globalDevVar));
|
||||
|
||||
@@ -79,5 +79,41 @@ extern "C" __global__ void FourSecKernel(int clockrate) {
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ void SixteenSecKernel_gfx11(int clockrate) {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
uint64_t wait_t = 16000,
|
||||
start = wall_clock64()/clockrate, cur;
|
||||
do { cur = wall_clock64()/clockrate-start;}while (cur < wait_t);
|
||||
#endif
|
||||
}
|
||||
|
||||
extern "C" __global__ void TwoSecKernel_gfx11(int clockrate) {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
if (deviceGlobal == 0x2222) {
|
||||
deviceGlobal = 0x3333;
|
||||
}
|
||||
uint64_t wait_t = 2000,
|
||||
start = wall_clock64()/clockrate, cur;
|
||||
do { cur = wall_clock64()/clockrate-start;}while (cur < wait_t);
|
||||
if (deviceGlobal != 0x3333) {
|
||||
deviceGlobal = 0x5555;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
extern "C" __global__ void FourSecKernel_gfx11(int clockrate) {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
if (deviceGlobal == 1) {
|
||||
deviceGlobal = 0x2222;
|
||||
}
|
||||
uint64_t wait_t = 4000,
|
||||
start = wall_clock64()/clockrate, cur;
|
||||
do { cur = wall_clock64()/clockrate-start;}while (cur < wait_t);
|
||||
if (deviceGlobal == 0x2222) {
|
||||
deviceGlobal = 0x4444;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
extern "C" __global__ void dummyKernel() {
|
||||
}
|
||||
|
||||
@@ -64,6 +64,25 @@ __global__ void vector_square(float* C_d, float* A_d, size_t Num) {
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void vector_square_gfx11(float* C_d, float* A_d, size_t Num) {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
size_t gputhread = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
size_t stride = blockDim.x * gridDim.x;
|
||||
|
||||
for (size_t i = gputhread; i < Num; i += stride) {
|
||||
C_d[i] = A_d[i] * A_d[i];
|
||||
}
|
||||
|
||||
// Delay thread 1 only in the GPU
|
||||
if (gputhread == 1) {
|
||||
unsigned long long int wait_t = 3200000000, start = wall_clock64(), cur;
|
||||
do {
|
||||
cur = wall_clock64() - start;
|
||||
} while (cur < wait_t);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
float *A_h, *C_h, *A_h1, *C_h1;
|
||||
|
||||
static void HIPRT_CB Callback_Stream1(hipStream_t stream, hipError_t status,
|
||||
@@ -129,8 +148,9 @@ int main(int argc, char* argv[]) {
|
||||
int *ptr = NULL;
|
||||
int *ptr1 = NULL;
|
||||
// Queing jobs in both mystream1/2 followed by hipStreamAddCallback
|
||||
auto vector_square_used = IsGfx11() ? vector_square_gfx11 : vector_square;
|
||||
for (int i = 1; i < 5; ++i) {
|
||||
hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock),
|
||||
hipLaunchKernelGGL((vector_square_used), dim3(blocks), dim3(threadsPerBlock),
|
||||
0, mystream1, C_d, A_d, Num);
|
||||
HIPCHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost,
|
||||
mystream1));
|
||||
@@ -139,7 +159,7 @@ int main(int argc, char* argv[]) {
|
||||
HIPCHECK(hipStreamAddCallback(mystream1, Callback_Stream1,
|
||||
reinterpret_cast<void*>(ptr), 0));
|
||||
|
||||
hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock),
|
||||
hipLaunchKernelGGL((vector_square_used), dim3(blocks), dim3(threadsPerBlock),
|
||||
0, mystream2, C_d, A_d, Num);
|
||||
HIPCHECK(hipMemcpyAsync(C_h1, C_d, Nbytes,
|
||||
hipMemcpyDeviceToHost, mystream2));
|
||||
|
||||
@@ -64,6 +64,25 @@ __global__ void vector_square(float* C_d, float* A_d, size_t Num) {
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void vector_square_gfx11(float* C_d, float* A_d, size_t Num) {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
size_t gputhread = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
size_t stride = blockDim.x * gridDim.x;
|
||||
|
||||
for (size_t i = gputhread; i < Num; i += stride) {
|
||||
C_d[i] = A_d[i] * A_d[i];
|
||||
}
|
||||
|
||||
// Delay thread 1 only in the GPU
|
||||
if (gputhread == 1) {
|
||||
unsigned long long int wait_t = 3200000000, start = wall_clock64(), cur;
|
||||
do {
|
||||
cur = wall_clock64() - start;
|
||||
} while (cur < wait_t);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static void HIPRT_CB Stream_Callback(hipStream_t stream, hipError_t status,
|
||||
void* userData) {
|
||||
for (size_t i = 0; i < Num; i++) {
|
||||
@@ -100,7 +119,8 @@ int main(int argc, char* argv[]) {
|
||||
|
||||
const unsigned threadsPerBlock = 256;
|
||||
const unsigned blocks = (Num+255)/threadsPerBlock;
|
||||
hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock), 0,
|
||||
auto vector_square_used = IsGfx11() ? vector_square_gfx11 : vector_square;
|
||||
hipLaunchKernelGGL((vector_square_used), dim3(blocks), dim3(threadsPerBlock), 0,
|
||||
mystream, C_d, A_d, Num);
|
||||
|
||||
HIPCHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, mystream));
|
||||
|
||||
@@ -64,6 +64,24 @@ __global__ void vector_square(float* C_d, float* A_d, size_t Num) {
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void vector_square_gfx11(float* C_d, float* A_d, size_t Num) {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
size_t gputhread = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
size_t stride = blockDim.x * gridDim.x;
|
||||
|
||||
for (size_t i = gputhread; i < Num; i += stride) {
|
||||
C_d[i] = A_d[i] * A_d[i];
|
||||
}
|
||||
|
||||
// Delay thread 1 only in the GPU
|
||||
if (gputhread == 1) {
|
||||
unsigned long long int wait_t = 3200000000, start = wall_clock64(), cur;
|
||||
do {
|
||||
cur = wall_clock64() - start;
|
||||
} while (cur < wait_t);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static void HIPRT_CB Thread1_Callback(hipStream_t stream, hipError_t status,
|
||||
void* userData) {
|
||||
@@ -124,7 +142,8 @@ int main(int argc, char* argv[]) {
|
||||
const unsigned threadsPerBlock = 256;
|
||||
const unsigned blocks = (Num+255)/threadsPerBlock;
|
||||
|
||||
hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock), 0,
|
||||
auto vector_square_used = IsGfx11() ? vector_square_gfx11 : vector_square;
|
||||
hipLaunchKernelGGL((vector_square_used), dim3(blocks), dim3(threadsPerBlock), 0,
|
||||
mystream, C_d, A_d, Num);
|
||||
|
||||
HIPCHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, mystream));
|
||||
|
||||
@@ -61,6 +61,25 @@ __global__ void vector_square(float* C_d, float* A_d, size_t N_elmts) {
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void vector_square_gfx11(float* C_d, float* A_d, size_t N_elmts) {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
size_t stride = blockDim.x * gridDim.x;
|
||||
|
||||
for (size_t i = offset; i < N_elmts; i += stride) {
|
||||
C_d[i] = A_d[i] * A_d[i];
|
||||
}
|
||||
|
||||
// Delay the thread 1
|
||||
if (offset == 1) {
|
||||
unsigned long long int wait_t = 3200000000, start = wall_clock64(), cur;
|
||||
do {
|
||||
cur = wall_clock64() - start;
|
||||
} while (cur < wait_t);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
float *A_h, *C_h;
|
||||
|
||||
static void HIPRT_CB Callback1(hipStream_t stream, hipError_t status,
|
||||
@@ -102,7 +121,8 @@ int main(int argc, char* argv[]) {
|
||||
const unsigned threadsPerBlock = 256;
|
||||
const unsigned blocks = (N_elmts + 255)/threadsPerBlock;
|
||||
|
||||
hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock), 0,
|
||||
auto vector_square_used = IsGfx11() ? vector_square_gfx11 : vector_square;
|
||||
hipLaunchKernelGGL((vector_square_used), dim3(blocks), dim3(threadsPerBlock), 0,
|
||||
mystream, C_d, A_d, N_elmts);
|
||||
HIPCHECK(hipMemcpyAsync(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, mystream));
|
||||
HIPCHECK(hipStreamAddCallback(mystream, Callback1, NULL, 0));
|
||||
|
||||
@@ -92,6 +92,32 @@ size_t getHostThreadCount(const size_t memPerThread, const size_t maxThreads) {
|
||||
return thread_count;
|
||||
}
|
||||
|
||||
// Function to determine if the device is of gfx11 architecture
|
||||
bool IsGfx11() {
|
||||
#if defined(__HIP_PLATFORM_NVIDIA__)
|
||||
return false;
|
||||
#elif defined(__HIP_PLATFORM_AMD__)
|
||||
int device = -1;
|
||||
hipDeviceProp_t props{};
|
||||
HIPCHECK(hipGetDevice(&device));
|
||||
HIPCHECK(hipGetDeviceProperties(&props, device));
|
||||
|
||||
// Get GCN Arch Name and compare to check if it is gfx11
|
||||
std::string arch = std::string(props.gcnArchName);
|
||||
auto pos = arch.find(":");
|
||||
if (pos != std::string::npos)
|
||||
arch = arch.substr(0, pos);
|
||||
|
||||
if(arch.size() >= 5)
|
||||
arch = arch.substr(0,5);
|
||||
|
||||
return (arch == std::string("gfx11")) ? true : false;
|
||||
#else
|
||||
std::cout<<"Have to be either Nvidia or AMD platform, asserting"<<std::endl;
|
||||
assert(false);
|
||||
#endif
|
||||
}
|
||||
|
||||
namespace HipTest {
|
||||
|
||||
|
||||
@@ -232,5 +258,4 @@ unsigned setNumBlocks(unsigned blocksPerCU, unsigned threadsPerBlock, size_t N)
|
||||
return blocks;
|
||||
}
|
||||
|
||||
|
||||
} // namespace HipTest
|
||||
|
||||
@@ -581,3 +581,6 @@ inline bool isImageSupported() {
|
||||
if (!HipTest::isImageSupported()) \
|
||||
{ printf("Texture is not support on the device. Skipped.\n"); passed(); }
|
||||
#endif //__cplusplus
|
||||
|
||||
// Function to determine if the device is of gfx11 architecture
|
||||
bool IsGfx11();
|
||||
In neuem Issue referenzieren
Einen Benutzer sperren