From bd2be139f6c5fdb98a257807d2fc4c674f06b15e Mon Sep 17 00:00:00 2001 From: ROCm CI Service Account <66695075+rocm-ci@users.noreply.github.com> Date: Sat, 19 Nov 2022 01:44:12 +0530 Subject: [PATCH] SWDEV-348282 - fixed intermittent failure fo hipCGMultiGrid* and hipCGThread* tests, porting over to catch2 in the process (#3067) Change-Id: Id22dfb4ab2ee8171242fbf6a10886ff3e4abc926 [ROCm/hip-tests commit: 74a6e407e755723386ac12ff0c3ab78fccbea70f] --- projects/hip-tests/catch/unit/CMakeLists.txt | 1 + .../catch/unit/cooperativeGrps/CMakeLists.txt | 22 ++ .../hipCGMultiGridGroupType.cc | 240 ++++++++++++++++++ .../hipCGMultiGridGroupTypeViaBaseType.cc | 234 +++++++++++++++++ .../hipCGMultiGridGroupTypeViaPublicApi.cc | 230 +++++++++++++++++ .../cooperativeGrps/hipCGThreadBlockType.cc | 164 ++++++++++++ .../hipCGThreadBlockTypeViaBaseType.cc | 136 ++++++++++ .../hipCGThreadBlockTypeViaPublicApi.cc | 136 ++++++++++ 8 files changed, 1163 insertions(+) create mode 100644 projects/hip-tests/catch/unit/cooperativeGrps/CMakeLists.txt create mode 100644 projects/hip-tests/catch/unit/cooperativeGrps/hipCGMultiGridGroupType.cc create mode 100644 projects/hip-tests/catch/unit/cooperativeGrps/hipCGMultiGridGroupTypeViaBaseType.cc create mode 100644 projects/hip-tests/catch/unit/cooperativeGrps/hipCGMultiGridGroupTypeViaPublicApi.cc create mode 100644 projects/hip-tests/catch/unit/cooperativeGrps/hipCGThreadBlockType.cc create mode 100644 projects/hip-tests/catch/unit/cooperativeGrps/hipCGThreadBlockTypeViaBaseType.cc create mode 100644 projects/hip-tests/catch/unit/cooperativeGrps/hipCGThreadBlockTypeViaPublicApi.cc diff --git a/projects/hip-tests/catch/unit/CMakeLists.txt b/projects/hip-tests/catch/unit/CMakeLists.txt index 6e97f24663..d856006312 100644 --- a/projects/hip-tests/catch/unit/CMakeLists.txt +++ b/projects/hip-tests/catch/unit/CMakeLists.txt @@ -33,6 +33,7 @@ add_subdirectory(kernel) add_subdirectory(multiThread) add_subdirectory(compiler) add_subdirectory(errorHandling) +add_subdirectory(cooperativeGrps) if(HIP_PLATFORM STREQUAL "amd") add_subdirectory(clock) endif() diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/CMakeLists.txt b/projects/hip-tests/catch/unit/cooperativeGrps/CMakeLists.txt new file mode 100644 index 0000000000..61c567aee4 --- /dev/null +++ b/projects/hip-tests/catch/unit/cooperativeGrps/CMakeLists.txt @@ -0,0 +1,22 @@ +# Common Tests - Test independent of all platforms +set(TEST_SRC + hipCGThreadBlockType.cc + hipCGThreadBlockTypeViaBaseType.cc + hipCGThreadBlockTypeViaPublicApi.cc + hipCGMultiGridGroupType.cc + hipCGMultiGridGroupTypeViaBaseType.cc + hipCGMultiGridGroupTypeViaPublicApi.cc +) +if(HIP_PLATFORM STREQUAL "nvidia") + set_source_files_properties(hipCGMultiGridGroupType.cc PROPERTIES COMPILE_FLAGS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80") + set_source_files_properties(hipCGMultiGridGroupTypeViaBaseType.cc PROPERTIES COMPILE_FLAGS "-D_CG_ABI_EXPERIMENTAL -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80") + set_source_files_properties(hipCGMultiGridGroupTypeViaPublicApi.cc PROPERTIES COMPILE_FLAGS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80") + hip_add_exe_to_target(NAME coopGrpTest + TEST_SRC ${TEST_SRC} + TEST_TARGET_NAME build_tests + LINKER_LIBS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80") +else() + hip_add_exe_to_target(NAME coopGrpTest + TEST_SRC ${TEST_SRC} + TEST_TARGET_NAME build_tests) +endif() diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/hipCGMultiGridGroupType.cc b/projects/hip-tests/catch/unit/cooperativeGrps/hipCGMultiGridGroupType.cc new file mode 100644 index 0000000000..1dd2a8f3b5 --- /dev/null +++ b/projects/hip-tests/catch/unit/cooperativeGrps/hipCGMultiGridGroupType.cc @@ -0,0 +1,240 @@ +/* +Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80 + * TEST: %t + * HIT_END + */ + +#include +#include + +#define ASSERT_EQUAL(lhs, rhs) HIPASSERT(lhs == rhs) +#define ASSERT_LE(lhs, rhs) HIPASSERT(lhs <= rhs) +#define ASSERT_GE(lhs, rhs) HIPASSERT(lhs >= rhs) + +using namespace cooperative_groups; +constexpr int MaxGPUs = 8; + +static __global__ +void kernel_cg_multi_grid_group_type(int* numGridsTestD, + int* gridRankTestD, + int *sizeTestD, + int *thdRankTestD, + int *isValidTestD, + int *syncTestD, + int *syncResultD) +{ + multi_grid_group mg = this_multi_grid(); + int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x; + + // Test num_grids + numGridsTestD[gIdx] = mg.num_grids(); + + // Test grid_rank + gridRankTestD[gIdx] = mg.grid_rank(); + + // Test size + sizeTestD[gIdx] = mg.size(); + + // Test thread_rank + thdRankTestD[gIdx] = mg.thread_rank(); + + // Test is_valid + isValidTestD[gIdx] = mg.is_valid(); + + // Test sync + // + // Eech thread assign 1 to their respective location + syncTestD[gIdx] = 1; + // Grid level sync + this_grid().sync(); + // Thread 0 from work-group 0 of current grid (gpu) does grid level reduction + if (blockIdx.x == 0 && threadIdx.x == 0) { + for (uint i = 1; i < gridDim.x * blockDim.x; ++i) { + syncTestD[0] += syncTestD[i]; + } + syncResultD[mg.grid_rank() + 1] = syncTestD[0]; + } + // multi-grid level sync + mg.sync(); + // grid (gpu) 0 does final reduction across all grids (gpus) + if (mg.grid_rank() == 0 && blockIdx.x == 0 && threadIdx.x == 0) { + syncResultD[0] = 0; + for (uint i = 1; i <= mg.num_grids(); ++i) { + syncResultD[0] += syncResultD[i]; + } + } +} + +static void test_cg_multi_grid_group_type(int blockSize, int nGpu) +{ + // Create a stream each device + hipStream_t stream[MaxGPUs]; + for (int i = 0; i < nGpu; i++) { + HIPCHECK(hipSetDevice(i)); + HIPCHECK(hipDeviceSynchronize()); // Make sure work is done on this device + HIPCHECK(hipStreamCreate(&stream[i])); + } + + // Allocate host and device memory + int nBytes = sizeof(int) * 2 * blockSize; + int *numGridsTestD[MaxGPUs], *numGridsTestH[MaxGPUs]; + int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs]; + int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs]; + int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs]; + int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs]; + int *syncTestD[MaxGPUs], *syncResultD; + for (int i = 0; i < nGpu; i++) { + HIPCHECK(hipSetDevice(i)); + + HIPCHECK(hipMalloc(&numGridsTestD[i], nBytes)); + HIPCHECK(hipMalloc(&gridRankTestD[i], nBytes)); + HIPCHECK(hipMalloc(&sizeTestD[i], nBytes)); + HIPCHECK(hipMalloc(&thdRankTestD[i], nBytes)); + HIPCHECK(hipMalloc(&isValidTestD[i], nBytes)); + HIPCHECK(hipMalloc(&syncTestD[i], nBytes)); + + HIPCHECK(hipHostMalloc(&numGridsTestH[i], nBytes)); + HIPCHECK(hipHostMalloc(&gridRankTestH[i], nBytes)); + HIPCHECK(hipHostMalloc(&sizeTestH[i], nBytes)); + HIPCHECK(hipHostMalloc(&thdRankTestH[i], nBytes)); + HIPCHECK(hipHostMalloc(&isValidTestH[i], nBytes)); + + if (i == 0) { + HIPCHECK(hipHostMalloc(&syncResultD, sizeof(int) * (nGpu + 1), hipHostMallocCoherent)); + } + } + + // Launch Kernel + constexpr int NumKernelArgs = 7; + hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu]; + void* args[MaxGPUs * NumKernelArgs]; + for (int i = 0; i < nGpu; i++) { + HIPCHECK(hipSetDevice(i)); + + args[i * NumKernelArgs] = &numGridsTestD[i]; + args[i * NumKernelArgs + 1] = &gridRankTestD[i]; + args[i * NumKernelArgs + 2] = &sizeTestD[i]; + args[i * NumKernelArgs + 3] = &thdRankTestD[i]; + args[i * NumKernelArgs + 4] = &isValidTestD[i]; + args[i * NumKernelArgs + 5] = &syncTestD[i]; + args[i * NumKernelArgs + 6] = &syncResultD; + + launchParamsList[i].func = reinterpret_cast(kernel_cg_multi_grid_group_type); + launchParamsList[i].gridDim = 2; + launchParamsList[i].blockDim = blockSize; + launchParamsList[i].sharedMem = 0; + launchParamsList[i].stream = stream[i]; + launchParamsList[i].args = &args[i * NumKernelArgs]; + } + HIPCHECK(hipLaunchCooperativeKernelMultiDevice(launchParamsList, nGpu, 0)); + + // Copy result from device to host + for (int i = 0; i < nGpu; i++) { + HIPCHECK(hipSetDevice(i)); + HIPCHECK(hipMemcpy(numGridsTestH[i], numGridsTestD[i], nBytes, hipMemcpyDeviceToHost)); + HIPCHECK(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost)); + HIPCHECK(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost)); + HIPCHECK(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost)); + HIPCHECK(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost)); + } + + // Validate results + int gridsSeen[MaxGPUs]; + for (int i = 0; i < nGpu; ++i) { + for (int j = 0; j < 2 * blockSize; ++j) { + ASSERT_EQUAL(numGridsTestH[i][j], nGpu); + ASSERT_GE(gridRankTestH[i][j], 0); + ASSERT_LE(gridRankTestH[i][j], nGpu-1); + ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]); + ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize); + int gridRank = gridRankTestH[i][j]; + ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j); + ASSERT_EQUAL(isValidTestH[i][j], 1); + } + ASSERT_EQUAL(syncResultD[i+1], 2 * blockSize); + + // Validate uniqueness property of grid rank + gridsSeen[i] = gridRankTestH[i][0]; + for (int k = 0; k < i; ++k) { + if (gridsSeen[k] == gridsSeen[i]) { + assert(false && "Grid rank in multi-gpu setup should be unique"); + } + } + } + ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize); + + // Free host and device memory + delete [] launchParamsList; + for (int i = 0; i < nGpu; i++) { + HIPCHECK(hipSetDevice(i)); + + HIPCHECK(hipFree(numGridsTestD[i])); + HIPCHECK(hipFree(gridRankTestD[i])); + HIPCHECK(hipFree(sizeTestD[i])); + HIPCHECK(hipFree(thdRankTestD[i])); + HIPCHECK(hipFree(isValidTestD[i])); + HIPCHECK(hipFree(syncTestD[i])); + + if (i == 0) { + HIPCHECK(hipHostFree(syncResultD)); + } + HIPCHECK(hipHostFree(numGridsTestH[i])); + HIPCHECK(hipHostFree(gridRankTestH[i])); + HIPCHECK(hipHostFree(sizeTestH[i])); + HIPCHECK(hipHostFree(thdRankTestH[i])); + HIPCHECK(hipHostFree(isValidTestH[i])); + } +} + +TEST_CASE("Unit_hipCGMultiGridGroupType") { + int nGpu = 0; + HIPCHECK(hipGetDeviceCount(&nGpu)); + nGpu = min(nGpu, MaxGPUs); + + // Set `maxThreadsPerBlock` by taking minimum among all available devices + int maxThreadsPerBlock = INT_MAX; + hipDeviceProp_t deviceProperties; + for (int i = 0; i < nGpu; i++) { + HIPCHECK(hipGetDeviceProperties(&deviceProperties, i)); + if (!deviceProperties.cooperativeMultiDeviceLaunch) { + HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!"); + return; + } + maxThreadsPerBlock = min(maxThreadsPerBlock, deviceProperties.maxThreadsPerBlock); + } + + // Test for blockSizes in powers of 2 + for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) { + test_cg_multi_grid_group_type(blockSize, nGpu); + } + + // Test for random blockSizes, but the sequence is the same every execution + srand(0); + for (int i = 0; i < 10; i++) { + // Test fails for 0 thread per block + test_cg_multi_grid_group_type(max(2, rand() % maxThreadsPerBlock), nGpu); + } +} diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/hipCGMultiGridGroupTypeViaBaseType.cc b/projects/hip-tests/catch/unit/cooperativeGrps/hipCGMultiGridGroupTypeViaBaseType.cc new file mode 100644 index 0000000000..408f3b0075 --- /dev/null +++ b/projects/hip-tests/catch/unit/cooperativeGrps/hipCGMultiGridGroupTypeViaBaseType.cc @@ -0,0 +1,234 @@ +/* +Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -D_CG_ABI_EXPERIMENTAL -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80 + * TEST: %t + * HIT_END + */ + +#include +#include +#include +#include +#include + +#define ASSERT_EQUAL(lhs, rhs) HIPASSERT(lhs == rhs) +#define ASSERT_LE(lhs, rhs) HIPASSERT(lhs <= rhs) +#define ASSERT_GE(lhs, rhs) HIPASSERT(lhs >= rhs) + +using namespace cooperative_groups; +constexpr int MaxGPUs = 8; + +static __global__ +void kernel_cg_multi_grid_group_type_via_base_type(int *sizeTestD, + int* gridRankTestD, + int *thdRankTestD, + int *isValidTestD, + int *syncTestD, + int *syncResultD) +{ + thread_group tg = this_multi_grid(); // This can work if _CG_ABI_EXPERIMENTAL defined on Cuda + + int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x; + + // Test size + sizeTestD[gIdx] = tg.size(); + + // Test thread_rank + gridRankTestD[gIdx] = this_multi_grid().grid_rank(); + thdRankTestD[gIdx] = tg.thread_rank(); + + // Test is_valid +#ifdef __HIP_PLATFORM_AMD__ + isValidTestD[gIdx] = tg.is_valid(); +#else + // Cuda has no thread_group.is_valid() + isValidTestD[gIdx] = true; +#endif + // Test sync + // + // Eech thread assign 1 to their respective location + syncTestD[gIdx] = 1; + // Grid level sync + this_grid().sync(); + // Thread 0 from work-group 0 of current grid (gpu) does grid level reduction + if (blockIdx.x == 0 && threadIdx.x == 0) { + for (uint i = 1; i < gridDim.x * blockDim.x; ++i) { + syncTestD[0] += syncTestD[i]; + } + syncResultD[this_multi_grid().grid_rank() + 1] = syncTestD[0]; + } + // multi-grid level sync + tg.sync(); + // grid (gpu) 0 does final reduction across all grids (gpus) + if (this_multi_grid().grid_rank() == 0 && blockIdx.x == 0 && threadIdx.x == 0) { + syncResultD[0] = 0; + for (uint i = 1; i <= this_multi_grid().num_grids(); ++i) { + syncResultD[0] += syncResultD[i]; + } + } +} + +static void test_cg_multi_grid_group_type_via_base_type(int blockSize, int nGpu) +{ + // Create a stream each device + hipStream_t stream[MaxGPUs]; + for (int i = 0; i < nGpu; i++) { + HIPCHECK(hipSetDevice(i)); + HIPCHECK(hipDeviceSynchronize()); // Make sure work is done on this device + HIPCHECK(hipStreamCreate(&stream[i])); + } + + // Allocate host and device memory + int nBytes = sizeof(int) * 2 * blockSize; + int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs]; + int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs]; + int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs]; + int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs]; + int *syncTestD[MaxGPUs], *syncResultD; + for (int i = 0; i < nGpu; i++) { + HIPCHECK(hipSetDevice(i)); + + HIPCHECK(hipMalloc(&sizeTestD[i], nBytes)); + HIPCHECK(hipMalloc(&gridRankTestD[i], nBytes)); + HIPCHECK(hipMalloc(&thdRankTestD[i], nBytes)); + HIPCHECK(hipMalloc(&isValidTestD[i], nBytes)); + HIPCHECK(hipMalloc(&syncTestD[i], nBytes)); + + HIPCHECK(hipHostMalloc(&sizeTestH[i], nBytes)); + HIPCHECK(hipHostMalloc(&gridRankTestH[i], nBytes)); + HIPCHECK(hipHostMalloc(&thdRankTestH[i], nBytes)); + HIPCHECK(hipHostMalloc(&isValidTestH[i], nBytes)); + + if (i == 0) { + HIPCHECK(hipHostMalloc(&syncResultD, sizeof(int) * (nGpu + 1), hipHostMallocCoherent)); + } + } + + // Launch Kernel + constexpr int NumKernelArgs = 6; + hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu]; + void* args[MaxGPUs * NumKernelArgs]; + for (int i = 0; i < nGpu; i++) { + HIPCHECK(hipSetDevice(i)); + + args[i * NumKernelArgs ] = &sizeTestD[i]; + args[i * NumKernelArgs + 1] = &gridRankTestD[i]; + args[i * NumKernelArgs + 2] = &thdRankTestD[i]; + args[i * NumKernelArgs + 3] = &isValidTestD[i]; + args[i * NumKernelArgs + 4] = &syncTestD[i]; + args[i * NumKernelArgs + 5] = &syncResultD; + + launchParamsList[i].func = reinterpret_cast(kernel_cg_multi_grid_group_type_via_base_type); + launchParamsList[i].gridDim = 2; + launchParamsList[i].blockDim = blockSize; + launchParamsList[i].sharedMem = 0; + launchParamsList[i].stream = stream[i]; + launchParamsList[i].args = &args[i * NumKernelArgs]; + } + HIPCHECK(hipLaunchCooperativeKernelMultiDevice(launchParamsList, nGpu, 0)); + + // Copy result from device to host + for (int i = 0; i < nGpu; i++) { + HIPCHECK(hipSetDevice(i)); + HIPCHECK(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost)); + HIPCHECK(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost)); + HIPCHECK(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost)); + HIPCHECK(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost)); + } + + // Validate results + int gridsSeen[MaxGPUs]; + for (int i = 0; i < nGpu; ++i) { + for (int j = 0; j < 2 * blockSize; ++j) { + ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize); + ASSERT_GE(gridRankTestH[i][j], 0); + ASSERT_LE(gridRankTestH[i][j], nGpu-1); + ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]); + int gridRank = gridRankTestH[i][j]; + ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j); + ASSERT_EQUAL(isValidTestH[i][j], 1); + } + ASSERT_EQUAL(syncResultD[i+1], 2 * blockSize); + + // Validate uniqueness property of grid rank + gridsSeen[i] = gridRankTestH[i][0]; + for (int k = 0; k < i; ++k) { + if (gridsSeen[k] == gridsSeen[i]) { + assert (false && "Grid rank in multi-gpu setup should be unique"); + } + } + } + ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize); + + // Free host and device memory + delete [] launchParamsList; + for (int i = 0; i < nGpu; i++) { + HIPCHECK(hipSetDevice(i)); + + HIPCHECK(hipFree(sizeTestD[i])); + HIPCHECK(hipFree(gridRankTestD[i])); + HIPCHECK(hipFree(thdRankTestD[i])); + HIPCHECK(hipFree(isValidTestD[i])); + HIPCHECK(hipFree(syncTestD[i])); + + if (i == 0) + HIPCHECK(hipHostFree(syncResultD)); + + HIPCHECK(hipHostFree(sizeTestH[i])); + HIPCHECK(hipHostFree(gridRankTestH[i])); + HIPCHECK(hipHostFree(thdRankTestH[i])); + HIPCHECK(hipHostFree(isValidTestH[i])); + } +} + +TEST_CASE("Unit_hipCGMultiGridGroupType_BaseType") { + // Set `maxThreadsPerBlock` by taking minimum among all available devices + int nGpu = 0; + HIPCHECK(hipGetDeviceCount(&nGpu)); + nGpu = min(nGpu, MaxGPUs); + + int maxThreadsPerBlock = INT_MAX; + hipDeviceProp_t deviceProperties; + for (int i = 0; i < nGpu; i++) { + HIPCHECK(hipGetDeviceProperties(&deviceProperties, i)); + if (!deviceProperties.cooperativeMultiDeviceLaunch) { + HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!"); + return; + } + maxThreadsPerBlock = min(maxThreadsPerBlock, deviceProperties.maxThreadsPerBlock); + } + + // Test for blockSizes in powers of 2 + for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) { + test_cg_multi_grid_group_type_via_base_type(blockSize, nGpu); + } + + // Test for random blockSizes, but the sequence is the same every execution + srand(0); + for (int i = 0; i < 10; i++) { + // Test fails for 0 thread per block + test_cg_multi_grid_group_type_via_base_type(max(2, rand() % maxThreadsPerBlock), nGpu); + } +} diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/hipCGMultiGridGroupTypeViaPublicApi.cc b/projects/hip-tests/catch/unit/cooperativeGrps/hipCGMultiGridGroupTypeViaPublicApi.cc new file mode 100644 index 0000000000..3e5b97fe5a --- /dev/null +++ b/projects/hip-tests/catch/unit/cooperativeGrps/hipCGMultiGridGroupTypeViaPublicApi.cc @@ -0,0 +1,230 @@ +/* +Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80 + * TEST: %t + * HIT_END + */ + +#include +#include +#include +#include +#include + +#define ASSERT_EQUAL(lhs, rhs) HIPASSERT(lhs == rhs) +#define ASSERT_LE(lhs, rhs) HIPASSERT(lhs <= rhs) +#define ASSERT_GE(lhs, rhs) HIPASSERT(lhs >= rhs) + +using namespace cooperative_groups; +constexpr int MaxGPUs = 8; + +static __global__ +void kernel_cg_multi_grid_group_type_via_public_api(int *sizeTestD, + int* gridRankTestD, + int *thdRankTestD, + int *isValidTestD, + int *syncTestD, + int *syncResultD) +{ + multi_grid_group mg = this_multi_grid(); + int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x; + + // Test group_size api + sizeTestD[gIdx] = group_size(mg); + + // Test thread_rank api + gridRankTestD[gIdx] = this_multi_grid().grid_rank(); + thdRankTestD[gIdx] = thread_rank(mg); + + // Test is_valid api + isValidTestD[gIdx] = mg.is_valid(); + + // Test sync api + // + // Eech thread assign 1 to their respective location + syncTestD[gIdx] = 1; + // Grid level sync + sync(this_grid()); + // Thread 0 from work-group 0 of current grid (gpu) does grid level reduction + if (blockIdx.x == 0 && threadIdx.x == 0) { + for (uint i = 1; i < gridDim.x * blockDim.x; ++i) { + syncTestD[0] += syncTestD[i]; + } + syncResultD[this_multi_grid().grid_rank() + 1] = syncTestD[0]; + } + // multi-grid level sync via public api + sync(mg); + // grid (gpu) 0 does final reduction across all grids (gpus) + if (this_multi_grid().grid_rank() == 0 && blockIdx.x == 0 && threadIdx.x == 0) { + syncResultD[0] = 0; + for (uint i = 1; i <= this_multi_grid().num_grids(); ++i) { + syncResultD[0] += syncResultD[i]; + } + } +} + +static void test_cg_multi_grid_group_type_via_public_api(int blockSize, int nGpu) +{ + // Create a stream each device + hipStream_t stream[MaxGPUs]; + for (int i = 0; i < nGpu; i++) { + HIPCHECK(hipSetDevice(i)); + HIPCHECK(hipDeviceSynchronize()); // Make sure work is done on this device + HIPCHECK(hipStreamCreate(&stream[i])); + } + + // Allocate host and device memory + int nBytes = sizeof(int) * 2 * blockSize; + int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs]; + int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs]; + int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs]; + int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs]; + int *syncTestD[MaxGPUs], *syncResultD; + for (int i = 0; i < nGpu; i++) { + HIPCHECK(hipSetDevice(i)); + + HIPCHECK(hipMalloc(&sizeTestD[i], nBytes)); + HIPCHECK(hipMalloc(&gridRankTestD[i], nBytes)); + HIPCHECK(hipMalloc(&thdRankTestD[i], nBytes)); + HIPCHECK(hipMalloc(&isValidTestD[i], nBytes)); + HIPCHECK(hipMalloc(&syncTestD[i], nBytes)); + + HIPCHECK(hipHostMalloc(&sizeTestH[i], nBytes)); + HIPCHECK(hipHostMalloc(&gridRankTestH[i], nBytes)); + HIPCHECK(hipHostMalloc(&thdRankTestH[i], nBytes)); + HIPCHECK(hipHostMalloc(&isValidTestH[i], nBytes)); + + if (i == 0) { + HIPCHECK(hipHostMalloc(&syncResultD, sizeof(int) * (nGpu + 1), hipHostMallocCoherent)); + } + } + + // Launch Kernel + constexpr int NumKernelArgs = 6; + hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu]; + void* args[MaxGPUs * NumKernelArgs]; + for (int i = 0; i < nGpu; i++) { + HIPCHECK(hipSetDevice(i)); + + args[i * NumKernelArgs ] = &sizeTestD[i]; + args[i * NumKernelArgs + 1] = &gridRankTestD[i]; + args[i * NumKernelArgs + 2] = &thdRankTestD[i]; + args[i * NumKernelArgs + 3] = &isValidTestD[i]; + args[i * NumKernelArgs + 4] = &syncTestD[i]; + args[i * NumKernelArgs + 5] = &syncResultD; + + launchParamsList[i].func = reinterpret_cast(kernel_cg_multi_grid_group_type_via_public_api); + launchParamsList[i].gridDim = 2; + launchParamsList[i].blockDim = blockSize; + launchParamsList[i].sharedMem = 0; + launchParamsList[i].stream = stream[i]; + launchParamsList[i].args = &args[i * NumKernelArgs]; + } + HIPCHECK(hipLaunchCooperativeKernelMultiDevice(launchParamsList, nGpu, 0)); + + // Copy result from device to host + for (int i = 0; i < nGpu; i++) { + HIPCHECK(hipSetDevice(i)); + + HIPCHECK(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost)); + HIPCHECK(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost)); + HIPCHECK(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost)); + HIPCHECK(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost)); + } + + // Validate results + int gridsSeen[MaxGPUs]; + for (int i = 0; i < nGpu; ++i) { + for (int j = 0; j < 2 * blockSize; ++j) { + ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize); + ASSERT_GE(gridRankTestH[i][j], 0); + ASSERT_LE(gridRankTestH[i][j], nGpu-1); + ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]); + int gridRank = gridRankTestH[i][j]; + ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j); + ASSERT_EQUAL(isValidTestH[i][j], 1); + } + ASSERT_EQUAL(syncResultD[i+1], 2 * blockSize); + + // Validate uniqueness property of grid rank + gridsSeen[i] = gridRankTestH[i][0]; + for (int k = 0; k < i; ++k) { + if (gridsSeen[k] == gridsSeen[i]) { + assert (false && "Grid rank in multi-gpu setup should be unique"); + } + } + } + ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize); + + // Free host and device memory + delete [] launchParamsList; + for (int i = 0; i < nGpu; i++) { + HIPCHECK(hipSetDevice(i)); + + HIPCHECK(hipFree(sizeTestD[i])); + HIPCHECK(hipFree(gridRankTestD[i])); + HIPCHECK(hipFree(thdRankTestD[i])); + HIPCHECK(hipFree(isValidTestD[i])); + HIPCHECK(hipFree(syncTestD[i])); + + if (i == 0) + HIPCHECK(hipHostFree(syncResultD)); + + HIPCHECK(hipHostFree(sizeTestH[i])); + HIPCHECK(hipHostFree(gridRankTestH[i])); + HIPCHECK(hipHostFree(thdRankTestH[i])); + HIPCHECK(hipHostFree(isValidTestH[i])); + } +} + +TEST_CASE("Unit_hipCGMultiGridGroupType_PublicApi") { + // Set `maxThreadsPerBlock` by taking minimum among all available devices + int nGpu = 0; + HIPCHECK(hipGetDeviceCount(&nGpu)); + nGpu = min(nGpu, MaxGPUs); + + int maxThreadsPerBlock = INT_MAX; + hipDeviceProp_t deviceProperties; + for (int i = 0; i < nGpu; i++) { + HIPCHECK(hipGetDeviceProperties(&deviceProperties, i)); + if (!deviceProperties.cooperativeMultiDeviceLaunch) { + HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!"); + return; + } + maxThreadsPerBlock = min(maxThreadsPerBlock, deviceProperties.maxThreadsPerBlock); + } + + // Test for blockSizes in powers of 2 + for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) { + test_cg_multi_grid_group_type_via_public_api(blockSize, nGpu); + } + + // Test for random blockSizes, but the sequence is the same every execution + srand(0); + for (int i = 0; i < 10; i++) { + // Test fails for 0 thread per block + test_cg_multi_grid_group_type_via_public_api(max(2, rand() % maxThreadsPerBlock), nGpu); + } +} diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/hipCGThreadBlockType.cc b/projects/hip-tests/catch/unit/cooperativeGrps/hipCGThreadBlockType.cc new file mode 100644 index 0000000000..bc4d671da0 --- /dev/null +++ b/projects/hip-tests/catch/unit/cooperativeGrps/hipCGThreadBlockType.cc @@ -0,0 +1,164 @@ +/* +Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * TEST: %t + * HIT_END + */ + +#include +#include +#include + +#define ASSERT_EQUAL(lhs, rhs) HIPASSERT(lhs == rhs) + +using namespace cooperative_groups; + +static __global__ +void kernel_cg_thread_block_type(int *sizeTestD, + int *thdRankTestD, + int *syncTestD, + dim3 *groupIndexTestD, + dim3 *thdIndexTestD) +{ + thread_block tb = this_thread_block(); + int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x; + // Test size + sizeTestD[gIdx] = tb.size(); + + // Test thread_rank + thdRankTestD[gIdx] = tb.thread_rank(); + + // Test sync + __shared__ int sm[2]; + if (threadIdx.x == 0) + sm[0] = 10; + else if (threadIdx.x == 1) + sm[1] = 20; + tb.sync(); + syncTestD[gIdx] = sm[1] * sm[0]; + + // Test group_index + groupIndexTestD[gIdx] = tb.group_index(); + + // Test thread_index + thdIndexTestD[gIdx] = tb.thread_index(); +} + +static void test_cg_thread_block_type(int blockSize) +{ + int nBytes = sizeof(int) * 2 * blockSize; + int nDim3Bytes = sizeof(dim3) * 2 * blockSize; + int *sizeTestD, *sizeTestH; + int *thdRankTestD, *thdRankTestH; + int *syncTestD, *syncTestH; + dim3 *groupIndexTestD, *groupIndexTestH; + dim3 *thdIndexTestD, *thdIndexTestH; + + // Allocate device memory + HIPCHECK(hipMalloc(&sizeTestD, nBytes)); + HIPCHECK(hipMalloc(&thdRankTestD, nBytes)); + HIPCHECK(hipMalloc(&syncTestD, nBytes)); + HIPCHECK(hipMalloc(&groupIndexTestD, nDim3Bytes)); + HIPCHECK(hipMalloc(&thdIndexTestD, nDim3Bytes)); + + // Allocate host memory + HIPCHECK(hipHostMalloc(&sizeTestH, nBytes)); + HIPCHECK(hipHostMalloc(&thdRankTestH, nBytes)); + HIPCHECK(hipHostMalloc(&syncTestH, nBytes)); + HIPCHECK(hipHostMalloc(&groupIndexTestH, nDim3Bytes)); + HIPCHECK(hipHostMalloc(&thdIndexTestH, nDim3Bytes)); + + // Launch Kernel + hipLaunchKernelGGL(kernel_cg_thread_block_type, + 2, + blockSize, + 0, + 0, + sizeTestD, + thdRankTestD, + syncTestD, + groupIndexTestD, + thdIndexTestD); + + // Copy result from device to host + HIPCHECK(hipMemcpy(sizeTestH, sizeTestD, nBytes, hipMemcpyDeviceToHost)); + HIPCHECK(hipMemcpy(thdRankTestH, thdRankTestD, nBytes, hipMemcpyDeviceToHost)); + HIPCHECK(hipMemcpy(syncTestH, syncTestD, nBytes, hipMemcpyDeviceToHost)); + HIPCHECK(hipMemcpy(groupIndexTestH, groupIndexTestD, nDim3Bytes, hipMemcpyDeviceToHost)); + HIPCHECK(hipMemcpy(thdIndexTestH, thdIndexTestD, nDim3Bytes, hipMemcpyDeviceToHost)); + + // Validate results for both blocks together + for (int i = 0; i < 2 * blockSize; ++i) { + ASSERT_EQUAL(sizeTestH[i], blockSize); + ASSERT_EQUAL(thdRankTestH[i], i % blockSize); + ASSERT_EQUAL(syncTestH[i], 200); + ASSERT_EQUAL(groupIndexTestH[i].x, (uint) i / blockSize); + ASSERT_EQUAL(groupIndexTestH[i].y, 0); + ASSERT_EQUAL(groupIndexTestH[i].z, 0); + ASSERT_EQUAL(thdIndexTestH[i].x, (uint) i % blockSize); + ASSERT_EQUAL(thdIndexTestH[i].y, 0); + ASSERT_EQUAL(thdIndexTestH[i].z, 0); + } + + // Free device memory + HIPCHECK(hipFree(sizeTestD)); + HIPCHECK(hipFree(thdRankTestD)); + HIPCHECK(hipFree(syncTestD)); + HIPCHECK(hipFree(groupIndexTestD)); + HIPCHECK(hipFree(thdIndexTestD)); + + //Free host memory + HIPCHECK(hipHostFree(sizeTestH)); + HIPCHECK(hipHostFree(thdRankTestH)); + HIPCHECK(hipHostFree(syncTestH)); + HIPCHECK(hipHostFree(groupIndexTestH)); + HIPCHECK(hipHostFree(thdIndexTestH)); +} + +TEST_CASE("Unit_hipCGThreadBlockType") { + // Use default device for validating the test + int deviceId; + hipDeviceProp_t deviceProperties; + HIPCHECK(hipGetDevice(&deviceId)); + HIPCHECK(hipGetDeviceProperties(&deviceProperties, deviceId)); + + if (!deviceProperties.cooperativeLaunch) { + HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!"); + return; + } + + // Test for blockSizes in powers of 2 + int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock; + for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) { + test_cg_thread_block_type(blockSize); + } + + // Test for random blockSizes, but the sequence is the same every execution + srand(0); + for (int i = 0; i < 10; i++) { + // Test fails for only 1 thread per block + test_cg_thread_block_type(max(2, rand() % maxThreadsPerBlock)); + } +} diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/hipCGThreadBlockTypeViaBaseType.cc b/projects/hip-tests/catch/unit/cooperativeGrps/hipCGThreadBlockTypeViaBaseType.cc new file mode 100644 index 0000000000..69f5e91ad2 --- /dev/null +++ b/projects/hip-tests/catch/unit/cooperativeGrps/hipCGThreadBlockTypeViaBaseType.cc @@ -0,0 +1,136 @@ +/* +Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * TEST: %t + * HIT_END + */ + +#include +#include "hip/hip_cooperative_groups.h" +#include + +#define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs) + +using namespace cooperative_groups; + +static __global__ +void kernel_cg_thread_block_type_via_base_type(int *sizeTestD, + int *thdRankTestD, + int *syncTestD) +{ + thread_group tg = this_thread_block(); + int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x; + + // Test size + sizeTestD[gIdx] = tg.size(); + + // Test thread_rank + thdRankTestD[gIdx] = tg.thread_rank(); + + // Test sync + __shared__ int sm[2]; + if (threadIdx.x == 0) + sm[0] = 10; + else if (threadIdx.x == 1) + sm[1] = 20; + tg.sync(); + syncTestD[gIdx] = sm[1] * sm[0]; +} + +static void test_cg_thread_block_type_via_base_type(int blockSize) +{ + int nBytes = sizeof(int) * 2 * blockSize; + int *sizeTestD, *sizeTestH; + int *thdRankTestD, *thdRankTestH; + int *syncTestD, *syncTestH; + + // Allocate device memory + HIPCHECK(hipMalloc(&sizeTestD, nBytes)); + HIPCHECK(hipMalloc(&thdRankTestD, nBytes)); + HIPCHECK(hipMalloc(&syncTestD, nBytes)); + + // Allocate host memory + HIPCHECK(hipHostMalloc(&sizeTestH, nBytes)); + HIPCHECK(hipHostMalloc(&thdRankTestH, nBytes)); + HIPCHECK(hipHostMalloc(&syncTestH, nBytes)); + + // Launch Kernel + hipLaunchKernelGGL(kernel_cg_thread_block_type_via_base_type, + 2, + blockSize, + 0, + 0, + sizeTestD, + thdRankTestD, + syncTestD); + + // Copy result from device to host + HIPCHECK(hipMemcpy(sizeTestH, sizeTestD, nBytes, hipMemcpyDeviceToHost)); + HIPCHECK(hipMemcpy(thdRankTestH, thdRankTestD, nBytes, hipMemcpyDeviceToHost)); + HIPCHECK(hipMemcpy(syncTestH, syncTestD, nBytes, hipMemcpyDeviceToHost)); + + // Validate results for both blocks together + for (int i = 0; i < 2 * blockSize; ++i) { + ASSERT_EQUAL(sizeTestH[i], blockSize); + ASSERT_EQUAL(thdRankTestH[i], i % blockSize); + ASSERT_EQUAL(syncTestH[i], 200); + } + + // Free device memory + HIPCHECK(hipFree(sizeTestD)); + HIPCHECK(hipFree(thdRankTestD)); + HIPCHECK(hipFree(syncTestD)); + + //Free host memory + HIPCHECK(hipHostFree(sizeTestH)); + HIPCHECK(hipHostFree(thdRankTestH)); + HIPCHECK(hipHostFree(syncTestH)); +} + +TEST_CASE("Unit_hipCGThreadBlockType_BaseType") { + // Use default device for validating the test + int deviceId; + hipDeviceProp_t deviceProperties; + HIPCHECK(hipGetDevice(&deviceId)); + HIPCHECK(hipGetDeviceProperties(&deviceProperties, deviceId)); + + if (!deviceProperties.cooperativeLaunch) { + HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!"); + return; + } + + // Test for blockSizes in powers of 2 + int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock; + for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) { + test_cg_thread_block_type_via_base_type(blockSize); + } + + // Test for random blockSizes, but the sequence is the same every execution + srand(0); + for (int i = 0; i < 10; i++) { + // Test fails for only 1 thread per block + test_cg_thread_block_type_via_base_type(max(2, rand() % maxThreadsPerBlock)); + } +} diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/hipCGThreadBlockTypeViaPublicApi.cc b/projects/hip-tests/catch/unit/cooperativeGrps/hipCGThreadBlockTypeViaPublicApi.cc new file mode 100644 index 0000000000..f4913ad2c7 --- /dev/null +++ b/projects/hip-tests/catch/unit/cooperativeGrps/hipCGThreadBlockTypeViaPublicApi.cc @@ -0,0 +1,136 @@ +/* +Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * TEST: %t + * HIT_END + */ + +#include +#include "hip/hip_cooperative_groups.h" +#include + +#define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs) + +using namespace cooperative_groups; + +static __global__ +void kernel_cg_thread_block_type_via_public_api(int *sizeTestD, + int *thdRankTestD, + int *syncTestD) +{ + thread_block tb = this_thread_block(); + int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x; + + // Test group_size api + sizeTestD[gIdx] = group_size(tb); + + // Test thread_rank api + thdRankTestD[gIdx] = thread_rank(tb); + + // Test sync api + __shared__ int sm[2]; + if (threadIdx.x == 0) + sm[0] = 10; + else if (threadIdx.x == 1) + sm[1] = 20; + sync(tb); + syncTestD[gIdx] = sm[1] * sm[0]; +} + +static void test_cg_thread_block_type_via_public_api(int blockSize) +{ + int nBytes = sizeof(int) * 2 * blockSize; + int *sizeTestD, *sizeTestH; + int *thdRankTestD, *thdRankTestH; + int *syncTestD, *syncTestH; + + // Allocate device memory + HIPCHECK(hipMalloc(&sizeTestD, nBytes)); + HIPCHECK(hipMalloc(&thdRankTestD, nBytes)); + HIPCHECK(hipMalloc(&syncTestD, nBytes)); + + // Allocate host memory + HIPCHECK(hipHostMalloc(&sizeTestH, nBytes)); + HIPCHECK(hipHostMalloc(&thdRankTestH, nBytes)); + HIPCHECK(hipHostMalloc(&syncTestH, nBytes)); + + // Launch Kernel + hipLaunchKernelGGL(kernel_cg_thread_block_type_via_public_api, + 2, + blockSize, + 0, + 0, + sizeTestD, + thdRankTestD, + syncTestD); + + // Copy result from device to host + HIPCHECK(hipMemcpy(sizeTestH, sizeTestD, nBytes, hipMemcpyDeviceToHost)); + HIPCHECK(hipMemcpy(thdRankTestH, thdRankTestD, nBytes, hipMemcpyDeviceToHost)); + HIPCHECK(hipMemcpy(syncTestH, syncTestD, nBytes, hipMemcpyDeviceToHost)); + + // Validate results for both blocks together + for (int i = 0; i < 2 * blockSize; ++i) { + ASSERT_EQUAL(sizeTestH[i], blockSize); + ASSERT_EQUAL(thdRankTestH[i], i % blockSize); + ASSERT_EQUAL(syncTestH[i], 200); + } + + // Free device memory + HIPCHECK(hipFree(sizeTestD)); + HIPCHECK(hipFree(thdRankTestD)); + HIPCHECK(hipFree(syncTestD)); + + //Free host memory + HIPCHECK(hipHostFree(sizeTestH)); + HIPCHECK(hipHostFree(thdRankTestH)); + HIPCHECK(hipHostFree(syncTestH)); +} + +TEST_CASE("Unit_hipCGThreadBlockType_PublicApi") { + // Use default device for validating the test + int deviceId; + hipDeviceProp_t deviceProperties; + HIPCHECK(hipGetDevice(&deviceId)); + HIPCHECK(hipGetDeviceProperties(&deviceProperties, deviceId)); + + if (!deviceProperties.cooperativeLaunch) { + HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!"); + return; + } + + // Test for blockSizes in powers of 2 + int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock; + for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) { + test_cg_thread_block_type_via_public_api(blockSize); + } + + // Test for random blockSizes, but the sequence is the same every execution + srand(0); + for (int i = 0; i < 10; i++) { + // Test fails for only 1 thread per block + test_cg_thread_block_type_via_public_api(max(2, rand() % maxThreadsPerBlock)); + } +}