SWDEV-348282 - fixed intermittent failure fo hipCGMultiGrid* and hipCGThread* tests, porting over to catch2 in the process (#3067)

Change-Id: Id22dfb4ab2ee8171242fbf6a10886ff3e4abc926


[ROCm/hip-tests commit: 74a6e407e7]
このコミットが含まれているのは:
ROCm CI Service Account
2022-11-19 01:44:12 +05:30
committed by GitHub
コミット bd2be139f6
8個のファイルの変更1163行の追加0行の削除
+1
ファイルの表示
@@ -33,6 +33,7 @@ add_subdirectory(kernel)
add_subdirectory(multiThread)
add_subdirectory(compiler)
add_subdirectory(errorHandling)
add_subdirectory(cooperativeGrps)
if(HIP_PLATFORM STREQUAL "amd")
add_subdirectory(clock)
endif()
+22
ファイルの表示
@@ -0,0 +1,22 @@
# Common Tests - Test independent of all platforms
set(TEST_SRC
hipCGThreadBlockType.cc
hipCGThreadBlockTypeViaBaseType.cc
hipCGThreadBlockTypeViaPublicApi.cc
hipCGMultiGridGroupType.cc
hipCGMultiGridGroupTypeViaBaseType.cc
hipCGMultiGridGroupTypeViaPublicApi.cc
)
if(HIP_PLATFORM STREQUAL "nvidia")
set_source_files_properties(hipCGMultiGridGroupType.cc PROPERTIES COMPILE_FLAGS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
set_source_files_properties(hipCGMultiGridGroupTypeViaBaseType.cc PROPERTIES COMPILE_FLAGS "-D_CG_ABI_EXPERIMENTAL -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
set_source_files_properties(hipCGMultiGridGroupTypeViaPublicApi.cc PROPERTIES COMPILE_FLAGS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
hip_add_exe_to_target(NAME coopGrpTest
TEST_SRC ${TEST_SRC}
TEST_TARGET_NAME build_tests
LINKER_LIBS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
else()
hip_add_exe_to_target(NAME coopGrpTest
TEST_SRC ${TEST_SRC}
TEST_TARGET_NAME build_tests)
endif()
+240
ファイルの表示
@@ -0,0 +1,240 @@
/*
Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80
* TEST: %t
* HIT_END
*/
#include <hip_test_common.hh>
#include <hip/hip_cooperative_groups.h>
#define ASSERT_EQUAL(lhs, rhs) HIPASSERT(lhs == rhs)
#define ASSERT_LE(lhs, rhs) HIPASSERT(lhs <= rhs)
#define ASSERT_GE(lhs, rhs) HIPASSERT(lhs >= rhs)
using namespace cooperative_groups;
constexpr int MaxGPUs = 8;
static __global__
void kernel_cg_multi_grid_group_type(int* numGridsTestD,
int* gridRankTestD,
int *sizeTestD,
int *thdRankTestD,
int *isValidTestD,
int *syncTestD,
int *syncResultD)
{
multi_grid_group mg = this_multi_grid();
int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;
// Test num_grids
numGridsTestD[gIdx] = mg.num_grids();
// Test grid_rank
gridRankTestD[gIdx] = mg.grid_rank();
// Test size
sizeTestD[gIdx] = mg.size();
// Test thread_rank
thdRankTestD[gIdx] = mg.thread_rank();
// Test is_valid
isValidTestD[gIdx] = mg.is_valid();
// Test sync
//
// Eech thread assign 1 to their respective location
syncTestD[gIdx] = 1;
// Grid level sync
this_grid().sync();
// Thread 0 from work-group 0 of current grid (gpu) does grid level reduction
if (blockIdx.x == 0 && threadIdx.x == 0) {
for (uint i = 1; i < gridDim.x * blockDim.x; ++i) {
syncTestD[0] += syncTestD[i];
}
syncResultD[mg.grid_rank() + 1] = syncTestD[0];
}
// multi-grid level sync
mg.sync();
// grid (gpu) 0 does final reduction across all grids (gpus)
if (mg.grid_rank() == 0 && blockIdx.x == 0 && threadIdx.x == 0) {
syncResultD[0] = 0;
for (uint i = 1; i <= mg.num_grids(); ++i) {
syncResultD[0] += syncResultD[i];
}
}
}
static void test_cg_multi_grid_group_type(int blockSize, int nGpu)
{
// Create a stream each device
hipStream_t stream[MaxGPUs];
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipSetDevice(i));
HIPCHECK(hipDeviceSynchronize()); // Make sure work is done on this device
HIPCHECK(hipStreamCreate(&stream[i]));
}
// Allocate host and device memory
int nBytes = sizeof(int) * 2 * blockSize;
int *numGridsTestD[MaxGPUs], *numGridsTestH[MaxGPUs];
int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs];
int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs];
int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs];
int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs];
int *syncTestD[MaxGPUs], *syncResultD;
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipSetDevice(i));
HIPCHECK(hipMalloc(&numGridsTestD[i], nBytes));
HIPCHECK(hipMalloc(&gridRankTestD[i], nBytes));
HIPCHECK(hipMalloc(&sizeTestD[i], nBytes));
HIPCHECK(hipMalloc(&thdRankTestD[i], nBytes));
HIPCHECK(hipMalloc(&isValidTestD[i], nBytes));
HIPCHECK(hipMalloc(&syncTestD[i], nBytes));
HIPCHECK(hipHostMalloc(&numGridsTestH[i], nBytes));
HIPCHECK(hipHostMalloc(&gridRankTestH[i], nBytes));
HIPCHECK(hipHostMalloc(&sizeTestH[i], nBytes));
HIPCHECK(hipHostMalloc(&thdRankTestH[i], nBytes));
HIPCHECK(hipHostMalloc(&isValidTestH[i], nBytes));
if (i == 0) {
HIPCHECK(hipHostMalloc(&syncResultD, sizeof(int) * (nGpu + 1), hipHostMallocCoherent));
}
}
// Launch Kernel
constexpr int NumKernelArgs = 7;
hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu];
void* args[MaxGPUs * NumKernelArgs];
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipSetDevice(i));
args[i * NumKernelArgs] = &numGridsTestD[i];
args[i * NumKernelArgs + 1] = &gridRankTestD[i];
args[i * NumKernelArgs + 2] = &sizeTestD[i];
args[i * NumKernelArgs + 3] = &thdRankTestD[i];
args[i * NumKernelArgs + 4] = &isValidTestD[i];
args[i * NumKernelArgs + 5] = &syncTestD[i];
args[i * NumKernelArgs + 6] = &syncResultD;
launchParamsList[i].func = reinterpret_cast<void*>(kernel_cg_multi_grid_group_type);
launchParamsList[i].gridDim = 2;
launchParamsList[i].blockDim = blockSize;
launchParamsList[i].sharedMem = 0;
launchParamsList[i].stream = stream[i];
launchParamsList[i].args = &args[i * NumKernelArgs];
}
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(launchParamsList, nGpu, 0));
// Copy result from device to host
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipSetDevice(i));
HIPCHECK(hipMemcpy(numGridsTestH[i], numGridsTestD[i], nBytes, hipMemcpyDeviceToHost));
HIPCHECK(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost));
HIPCHECK(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost));
HIPCHECK(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost));
HIPCHECK(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost));
}
// Validate results
int gridsSeen[MaxGPUs];
for (int i = 0; i < nGpu; ++i) {
for (int j = 0; j < 2 * blockSize; ++j) {
ASSERT_EQUAL(numGridsTestH[i][j], nGpu);
ASSERT_GE(gridRankTestH[i][j], 0);
ASSERT_LE(gridRankTestH[i][j], nGpu-1);
ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]);
ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize);
int gridRank = gridRankTestH[i][j];
ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j);
ASSERT_EQUAL(isValidTestH[i][j], 1);
}
ASSERT_EQUAL(syncResultD[i+1], 2 * blockSize);
// Validate uniqueness property of grid rank
gridsSeen[i] = gridRankTestH[i][0];
for (int k = 0; k < i; ++k) {
if (gridsSeen[k] == gridsSeen[i]) {
assert(false && "Grid rank in multi-gpu setup should be unique");
}
}
}
ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize);
// Free host and device memory
delete [] launchParamsList;
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipSetDevice(i));
HIPCHECK(hipFree(numGridsTestD[i]));
HIPCHECK(hipFree(gridRankTestD[i]));
HIPCHECK(hipFree(sizeTestD[i]));
HIPCHECK(hipFree(thdRankTestD[i]));
HIPCHECK(hipFree(isValidTestD[i]));
HIPCHECK(hipFree(syncTestD[i]));
if (i == 0) {
HIPCHECK(hipHostFree(syncResultD));
}
HIPCHECK(hipHostFree(numGridsTestH[i]));
HIPCHECK(hipHostFree(gridRankTestH[i]));
HIPCHECK(hipHostFree(sizeTestH[i]));
HIPCHECK(hipHostFree(thdRankTestH[i]));
HIPCHECK(hipHostFree(isValidTestH[i]));
}
}
TEST_CASE("Unit_hipCGMultiGridGroupType") {
int nGpu = 0;
HIPCHECK(hipGetDeviceCount(&nGpu));
nGpu = min(nGpu, MaxGPUs);
// Set `maxThreadsPerBlock` by taking minimum among all available devices
int maxThreadsPerBlock = INT_MAX;
hipDeviceProp_t deviceProperties;
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipGetDeviceProperties(&deviceProperties, i));
if (!deviceProperties.cooperativeMultiDeviceLaunch) {
HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
return;
}
maxThreadsPerBlock = min(maxThreadsPerBlock, deviceProperties.maxThreadsPerBlock);
}
// Test for blockSizes in powers of 2
for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) {
test_cg_multi_grid_group_type(blockSize, nGpu);
}
// Test for random blockSizes, but the sequence is the same every execution
srand(0);
for (int i = 0; i < 10; i++) {
// Test fails for 0 thread per block
test_cg_multi_grid_group_type(max(2, rand() % maxThreadsPerBlock), nGpu);
}
}
+234
ファイルの表示
@@ -0,0 +1,234 @@
/*
Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -D_CG_ABI_EXPERIMENTAL -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80
* TEST: %t
* HIT_END
*/
#include <hip_test_common.hh>
#include <hip/hip_cooperative_groups.h>
#include <cmath>
#include <cstdlib>
#include <climits>
#define ASSERT_EQUAL(lhs, rhs) HIPASSERT(lhs == rhs)
#define ASSERT_LE(lhs, rhs) HIPASSERT(lhs <= rhs)
#define ASSERT_GE(lhs, rhs) HIPASSERT(lhs >= rhs)
using namespace cooperative_groups;
constexpr int MaxGPUs = 8;
static __global__
void kernel_cg_multi_grid_group_type_via_base_type(int *sizeTestD,
int* gridRankTestD,
int *thdRankTestD,
int *isValidTestD,
int *syncTestD,
int *syncResultD)
{
thread_group tg = this_multi_grid(); // This can work if _CG_ABI_EXPERIMENTAL defined on Cuda
int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;
// Test size
sizeTestD[gIdx] = tg.size();
// Test thread_rank
gridRankTestD[gIdx] = this_multi_grid().grid_rank();
thdRankTestD[gIdx] = tg.thread_rank();
// Test is_valid
#ifdef __HIP_PLATFORM_AMD__
isValidTestD[gIdx] = tg.is_valid();
#else
// Cuda has no thread_group.is_valid()
isValidTestD[gIdx] = true;
#endif
// Test sync
//
// Eech thread assign 1 to their respective location
syncTestD[gIdx] = 1;
// Grid level sync
this_grid().sync();
// Thread 0 from work-group 0 of current grid (gpu) does grid level reduction
if (blockIdx.x == 0 && threadIdx.x == 0) {
for (uint i = 1; i < gridDim.x * blockDim.x; ++i) {
syncTestD[0] += syncTestD[i];
}
syncResultD[this_multi_grid().grid_rank() + 1] = syncTestD[0];
}
// multi-grid level sync
tg.sync();
// grid (gpu) 0 does final reduction across all grids (gpus)
if (this_multi_grid().grid_rank() == 0 && blockIdx.x == 0 && threadIdx.x == 0) {
syncResultD[0] = 0;
for (uint i = 1; i <= this_multi_grid().num_grids(); ++i) {
syncResultD[0] += syncResultD[i];
}
}
}
static void test_cg_multi_grid_group_type_via_base_type(int blockSize, int nGpu)
{
// Create a stream each device
hipStream_t stream[MaxGPUs];
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipSetDevice(i));
HIPCHECK(hipDeviceSynchronize()); // Make sure work is done on this device
HIPCHECK(hipStreamCreate(&stream[i]));
}
// Allocate host and device memory
int nBytes = sizeof(int) * 2 * blockSize;
int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs];
int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs];
int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs];
int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs];
int *syncTestD[MaxGPUs], *syncResultD;
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipSetDevice(i));
HIPCHECK(hipMalloc(&sizeTestD[i], nBytes));
HIPCHECK(hipMalloc(&gridRankTestD[i], nBytes));
HIPCHECK(hipMalloc(&thdRankTestD[i], nBytes));
HIPCHECK(hipMalloc(&isValidTestD[i], nBytes));
HIPCHECK(hipMalloc(&syncTestD[i], nBytes));
HIPCHECK(hipHostMalloc(&sizeTestH[i], nBytes));
HIPCHECK(hipHostMalloc(&gridRankTestH[i], nBytes));
HIPCHECK(hipHostMalloc(&thdRankTestH[i], nBytes));
HIPCHECK(hipHostMalloc(&isValidTestH[i], nBytes));
if (i == 0) {
HIPCHECK(hipHostMalloc(&syncResultD, sizeof(int) * (nGpu + 1), hipHostMallocCoherent));
}
}
// Launch Kernel
constexpr int NumKernelArgs = 6;
hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu];
void* args[MaxGPUs * NumKernelArgs];
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipSetDevice(i));
args[i * NumKernelArgs ] = &sizeTestD[i];
args[i * NumKernelArgs + 1] = &gridRankTestD[i];
args[i * NumKernelArgs + 2] = &thdRankTestD[i];
args[i * NumKernelArgs + 3] = &isValidTestD[i];
args[i * NumKernelArgs + 4] = &syncTestD[i];
args[i * NumKernelArgs + 5] = &syncResultD;
launchParamsList[i].func = reinterpret_cast<void*>(kernel_cg_multi_grid_group_type_via_base_type);
launchParamsList[i].gridDim = 2;
launchParamsList[i].blockDim = blockSize;
launchParamsList[i].sharedMem = 0;
launchParamsList[i].stream = stream[i];
launchParamsList[i].args = &args[i * NumKernelArgs];
}
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(launchParamsList, nGpu, 0));
// Copy result from device to host
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipSetDevice(i));
HIPCHECK(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost));
HIPCHECK(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost));
HIPCHECK(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost));
HIPCHECK(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost));
}
// Validate results
int gridsSeen[MaxGPUs];
for (int i = 0; i < nGpu; ++i) {
for (int j = 0; j < 2 * blockSize; ++j) {
ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize);
ASSERT_GE(gridRankTestH[i][j], 0);
ASSERT_LE(gridRankTestH[i][j], nGpu-1);
ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]);
int gridRank = gridRankTestH[i][j];
ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j);
ASSERT_EQUAL(isValidTestH[i][j], 1);
}
ASSERT_EQUAL(syncResultD[i+1], 2 * blockSize);
// Validate uniqueness property of grid rank
gridsSeen[i] = gridRankTestH[i][0];
for (int k = 0; k < i; ++k) {
if (gridsSeen[k] == gridsSeen[i]) {
assert (false && "Grid rank in multi-gpu setup should be unique");
}
}
}
ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize);
// Free host and device memory
delete [] launchParamsList;
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipSetDevice(i));
HIPCHECK(hipFree(sizeTestD[i]));
HIPCHECK(hipFree(gridRankTestD[i]));
HIPCHECK(hipFree(thdRankTestD[i]));
HIPCHECK(hipFree(isValidTestD[i]));
HIPCHECK(hipFree(syncTestD[i]));
if (i == 0)
HIPCHECK(hipHostFree(syncResultD));
HIPCHECK(hipHostFree(sizeTestH[i]));
HIPCHECK(hipHostFree(gridRankTestH[i]));
HIPCHECK(hipHostFree(thdRankTestH[i]));
HIPCHECK(hipHostFree(isValidTestH[i]));
}
}
TEST_CASE("Unit_hipCGMultiGridGroupType_BaseType") {
// Set `maxThreadsPerBlock` by taking minimum among all available devices
int nGpu = 0;
HIPCHECK(hipGetDeviceCount(&nGpu));
nGpu = min(nGpu, MaxGPUs);
int maxThreadsPerBlock = INT_MAX;
hipDeviceProp_t deviceProperties;
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipGetDeviceProperties(&deviceProperties, i));
if (!deviceProperties.cooperativeMultiDeviceLaunch) {
HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
return;
}
maxThreadsPerBlock = min(maxThreadsPerBlock, deviceProperties.maxThreadsPerBlock);
}
// Test for blockSizes in powers of 2
for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) {
test_cg_multi_grid_group_type_via_base_type(blockSize, nGpu);
}
// Test for random blockSizes, but the sequence is the same every execution
srand(0);
for (int i = 0; i < 10; i++) {
// Test fails for 0 thread per block
test_cg_multi_grid_group_type_via_base_type(max(2, rand() % maxThreadsPerBlock), nGpu);
}
}
+230
ファイルの表示
@@ -0,0 +1,230 @@
/*
Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80
* TEST: %t
* HIT_END
*/
#include <hip_test_common.hh>
#include <hip/hip_cooperative_groups.h>
#include <cmath>
#include <cstdlib>
#include <climits>
#define ASSERT_EQUAL(lhs, rhs) HIPASSERT(lhs == rhs)
#define ASSERT_LE(lhs, rhs) HIPASSERT(lhs <= rhs)
#define ASSERT_GE(lhs, rhs) HIPASSERT(lhs >= rhs)
using namespace cooperative_groups;
constexpr int MaxGPUs = 8;
static __global__
void kernel_cg_multi_grid_group_type_via_public_api(int *sizeTestD,
int* gridRankTestD,
int *thdRankTestD,
int *isValidTestD,
int *syncTestD,
int *syncResultD)
{
multi_grid_group mg = this_multi_grid();
int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;
// Test group_size api
sizeTestD[gIdx] = group_size(mg);
// Test thread_rank api
gridRankTestD[gIdx] = this_multi_grid().grid_rank();
thdRankTestD[gIdx] = thread_rank(mg);
// Test is_valid api
isValidTestD[gIdx] = mg.is_valid();
// Test sync api
//
// Eech thread assign 1 to their respective location
syncTestD[gIdx] = 1;
// Grid level sync
sync(this_grid());
// Thread 0 from work-group 0 of current grid (gpu) does grid level reduction
if (blockIdx.x == 0 && threadIdx.x == 0) {
for (uint i = 1; i < gridDim.x * blockDim.x; ++i) {
syncTestD[0] += syncTestD[i];
}
syncResultD[this_multi_grid().grid_rank() + 1] = syncTestD[0];
}
// multi-grid level sync via public api
sync(mg);
// grid (gpu) 0 does final reduction across all grids (gpus)
if (this_multi_grid().grid_rank() == 0 && blockIdx.x == 0 && threadIdx.x == 0) {
syncResultD[0] = 0;
for (uint i = 1; i <= this_multi_grid().num_grids(); ++i) {
syncResultD[0] += syncResultD[i];
}
}
}
static void test_cg_multi_grid_group_type_via_public_api(int blockSize, int nGpu)
{
// Create a stream each device
hipStream_t stream[MaxGPUs];
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipSetDevice(i));
HIPCHECK(hipDeviceSynchronize()); // Make sure work is done on this device
HIPCHECK(hipStreamCreate(&stream[i]));
}
// Allocate host and device memory
int nBytes = sizeof(int) * 2 * blockSize;
int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs];
int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs];
int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs];
int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs];
int *syncTestD[MaxGPUs], *syncResultD;
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipSetDevice(i));
HIPCHECK(hipMalloc(&sizeTestD[i], nBytes));
HIPCHECK(hipMalloc(&gridRankTestD[i], nBytes));
HIPCHECK(hipMalloc(&thdRankTestD[i], nBytes));
HIPCHECK(hipMalloc(&isValidTestD[i], nBytes));
HIPCHECK(hipMalloc(&syncTestD[i], nBytes));
HIPCHECK(hipHostMalloc(&sizeTestH[i], nBytes));
HIPCHECK(hipHostMalloc(&gridRankTestH[i], nBytes));
HIPCHECK(hipHostMalloc(&thdRankTestH[i], nBytes));
HIPCHECK(hipHostMalloc(&isValidTestH[i], nBytes));
if (i == 0) {
HIPCHECK(hipHostMalloc(&syncResultD, sizeof(int) * (nGpu + 1), hipHostMallocCoherent));
}
}
// Launch Kernel
constexpr int NumKernelArgs = 6;
hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu];
void* args[MaxGPUs * NumKernelArgs];
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipSetDevice(i));
args[i * NumKernelArgs ] = &sizeTestD[i];
args[i * NumKernelArgs + 1] = &gridRankTestD[i];
args[i * NumKernelArgs + 2] = &thdRankTestD[i];
args[i * NumKernelArgs + 3] = &isValidTestD[i];
args[i * NumKernelArgs + 4] = &syncTestD[i];
args[i * NumKernelArgs + 5] = &syncResultD;
launchParamsList[i].func = reinterpret_cast<void*>(kernel_cg_multi_grid_group_type_via_public_api);
launchParamsList[i].gridDim = 2;
launchParamsList[i].blockDim = blockSize;
launchParamsList[i].sharedMem = 0;
launchParamsList[i].stream = stream[i];
launchParamsList[i].args = &args[i * NumKernelArgs];
}
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(launchParamsList, nGpu, 0));
// Copy result from device to host
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipSetDevice(i));
HIPCHECK(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost));
HIPCHECK(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost));
HIPCHECK(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost));
HIPCHECK(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost));
}
// Validate results
int gridsSeen[MaxGPUs];
for (int i = 0; i < nGpu; ++i) {
for (int j = 0; j < 2 * blockSize; ++j) {
ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize);
ASSERT_GE(gridRankTestH[i][j], 0);
ASSERT_LE(gridRankTestH[i][j], nGpu-1);
ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]);
int gridRank = gridRankTestH[i][j];
ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j);
ASSERT_EQUAL(isValidTestH[i][j], 1);
}
ASSERT_EQUAL(syncResultD[i+1], 2 * blockSize);
// Validate uniqueness property of grid rank
gridsSeen[i] = gridRankTestH[i][0];
for (int k = 0; k < i; ++k) {
if (gridsSeen[k] == gridsSeen[i]) {
assert (false && "Grid rank in multi-gpu setup should be unique");
}
}
}
ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize);
// Free host and device memory
delete [] launchParamsList;
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipSetDevice(i));
HIPCHECK(hipFree(sizeTestD[i]));
HIPCHECK(hipFree(gridRankTestD[i]));
HIPCHECK(hipFree(thdRankTestD[i]));
HIPCHECK(hipFree(isValidTestD[i]));
HIPCHECK(hipFree(syncTestD[i]));
if (i == 0)
HIPCHECK(hipHostFree(syncResultD));
HIPCHECK(hipHostFree(sizeTestH[i]));
HIPCHECK(hipHostFree(gridRankTestH[i]));
HIPCHECK(hipHostFree(thdRankTestH[i]));
HIPCHECK(hipHostFree(isValidTestH[i]));
}
}
TEST_CASE("Unit_hipCGMultiGridGroupType_PublicApi") {
// Set `maxThreadsPerBlock` by taking minimum among all available devices
int nGpu = 0;
HIPCHECK(hipGetDeviceCount(&nGpu));
nGpu = min(nGpu, MaxGPUs);
int maxThreadsPerBlock = INT_MAX;
hipDeviceProp_t deviceProperties;
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipGetDeviceProperties(&deviceProperties, i));
if (!deviceProperties.cooperativeMultiDeviceLaunch) {
HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
return;
}
maxThreadsPerBlock = min(maxThreadsPerBlock, deviceProperties.maxThreadsPerBlock);
}
// Test for blockSizes in powers of 2
for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) {
test_cg_multi_grid_group_type_via_public_api(blockSize, nGpu);
}
// Test for random blockSizes, but the sequence is the same every execution
srand(0);
for (int i = 0; i < 10; i++) {
// Test fails for 0 thread per block
test_cg_multi_grid_group_type_via_public_api(max(2, rand() % maxThreadsPerBlock), nGpu);
}
}
+164
ファイルの表示
@@ -0,0 +1,164 @@
/*
Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp
* TEST: %t
* HIT_END
*/
#include <hip_test_common.hh>
#include <hip/hip_cooperative_groups.h>
#include <cstdlib>
#define ASSERT_EQUAL(lhs, rhs) HIPASSERT(lhs == rhs)
using namespace cooperative_groups;
static __global__
void kernel_cg_thread_block_type(int *sizeTestD,
int *thdRankTestD,
int *syncTestD,
dim3 *groupIndexTestD,
dim3 *thdIndexTestD)
{
thread_block tb = this_thread_block();
int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;
// Test size
sizeTestD[gIdx] = tb.size();
// Test thread_rank
thdRankTestD[gIdx] = tb.thread_rank();
// Test sync
__shared__ int sm[2];
if (threadIdx.x == 0)
sm[0] = 10;
else if (threadIdx.x == 1)
sm[1] = 20;
tb.sync();
syncTestD[gIdx] = sm[1] * sm[0];
// Test group_index
groupIndexTestD[gIdx] = tb.group_index();
// Test thread_index
thdIndexTestD[gIdx] = tb.thread_index();
}
static void test_cg_thread_block_type(int blockSize)
{
int nBytes = sizeof(int) * 2 * blockSize;
int nDim3Bytes = sizeof(dim3) * 2 * blockSize;
int *sizeTestD, *sizeTestH;
int *thdRankTestD, *thdRankTestH;
int *syncTestD, *syncTestH;
dim3 *groupIndexTestD, *groupIndexTestH;
dim3 *thdIndexTestD, *thdIndexTestH;
// Allocate device memory
HIPCHECK(hipMalloc(&sizeTestD, nBytes));
HIPCHECK(hipMalloc(&thdRankTestD, nBytes));
HIPCHECK(hipMalloc(&syncTestD, nBytes));
HIPCHECK(hipMalloc(&groupIndexTestD, nDim3Bytes));
HIPCHECK(hipMalloc(&thdIndexTestD, nDim3Bytes));
// Allocate host memory
HIPCHECK(hipHostMalloc(&sizeTestH, nBytes));
HIPCHECK(hipHostMalloc(&thdRankTestH, nBytes));
HIPCHECK(hipHostMalloc(&syncTestH, nBytes));
HIPCHECK(hipHostMalloc(&groupIndexTestH, nDim3Bytes));
HIPCHECK(hipHostMalloc(&thdIndexTestH, nDim3Bytes));
// Launch Kernel
hipLaunchKernelGGL(kernel_cg_thread_block_type,
2,
blockSize,
0,
0,
sizeTestD,
thdRankTestD,
syncTestD,
groupIndexTestD,
thdIndexTestD);
// Copy result from device to host
HIPCHECK(hipMemcpy(sizeTestH, sizeTestD, nBytes, hipMemcpyDeviceToHost));
HIPCHECK(hipMemcpy(thdRankTestH, thdRankTestD, nBytes, hipMemcpyDeviceToHost));
HIPCHECK(hipMemcpy(syncTestH, syncTestD, nBytes, hipMemcpyDeviceToHost));
HIPCHECK(hipMemcpy(groupIndexTestH, groupIndexTestD, nDim3Bytes, hipMemcpyDeviceToHost));
HIPCHECK(hipMemcpy(thdIndexTestH, thdIndexTestD, nDim3Bytes, hipMemcpyDeviceToHost));
// Validate results for both blocks together
for (int i = 0; i < 2 * blockSize; ++i) {
ASSERT_EQUAL(sizeTestH[i], blockSize);
ASSERT_EQUAL(thdRankTestH[i], i % blockSize);
ASSERT_EQUAL(syncTestH[i], 200);
ASSERT_EQUAL(groupIndexTestH[i].x, (uint) i / blockSize);
ASSERT_EQUAL(groupIndexTestH[i].y, 0);
ASSERT_EQUAL(groupIndexTestH[i].z, 0);
ASSERT_EQUAL(thdIndexTestH[i].x, (uint) i % blockSize);
ASSERT_EQUAL(thdIndexTestH[i].y, 0);
ASSERT_EQUAL(thdIndexTestH[i].z, 0);
}
// Free device memory
HIPCHECK(hipFree(sizeTestD));
HIPCHECK(hipFree(thdRankTestD));
HIPCHECK(hipFree(syncTestD));
HIPCHECK(hipFree(groupIndexTestD));
HIPCHECK(hipFree(thdIndexTestD));
//Free host memory
HIPCHECK(hipHostFree(sizeTestH));
HIPCHECK(hipHostFree(thdRankTestH));
HIPCHECK(hipHostFree(syncTestH));
HIPCHECK(hipHostFree(groupIndexTestH));
HIPCHECK(hipHostFree(thdIndexTestH));
}
TEST_CASE("Unit_hipCGThreadBlockType") {
// Use default device for validating the test
int deviceId;
hipDeviceProp_t deviceProperties;
HIPCHECK(hipGetDevice(&deviceId));
HIPCHECK(hipGetDeviceProperties(&deviceProperties, deviceId));
if (!deviceProperties.cooperativeLaunch) {
HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
return;
}
// Test for blockSizes in powers of 2
int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) {
test_cg_thread_block_type(blockSize);
}
// Test for random blockSizes, but the sequence is the same every execution
srand(0);
for (int i = 0; i < 10; i++) {
// Test fails for only 1 thread per block
test_cg_thread_block_type(max(2, rand() % maxThreadsPerBlock));
}
}
+136
ファイルの表示
@@ -0,0 +1,136 @@
/*
Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp
* TEST: %t
* HIT_END
*/
#include <hip_test_common.hh>
#include "hip/hip_cooperative_groups.h"
#include <cstdlib>
#define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs)
using namespace cooperative_groups;
static __global__
void kernel_cg_thread_block_type_via_base_type(int *sizeTestD,
int *thdRankTestD,
int *syncTestD)
{
thread_group tg = this_thread_block();
int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;
// Test size
sizeTestD[gIdx] = tg.size();
// Test thread_rank
thdRankTestD[gIdx] = tg.thread_rank();
// Test sync
__shared__ int sm[2];
if (threadIdx.x == 0)
sm[0] = 10;
else if (threadIdx.x == 1)
sm[1] = 20;
tg.sync();
syncTestD[gIdx] = sm[1] * sm[0];
}
static void test_cg_thread_block_type_via_base_type(int blockSize)
{
int nBytes = sizeof(int) * 2 * blockSize;
int *sizeTestD, *sizeTestH;
int *thdRankTestD, *thdRankTestH;
int *syncTestD, *syncTestH;
// Allocate device memory
HIPCHECK(hipMalloc(&sizeTestD, nBytes));
HIPCHECK(hipMalloc(&thdRankTestD, nBytes));
HIPCHECK(hipMalloc(&syncTestD, nBytes));
// Allocate host memory
HIPCHECK(hipHostMalloc(&sizeTestH, nBytes));
HIPCHECK(hipHostMalloc(&thdRankTestH, nBytes));
HIPCHECK(hipHostMalloc(&syncTestH, nBytes));
// Launch Kernel
hipLaunchKernelGGL(kernel_cg_thread_block_type_via_base_type,
2,
blockSize,
0,
0,
sizeTestD,
thdRankTestD,
syncTestD);
// Copy result from device to host
HIPCHECK(hipMemcpy(sizeTestH, sizeTestD, nBytes, hipMemcpyDeviceToHost));
HIPCHECK(hipMemcpy(thdRankTestH, thdRankTestD, nBytes, hipMemcpyDeviceToHost));
HIPCHECK(hipMemcpy(syncTestH, syncTestD, nBytes, hipMemcpyDeviceToHost));
// Validate results for both blocks together
for (int i = 0; i < 2 * blockSize; ++i) {
ASSERT_EQUAL(sizeTestH[i], blockSize);
ASSERT_EQUAL(thdRankTestH[i], i % blockSize);
ASSERT_EQUAL(syncTestH[i], 200);
}
// Free device memory
HIPCHECK(hipFree(sizeTestD));
HIPCHECK(hipFree(thdRankTestD));
HIPCHECK(hipFree(syncTestD));
//Free host memory
HIPCHECK(hipHostFree(sizeTestH));
HIPCHECK(hipHostFree(thdRankTestH));
HIPCHECK(hipHostFree(syncTestH));
}
TEST_CASE("Unit_hipCGThreadBlockType_BaseType") {
// Use default device for validating the test
int deviceId;
hipDeviceProp_t deviceProperties;
HIPCHECK(hipGetDevice(&deviceId));
HIPCHECK(hipGetDeviceProperties(&deviceProperties, deviceId));
if (!deviceProperties.cooperativeLaunch) {
HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
return;
}
// Test for blockSizes in powers of 2
int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) {
test_cg_thread_block_type_via_base_type(blockSize);
}
// Test for random blockSizes, but the sequence is the same every execution
srand(0);
for (int i = 0; i < 10; i++) {
// Test fails for only 1 thread per block
test_cg_thread_block_type_via_base_type(max(2, rand() % maxThreadsPerBlock));
}
}
+136
ファイルの表示
@@ -0,0 +1,136 @@
/*
Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp
* TEST: %t
* HIT_END
*/
#include <hip_test_common.hh>
#include "hip/hip_cooperative_groups.h"
#include <cstdlib>
#define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs)
using namespace cooperative_groups;
static __global__
void kernel_cg_thread_block_type_via_public_api(int *sizeTestD,
int *thdRankTestD,
int *syncTestD)
{
thread_block tb = this_thread_block();
int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;
// Test group_size api
sizeTestD[gIdx] = group_size(tb);
// Test thread_rank api
thdRankTestD[gIdx] = thread_rank(tb);
// Test sync api
__shared__ int sm[2];
if (threadIdx.x == 0)
sm[0] = 10;
else if (threadIdx.x == 1)
sm[1] = 20;
sync(tb);
syncTestD[gIdx] = sm[1] * sm[0];
}
static void test_cg_thread_block_type_via_public_api(int blockSize)
{
int nBytes = sizeof(int) * 2 * blockSize;
int *sizeTestD, *sizeTestH;
int *thdRankTestD, *thdRankTestH;
int *syncTestD, *syncTestH;
// Allocate device memory
HIPCHECK(hipMalloc(&sizeTestD, nBytes));
HIPCHECK(hipMalloc(&thdRankTestD, nBytes));
HIPCHECK(hipMalloc(&syncTestD, nBytes));
// Allocate host memory
HIPCHECK(hipHostMalloc(&sizeTestH, nBytes));
HIPCHECK(hipHostMalloc(&thdRankTestH, nBytes));
HIPCHECK(hipHostMalloc(&syncTestH, nBytes));
// Launch Kernel
hipLaunchKernelGGL(kernel_cg_thread_block_type_via_public_api,
2,
blockSize,
0,
0,
sizeTestD,
thdRankTestD,
syncTestD);
// Copy result from device to host
HIPCHECK(hipMemcpy(sizeTestH, sizeTestD, nBytes, hipMemcpyDeviceToHost));
HIPCHECK(hipMemcpy(thdRankTestH, thdRankTestD, nBytes, hipMemcpyDeviceToHost));
HIPCHECK(hipMemcpy(syncTestH, syncTestD, nBytes, hipMemcpyDeviceToHost));
// Validate results for both blocks together
for (int i = 0; i < 2 * blockSize; ++i) {
ASSERT_EQUAL(sizeTestH[i], blockSize);
ASSERT_EQUAL(thdRankTestH[i], i % blockSize);
ASSERT_EQUAL(syncTestH[i], 200);
}
// Free device memory
HIPCHECK(hipFree(sizeTestD));
HIPCHECK(hipFree(thdRankTestD));
HIPCHECK(hipFree(syncTestD));
//Free host memory
HIPCHECK(hipHostFree(sizeTestH));
HIPCHECK(hipHostFree(thdRankTestH));
HIPCHECK(hipHostFree(syncTestH));
}
TEST_CASE("Unit_hipCGThreadBlockType_PublicApi") {
// Use default device for validating the test
int deviceId;
hipDeviceProp_t deviceProperties;
HIPCHECK(hipGetDevice(&deviceId));
HIPCHECK(hipGetDeviceProperties(&deviceProperties, deviceId));
if (!deviceProperties.cooperativeLaunch) {
HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
return;
}
// Test for blockSizes in powers of 2
int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) {
test_cg_thread_block_type_via_public_api(blockSize);
}
// Test for random blockSizes, but the sequence is the same every execution
srand(0);
for (int i = 0; i < 10; i++) {
// Test fails for only 1 thread per block
test_cg_thread_block_type_via_public_api(max(2, rand() % maxThreadsPerBlock));
}
}