SWDEV-348282 - fixed intermittent failure fo hipCGMultiGrid* and hipCGThread* tests, porting over to catch2 in the process (#3067)
Change-Id: Id22dfb4ab2ee8171242fbf6a10886ff3e4abc926
[ROCm/hip-tests commit: 74a6e407e7]
このコミットが含まれているのは:
@@ -33,6 +33,7 @@ add_subdirectory(kernel)
|
||||
add_subdirectory(multiThread)
|
||||
add_subdirectory(compiler)
|
||||
add_subdirectory(errorHandling)
|
||||
add_subdirectory(cooperativeGrps)
|
||||
if(HIP_PLATFORM STREQUAL "amd")
|
||||
add_subdirectory(clock)
|
||||
endif()
|
||||
|
||||
@@ -0,0 +1,22 @@
|
||||
# Common Tests - Test independent of all platforms
|
||||
set(TEST_SRC
|
||||
hipCGThreadBlockType.cc
|
||||
hipCGThreadBlockTypeViaBaseType.cc
|
||||
hipCGThreadBlockTypeViaPublicApi.cc
|
||||
hipCGMultiGridGroupType.cc
|
||||
hipCGMultiGridGroupTypeViaBaseType.cc
|
||||
hipCGMultiGridGroupTypeViaPublicApi.cc
|
||||
)
|
||||
if(HIP_PLATFORM STREQUAL "nvidia")
|
||||
set_source_files_properties(hipCGMultiGridGroupType.cc PROPERTIES COMPILE_FLAGS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
|
||||
set_source_files_properties(hipCGMultiGridGroupTypeViaBaseType.cc PROPERTIES COMPILE_FLAGS "-D_CG_ABI_EXPERIMENTAL -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
|
||||
set_source_files_properties(hipCGMultiGridGroupTypeViaPublicApi.cc PROPERTIES COMPILE_FLAGS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
|
||||
hip_add_exe_to_target(NAME coopGrpTest
|
||||
TEST_SRC ${TEST_SRC}
|
||||
TEST_TARGET_NAME build_tests
|
||||
LINKER_LIBS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
|
||||
else()
|
||||
hip_add_exe_to_target(NAME coopGrpTest
|
||||
TEST_SRC ${TEST_SRC}
|
||||
TEST_TARGET_NAME build_tests)
|
||||
endif()
|
||||
@@ -0,0 +1,240 @@
|
||||
/*
|
||||
Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
|
||||
#define ASSERT_EQUAL(lhs, rhs) HIPASSERT(lhs == rhs)
|
||||
#define ASSERT_LE(lhs, rhs) HIPASSERT(lhs <= rhs)
|
||||
#define ASSERT_GE(lhs, rhs) HIPASSERT(lhs >= rhs)
|
||||
|
||||
using namespace cooperative_groups;
|
||||
constexpr int MaxGPUs = 8;
|
||||
|
||||
static __global__
|
||||
void kernel_cg_multi_grid_group_type(int* numGridsTestD,
|
||||
int* gridRankTestD,
|
||||
int *sizeTestD,
|
||||
int *thdRankTestD,
|
||||
int *isValidTestD,
|
||||
int *syncTestD,
|
||||
int *syncResultD)
|
||||
{
|
||||
multi_grid_group mg = this_multi_grid();
|
||||
int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;
|
||||
|
||||
// Test num_grids
|
||||
numGridsTestD[gIdx] = mg.num_grids();
|
||||
|
||||
// Test grid_rank
|
||||
gridRankTestD[gIdx] = mg.grid_rank();
|
||||
|
||||
// Test size
|
||||
sizeTestD[gIdx] = mg.size();
|
||||
|
||||
// Test thread_rank
|
||||
thdRankTestD[gIdx] = mg.thread_rank();
|
||||
|
||||
// Test is_valid
|
||||
isValidTestD[gIdx] = mg.is_valid();
|
||||
|
||||
// Test sync
|
||||
//
|
||||
// Eech thread assign 1 to their respective location
|
||||
syncTestD[gIdx] = 1;
|
||||
// Grid level sync
|
||||
this_grid().sync();
|
||||
// Thread 0 from work-group 0 of current grid (gpu) does grid level reduction
|
||||
if (blockIdx.x == 0 && threadIdx.x == 0) {
|
||||
for (uint i = 1; i < gridDim.x * blockDim.x; ++i) {
|
||||
syncTestD[0] += syncTestD[i];
|
||||
}
|
||||
syncResultD[mg.grid_rank() + 1] = syncTestD[0];
|
||||
}
|
||||
// multi-grid level sync
|
||||
mg.sync();
|
||||
// grid (gpu) 0 does final reduction across all grids (gpus)
|
||||
if (mg.grid_rank() == 0 && blockIdx.x == 0 && threadIdx.x == 0) {
|
||||
syncResultD[0] = 0;
|
||||
for (uint i = 1; i <= mg.num_grids(); ++i) {
|
||||
syncResultD[0] += syncResultD[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void test_cg_multi_grid_group_type(int blockSize, int nGpu)
|
||||
{
|
||||
// Create a stream each device
|
||||
hipStream_t stream[MaxGPUs];
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
HIPCHECK(hipDeviceSynchronize()); // Make sure work is done on this device
|
||||
HIPCHECK(hipStreamCreate(&stream[i]));
|
||||
}
|
||||
|
||||
// Allocate host and device memory
|
||||
int nBytes = sizeof(int) * 2 * blockSize;
|
||||
int *numGridsTestD[MaxGPUs], *numGridsTestH[MaxGPUs];
|
||||
int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs];
|
||||
int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs];
|
||||
int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs];
|
||||
int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs];
|
||||
int *syncTestD[MaxGPUs], *syncResultD;
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
|
||||
HIPCHECK(hipMalloc(&numGridsTestD[i], nBytes));
|
||||
HIPCHECK(hipMalloc(&gridRankTestD[i], nBytes));
|
||||
HIPCHECK(hipMalloc(&sizeTestD[i], nBytes));
|
||||
HIPCHECK(hipMalloc(&thdRankTestD[i], nBytes));
|
||||
HIPCHECK(hipMalloc(&isValidTestD[i], nBytes));
|
||||
HIPCHECK(hipMalloc(&syncTestD[i], nBytes));
|
||||
|
||||
HIPCHECK(hipHostMalloc(&numGridsTestH[i], nBytes));
|
||||
HIPCHECK(hipHostMalloc(&gridRankTestH[i], nBytes));
|
||||
HIPCHECK(hipHostMalloc(&sizeTestH[i], nBytes));
|
||||
HIPCHECK(hipHostMalloc(&thdRankTestH[i], nBytes));
|
||||
HIPCHECK(hipHostMalloc(&isValidTestH[i], nBytes));
|
||||
|
||||
if (i == 0) {
|
||||
HIPCHECK(hipHostMalloc(&syncResultD, sizeof(int) * (nGpu + 1), hipHostMallocCoherent));
|
||||
}
|
||||
}
|
||||
|
||||
// Launch Kernel
|
||||
constexpr int NumKernelArgs = 7;
|
||||
hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu];
|
||||
void* args[MaxGPUs * NumKernelArgs];
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
|
||||
args[i * NumKernelArgs] = &numGridsTestD[i];
|
||||
args[i * NumKernelArgs + 1] = &gridRankTestD[i];
|
||||
args[i * NumKernelArgs + 2] = &sizeTestD[i];
|
||||
args[i * NumKernelArgs + 3] = &thdRankTestD[i];
|
||||
args[i * NumKernelArgs + 4] = &isValidTestD[i];
|
||||
args[i * NumKernelArgs + 5] = &syncTestD[i];
|
||||
args[i * NumKernelArgs + 6] = &syncResultD;
|
||||
|
||||
launchParamsList[i].func = reinterpret_cast<void*>(kernel_cg_multi_grid_group_type);
|
||||
launchParamsList[i].gridDim = 2;
|
||||
launchParamsList[i].blockDim = blockSize;
|
||||
launchParamsList[i].sharedMem = 0;
|
||||
launchParamsList[i].stream = stream[i];
|
||||
launchParamsList[i].args = &args[i * NumKernelArgs];
|
||||
}
|
||||
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(launchParamsList, nGpu, 0));
|
||||
|
||||
// Copy result from device to host
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
HIPCHECK(hipMemcpy(numGridsTestH[i], numGridsTestD[i], nBytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost));
|
||||
}
|
||||
|
||||
// Validate results
|
||||
int gridsSeen[MaxGPUs];
|
||||
for (int i = 0; i < nGpu; ++i) {
|
||||
for (int j = 0; j < 2 * blockSize; ++j) {
|
||||
ASSERT_EQUAL(numGridsTestH[i][j], nGpu);
|
||||
ASSERT_GE(gridRankTestH[i][j], 0);
|
||||
ASSERT_LE(gridRankTestH[i][j], nGpu-1);
|
||||
ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]);
|
||||
ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize);
|
||||
int gridRank = gridRankTestH[i][j];
|
||||
ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j);
|
||||
ASSERT_EQUAL(isValidTestH[i][j], 1);
|
||||
}
|
||||
ASSERT_EQUAL(syncResultD[i+1], 2 * blockSize);
|
||||
|
||||
// Validate uniqueness property of grid rank
|
||||
gridsSeen[i] = gridRankTestH[i][0];
|
||||
for (int k = 0; k < i; ++k) {
|
||||
if (gridsSeen[k] == gridsSeen[i]) {
|
||||
assert(false && "Grid rank in multi-gpu setup should be unique");
|
||||
}
|
||||
}
|
||||
}
|
||||
ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize);
|
||||
|
||||
// Free host and device memory
|
||||
delete [] launchParamsList;
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
|
||||
HIPCHECK(hipFree(numGridsTestD[i]));
|
||||
HIPCHECK(hipFree(gridRankTestD[i]));
|
||||
HIPCHECK(hipFree(sizeTestD[i]));
|
||||
HIPCHECK(hipFree(thdRankTestD[i]));
|
||||
HIPCHECK(hipFree(isValidTestD[i]));
|
||||
HIPCHECK(hipFree(syncTestD[i]));
|
||||
|
||||
if (i == 0) {
|
||||
HIPCHECK(hipHostFree(syncResultD));
|
||||
}
|
||||
HIPCHECK(hipHostFree(numGridsTestH[i]));
|
||||
HIPCHECK(hipHostFree(gridRankTestH[i]));
|
||||
HIPCHECK(hipHostFree(sizeTestH[i]));
|
||||
HIPCHECK(hipHostFree(thdRankTestH[i]));
|
||||
HIPCHECK(hipHostFree(isValidTestH[i]));
|
||||
}
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipCGMultiGridGroupType") {
|
||||
int nGpu = 0;
|
||||
HIPCHECK(hipGetDeviceCount(&nGpu));
|
||||
nGpu = min(nGpu, MaxGPUs);
|
||||
|
||||
// Set `maxThreadsPerBlock` by taking minimum among all available devices
|
||||
int maxThreadsPerBlock = INT_MAX;
|
||||
hipDeviceProp_t deviceProperties;
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
HIPCHECK(hipGetDeviceProperties(&deviceProperties, i));
|
||||
if (!deviceProperties.cooperativeMultiDeviceLaunch) {
|
||||
HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
|
||||
return;
|
||||
}
|
||||
maxThreadsPerBlock = min(maxThreadsPerBlock, deviceProperties.maxThreadsPerBlock);
|
||||
}
|
||||
|
||||
// Test for blockSizes in powers of 2
|
||||
for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) {
|
||||
test_cg_multi_grid_group_type(blockSize, nGpu);
|
||||
}
|
||||
|
||||
// Test for random blockSizes, but the sequence is the same every execution
|
||||
srand(0);
|
||||
for (int i = 0; i < 10; i++) {
|
||||
// Test fails for 0 thread per block
|
||||
test_cg_multi_grid_group_type(max(2, rand() % maxThreadsPerBlock), nGpu);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,234 @@
|
||||
/*
|
||||
Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -D_CG_ABI_EXPERIMENTAL -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <climits>
|
||||
|
||||
#define ASSERT_EQUAL(lhs, rhs) HIPASSERT(lhs == rhs)
|
||||
#define ASSERT_LE(lhs, rhs) HIPASSERT(lhs <= rhs)
|
||||
#define ASSERT_GE(lhs, rhs) HIPASSERT(lhs >= rhs)
|
||||
|
||||
using namespace cooperative_groups;
|
||||
constexpr int MaxGPUs = 8;
|
||||
|
||||
static __global__
|
||||
void kernel_cg_multi_grid_group_type_via_base_type(int *sizeTestD,
|
||||
int* gridRankTestD,
|
||||
int *thdRankTestD,
|
||||
int *isValidTestD,
|
||||
int *syncTestD,
|
||||
int *syncResultD)
|
||||
{
|
||||
thread_group tg = this_multi_grid(); // This can work if _CG_ABI_EXPERIMENTAL defined on Cuda
|
||||
|
||||
int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;
|
||||
|
||||
// Test size
|
||||
sizeTestD[gIdx] = tg.size();
|
||||
|
||||
// Test thread_rank
|
||||
gridRankTestD[gIdx] = this_multi_grid().grid_rank();
|
||||
thdRankTestD[gIdx] = tg.thread_rank();
|
||||
|
||||
// Test is_valid
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
isValidTestD[gIdx] = tg.is_valid();
|
||||
#else
|
||||
// Cuda has no thread_group.is_valid()
|
||||
isValidTestD[gIdx] = true;
|
||||
#endif
|
||||
// Test sync
|
||||
//
|
||||
// Eech thread assign 1 to their respective location
|
||||
syncTestD[gIdx] = 1;
|
||||
// Grid level sync
|
||||
this_grid().sync();
|
||||
// Thread 0 from work-group 0 of current grid (gpu) does grid level reduction
|
||||
if (blockIdx.x == 0 && threadIdx.x == 0) {
|
||||
for (uint i = 1; i < gridDim.x * blockDim.x; ++i) {
|
||||
syncTestD[0] += syncTestD[i];
|
||||
}
|
||||
syncResultD[this_multi_grid().grid_rank() + 1] = syncTestD[0];
|
||||
}
|
||||
// multi-grid level sync
|
||||
tg.sync();
|
||||
// grid (gpu) 0 does final reduction across all grids (gpus)
|
||||
if (this_multi_grid().grid_rank() == 0 && blockIdx.x == 0 && threadIdx.x == 0) {
|
||||
syncResultD[0] = 0;
|
||||
for (uint i = 1; i <= this_multi_grid().num_grids(); ++i) {
|
||||
syncResultD[0] += syncResultD[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void test_cg_multi_grid_group_type_via_base_type(int blockSize, int nGpu)
|
||||
{
|
||||
// Create a stream each device
|
||||
hipStream_t stream[MaxGPUs];
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
HIPCHECK(hipDeviceSynchronize()); // Make sure work is done on this device
|
||||
HIPCHECK(hipStreamCreate(&stream[i]));
|
||||
}
|
||||
|
||||
// Allocate host and device memory
|
||||
int nBytes = sizeof(int) * 2 * blockSize;
|
||||
int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs];
|
||||
int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs];
|
||||
int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs];
|
||||
int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs];
|
||||
int *syncTestD[MaxGPUs], *syncResultD;
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
|
||||
HIPCHECK(hipMalloc(&sizeTestD[i], nBytes));
|
||||
HIPCHECK(hipMalloc(&gridRankTestD[i], nBytes));
|
||||
HIPCHECK(hipMalloc(&thdRankTestD[i], nBytes));
|
||||
HIPCHECK(hipMalloc(&isValidTestD[i], nBytes));
|
||||
HIPCHECK(hipMalloc(&syncTestD[i], nBytes));
|
||||
|
||||
HIPCHECK(hipHostMalloc(&sizeTestH[i], nBytes));
|
||||
HIPCHECK(hipHostMalloc(&gridRankTestH[i], nBytes));
|
||||
HIPCHECK(hipHostMalloc(&thdRankTestH[i], nBytes));
|
||||
HIPCHECK(hipHostMalloc(&isValidTestH[i], nBytes));
|
||||
|
||||
if (i == 0) {
|
||||
HIPCHECK(hipHostMalloc(&syncResultD, sizeof(int) * (nGpu + 1), hipHostMallocCoherent));
|
||||
}
|
||||
}
|
||||
|
||||
// Launch Kernel
|
||||
constexpr int NumKernelArgs = 6;
|
||||
hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu];
|
||||
void* args[MaxGPUs * NumKernelArgs];
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
|
||||
args[i * NumKernelArgs ] = &sizeTestD[i];
|
||||
args[i * NumKernelArgs + 1] = &gridRankTestD[i];
|
||||
args[i * NumKernelArgs + 2] = &thdRankTestD[i];
|
||||
args[i * NumKernelArgs + 3] = &isValidTestD[i];
|
||||
args[i * NumKernelArgs + 4] = &syncTestD[i];
|
||||
args[i * NumKernelArgs + 5] = &syncResultD;
|
||||
|
||||
launchParamsList[i].func = reinterpret_cast<void*>(kernel_cg_multi_grid_group_type_via_base_type);
|
||||
launchParamsList[i].gridDim = 2;
|
||||
launchParamsList[i].blockDim = blockSize;
|
||||
launchParamsList[i].sharedMem = 0;
|
||||
launchParamsList[i].stream = stream[i];
|
||||
launchParamsList[i].args = &args[i * NumKernelArgs];
|
||||
}
|
||||
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(launchParamsList, nGpu, 0));
|
||||
|
||||
// Copy result from device to host
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
HIPCHECK(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost));
|
||||
}
|
||||
|
||||
// Validate results
|
||||
int gridsSeen[MaxGPUs];
|
||||
for (int i = 0; i < nGpu; ++i) {
|
||||
for (int j = 0; j < 2 * blockSize; ++j) {
|
||||
ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize);
|
||||
ASSERT_GE(gridRankTestH[i][j], 0);
|
||||
ASSERT_LE(gridRankTestH[i][j], nGpu-1);
|
||||
ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]);
|
||||
int gridRank = gridRankTestH[i][j];
|
||||
ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j);
|
||||
ASSERT_EQUAL(isValidTestH[i][j], 1);
|
||||
}
|
||||
ASSERT_EQUAL(syncResultD[i+1], 2 * blockSize);
|
||||
|
||||
// Validate uniqueness property of grid rank
|
||||
gridsSeen[i] = gridRankTestH[i][0];
|
||||
for (int k = 0; k < i; ++k) {
|
||||
if (gridsSeen[k] == gridsSeen[i]) {
|
||||
assert (false && "Grid rank in multi-gpu setup should be unique");
|
||||
}
|
||||
}
|
||||
}
|
||||
ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize);
|
||||
|
||||
// Free host and device memory
|
||||
delete [] launchParamsList;
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
|
||||
HIPCHECK(hipFree(sizeTestD[i]));
|
||||
HIPCHECK(hipFree(gridRankTestD[i]));
|
||||
HIPCHECK(hipFree(thdRankTestD[i]));
|
||||
HIPCHECK(hipFree(isValidTestD[i]));
|
||||
HIPCHECK(hipFree(syncTestD[i]));
|
||||
|
||||
if (i == 0)
|
||||
HIPCHECK(hipHostFree(syncResultD));
|
||||
|
||||
HIPCHECK(hipHostFree(sizeTestH[i]));
|
||||
HIPCHECK(hipHostFree(gridRankTestH[i]));
|
||||
HIPCHECK(hipHostFree(thdRankTestH[i]));
|
||||
HIPCHECK(hipHostFree(isValidTestH[i]));
|
||||
}
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipCGMultiGridGroupType_BaseType") {
|
||||
// Set `maxThreadsPerBlock` by taking minimum among all available devices
|
||||
int nGpu = 0;
|
||||
HIPCHECK(hipGetDeviceCount(&nGpu));
|
||||
nGpu = min(nGpu, MaxGPUs);
|
||||
|
||||
int maxThreadsPerBlock = INT_MAX;
|
||||
hipDeviceProp_t deviceProperties;
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
HIPCHECK(hipGetDeviceProperties(&deviceProperties, i));
|
||||
if (!deviceProperties.cooperativeMultiDeviceLaunch) {
|
||||
HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
|
||||
return;
|
||||
}
|
||||
maxThreadsPerBlock = min(maxThreadsPerBlock, deviceProperties.maxThreadsPerBlock);
|
||||
}
|
||||
|
||||
// Test for blockSizes in powers of 2
|
||||
for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) {
|
||||
test_cg_multi_grid_group_type_via_base_type(blockSize, nGpu);
|
||||
}
|
||||
|
||||
// Test for random blockSizes, but the sequence is the same every execution
|
||||
srand(0);
|
||||
for (int i = 0; i < 10; i++) {
|
||||
// Test fails for 0 thread per block
|
||||
test_cg_multi_grid_group_type_via_base_type(max(2, rand() % maxThreadsPerBlock), nGpu);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,230 @@
|
||||
/*
|
||||
Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <climits>
|
||||
|
||||
#define ASSERT_EQUAL(lhs, rhs) HIPASSERT(lhs == rhs)
|
||||
#define ASSERT_LE(lhs, rhs) HIPASSERT(lhs <= rhs)
|
||||
#define ASSERT_GE(lhs, rhs) HIPASSERT(lhs >= rhs)
|
||||
|
||||
using namespace cooperative_groups;
|
||||
constexpr int MaxGPUs = 8;
|
||||
|
||||
static __global__
|
||||
void kernel_cg_multi_grid_group_type_via_public_api(int *sizeTestD,
|
||||
int* gridRankTestD,
|
||||
int *thdRankTestD,
|
||||
int *isValidTestD,
|
||||
int *syncTestD,
|
||||
int *syncResultD)
|
||||
{
|
||||
multi_grid_group mg = this_multi_grid();
|
||||
int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;
|
||||
|
||||
// Test group_size api
|
||||
sizeTestD[gIdx] = group_size(mg);
|
||||
|
||||
// Test thread_rank api
|
||||
gridRankTestD[gIdx] = this_multi_grid().grid_rank();
|
||||
thdRankTestD[gIdx] = thread_rank(mg);
|
||||
|
||||
// Test is_valid api
|
||||
isValidTestD[gIdx] = mg.is_valid();
|
||||
|
||||
// Test sync api
|
||||
//
|
||||
// Eech thread assign 1 to their respective location
|
||||
syncTestD[gIdx] = 1;
|
||||
// Grid level sync
|
||||
sync(this_grid());
|
||||
// Thread 0 from work-group 0 of current grid (gpu) does grid level reduction
|
||||
if (blockIdx.x == 0 && threadIdx.x == 0) {
|
||||
for (uint i = 1; i < gridDim.x * blockDim.x; ++i) {
|
||||
syncTestD[0] += syncTestD[i];
|
||||
}
|
||||
syncResultD[this_multi_grid().grid_rank() + 1] = syncTestD[0];
|
||||
}
|
||||
// multi-grid level sync via public api
|
||||
sync(mg);
|
||||
// grid (gpu) 0 does final reduction across all grids (gpus)
|
||||
if (this_multi_grid().grid_rank() == 0 && blockIdx.x == 0 && threadIdx.x == 0) {
|
||||
syncResultD[0] = 0;
|
||||
for (uint i = 1; i <= this_multi_grid().num_grids(); ++i) {
|
||||
syncResultD[0] += syncResultD[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void test_cg_multi_grid_group_type_via_public_api(int blockSize, int nGpu)
|
||||
{
|
||||
// Create a stream each device
|
||||
hipStream_t stream[MaxGPUs];
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
HIPCHECK(hipDeviceSynchronize()); // Make sure work is done on this device
|
||||
HIPCHECK(hipStreamCreate(&stream[i]));
|
||||
}
|
||||
|
||||
// Allocate host and device memory
|
||||
int nBytes = sizeof(int) * 2 * blockSize;
|
||||
int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs];
|
||||
int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs];
|
||||
int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs];
|
||||
int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs];
|
||||
int *syncTestD[MaxGPUs], *syncResultD;
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
|
||||
HIPCHECK(hipMalloc(&sizeTestD[i], nBytes));
|
||||
HIPCHECK(hipMalloc(&gridRankTestD[i], nBytes));
|
||||
HIPCHECK(hipMalloc(&thdRankTestD[i], nBytes));
|
||||
HIPCHECK(hipMalloc(&isValidTestD[i], nBytes));
|
||||
HIPCHECK(hipMalloc(&syncTestD[i], nBytes));
|
||||
|
||||
HIPCHECK(hipHostMalloc(&sizeTestH[i], nBytes));
|
||||
HIPCHECK(hipHostMalloc(&gridRankTestH[i], nBytes));
|
||||
HIPCHECK(hipHostMalloc(&thdRankTestH[i], nBytes));
|
||||
HIPCHECK(hipHostMalloc(&isValidTestH[i], nBytes));
|
||||
|
||||
if (i == 0) {
|
||||
HIPCHECK(hipHostMalloc(&syncResultD, sizeof(int) * (nGpu + 1), hipHostMallocCoherent));
|
||||
}
|
||||
}
|
||||
|
||||
// Launch Kernel
|
||||
constexpr int NumKernelArgs = 6;
|
||||
hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu];
|
||||
void* args[MaxGPUs * NumKernelArgs];
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
|
||||
args[i * NumKernelArgs ] = &sizeTestD[i];
|
||||
args[i * NumKernelArgs + 1] = &gridRankTestD[i];
|
||||
args[i * NumKernelArgs + 2] = &thdRankTestD[i];
|
||||
args[i * NumKernelArgs + 3] = &isValidTestD[i];
|
||||
args[i * NumKernelArgs + 4] = &syncTestD[i];
|
||||
args[i * NumKernelArgs + 5] = &syncResultD;
|
||||
|
||||
launchParamsList[i].func = reinterpret_cast<void*>(kernel_cg_multi_grid_group_type_via_public_api);
|
||||
launchParamsList[i].gridDim = 2;
|
||||
launchParamsList[i].blockDim = blockSize;
|
||||
launchParamsList[i].sharedMem = 0;
|
||||
launchParamsList[i].stream = stream[i];
|
||||
launchParamsList[i].args = &args[i * NumKernelArgs];
|
||||
}
|
||||
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(launchParamsList, nGpu, 0));
|
||||
|
||||
// Copy result from device to host
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
|
||||
HIPCHECK(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost));
|
||||
}
|
||||
|
||||
// Validate results
|
||||
int gridsSeen[MaxGPUs];
|
||||
for (int i = 0; i < nGpu; ++i) {
|
||||
for (int j = 0; j < 2 * blockSize; ++j) {
|
||||
ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize);
|
||||
ASSERT_GE(gridRankTestH[i][j], 0);
|
||||
ASSERT_LE(gridRankTestH[i][j], nGpu-1);
|
||||
ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]);
|
||||
int gridRank = gridRankTestH[i][j];
|
||||
ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j);
|
||||
ASSERT_EQUAL(isValidTestH[i][j], 1);
|
||||
}
|
||||
ASSERT_EQUAL(syncResultD[i+1], 2 * blockSize);
|
||||
|
||||
// Validate uniqueness property of grid rank
|
||||
gridsSeen[i] = gridRankTestH[i][0];
|
||||
for (int k = 0; k < i; ++k) {
|
||||
if (gridsSeen[k] == gridsSeen[i]) {
|
||||
assert (false && "Grid rank in multi-gpu setup should be unique");
|
||||
}
|
||||
}
|
||||
}
|
||||
ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize);
|
||||
|
||||
// Free host and device memory
|
||||
delete [] launchParamsList;
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
|
||||
HIPCHECK(hipFree(sizeTestD[i]));
|
||||
HIPCHECK(hipFree(gridRankTestD[i]));
|
||||
HIPCHECK(hipFree(thdRankTestD[i]));
|
||||
HIPCHECK(hipFree(isValidTestD[i]));
|
||||
HIPCHECK(hipFree(syncTestD[i]));
|
||||
|
||||
if (i == 0)
|
||||
HIPCHECK(hipHostFree(syncResultD));
|
||||
|
||||
HIPCHECK(hipHostFree(sizeTestH[i]));
|
||||
HIPCHECK(hipHostFree(gridRankTestH[i]));
|
||||
HIPCHECK(hipHostFree(thdRankTestH[i]));
|
||||
HIPCHECK(hipHostFree(isValidTestH[i]));
|
||||
}
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipCGMultiGridGroupType_PublicApi") {
|
||||
// Set `maxThreadsPerBlock` by taking minimum among all available devices
|
||||
int nGpu = 0;
|
||||
HIPCHECK(hipGetDeviceCount(&nGpu));
|
||||
nGpu = min(nGpu, MaxGPUs);
|
||||
|
||||
int maxThreadsPerBlock = INT_MAX;
|
||||
hipDeviceProp_t deviceProperties;
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
HIPCHECK(hipGetDeviceProperties(&deviceProperties, i));
|
||||
if (!deviceProperties.cooperativeMultiDeviceLaunch) {
|
||||
HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
|
||||
return;
|
||||
}
|
||||
maxThreadsPerBlock = min(maxThreadsPerBlock, deviceProperties.maxThreadsPerBlock);
|
||||
}
|
||||
|
||||
// Test for blockSizes in powers of 2
|
||||
for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) {
|
||||
test_cg_multi_grid_group_type_via_public_api(blockSize, nGpu);
|
||||
}
|
||||
|
||||
// Test for random blockSizes, but the sequence is the same every execution
|
||||
srand(0);
|
||||
for (int i = 0; i < 10; i++) {
|
||||
// Test fails for 0 thread per block
|
||||
test_cg_multi_grid_group_type_via_public_api(max(2, rand() % maxThreadsPerBlock), nGpu);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,164 @@
|
||||
/*
|
||||
Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
#include <cstdlib>
|
||||
|
||||
#define ASSERT_EQUAL(lhs, rhs) HIPASSERT(lhs == rhs)
|
||||
|
||||
using namespace cooperative_groups;
|
||||
|
||||
static __global__
|
||||
void kernel_cg_thread_block_type(int *sizeTestD,
|
||||
int *thdRankTestD,
|
||||
int *syncTestD,
|
||||
dim3 *groupIndexTestD,
|
||||
dim3 *thdIndexTestD)
|
||||
{
|
||||
thread_block tb = this_thread_block();
|
||||
int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;
|
||||
// Test size
|
||||
sizeTestD[gIdx] = tb.size();
|
||||
|
||||
// Test thread_rank
|
||||
thdRankTestD[gIdx] = tb.thread_rank();
|
||||
|
||||
// Test sync
|
||||
__shared__ int sm[2];
|
||||
if (threadIdx.x == 0)
|
||||
sm[0] = 10;
|
||||
else if (threadIdx.x == 1)
|
||||
sm[1] = 20;
|
||||
tb.sync();
|
||||
syncTestD[gIdx] = sm[1] * sm[0];
|
||||
|
||||
// Test group_index
|
||||
groupIndexTestD[gIdx] = tb.group_index();
|
||||
|
||||
// Test thread_index
|
||||
thdIndexTestD[gIdx] = tb.thread_index();
|
||||
}
|
||||
|
||||
static void test_cg_thread_block_type(int blockSize)
|
||||
{
|
||||
int nBytes = sizeof(int) * 2 * blockSize;
|
||||
int nDim3Bytes = sizeof(dim3) * 2 * blockSize;
|
||||
int *sizeTestD, *sizeTestH;
|
||||
int *thdRankTestD, *thdRankTestH;
|
||||
int *syncTestD, *syncTestH;
|
||||
dim3 *groupIndexTestD, *groupIndexTestH;
|
||||
dim3 *thdIndexTestD, *thdIndexTestH;
|
||||
|
||||
// Allocate device memory
|
||||
HIPCHECK(hipMalloc(&sizeTestD, nBytes));
|
||||
HIPCHECK(hipMalloc(&thdRankTestD, nBytes));
|
||||
HIPCHECK(hipMalloc(&syncTestD, nBytes));
|
||||
HIPCHECK(hipMalloc(&groupIndexTestD, nDim3Bytes));
|
||||
HIPCHECK(hipMalloc(&thdIndexTestD, nDim3Bytes));
|
||||
|
||||
// Allocate host memory
|
||||
HIPCHECK(hipHostMalloc(&sizeTestH, nBytes));
|
||||
HIPCHECK(hipHostMalloc(&thdRankTestH, nBytes));
|
||||
HIPCHECK(hipHostMalloc(&syncTestH, nBytes));
|
||||
HIPCHECK(hipHostMalloc(&groupIndexTestH, nDim3Bytes));
|
||||
HIPCHECK(hipHostMalloc(&thdIndexTestH, nDim3Bytes));
|
||||
|
||||
// Launch Kernel
|
||||
hipLaunchKernelGGL(kernel_cg_thread_block_type,
|
||||
2,
|
||||
blockSize,
|
||||
0,
|
||||
0,
|
||||
sizeTestD,
|
||||
thdRankTestD,
|
||||
syncTestD,
|
||||
groupIndexTestD,
|
||||
thdIndexTestD);
|
||||
|
||||
// Copy result from device to host
|
||||
HIPCHECK(hipMemcpy(sizeTestH, sizeTestD, nBytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(thdRankTestH, thdRankTestD, nBytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(syncTestH, syncTestD, nBytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(groupIndexTestH, groupIndexTestD, nDim3Bytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(thdIndexTestH, thdIndexTestD, nDim3Bytes, hipMemcpyDeviceToHost));
|
||||
|
||||
// Validate results for both blocks together
|
||||
for (int i = 0; i < 2 * blockSize; ++i) {
|
||||
ASSERT_EQUAL(sizeTestH[i], blockSize);
|
||||
ASSERT_EQUAL(thdRankTestH[i], i % blockSize);
|
||||
ASSERT_EQUAL(syncTestH[i], 200);
|
||||
ASSERT_EQUAL(groupIndexTestH[i].x, (uint) i / blockSize);
|
||||
ASSERT_EQUAL(groupIndexTestH[i].y, 0);
|
||||
ASSERT_EQUAL(groupIndexTestH[i].z, 0);
|
||||
ASSERT_EQUAL(thdIndexTestH[i].x, (uint) i % blockSize);
|
||||
ASSERT_EQUAL(thdIndexTestH[i].y, 0);
|
||||
ASSERT_EQUAL(thdIndexTestH[i].z, 0);
|
||||
}
|
||||
|
||||
// Free device memory
|
||||
HIPCHECK(hipFree(sizeTestD));
|
||||
HIPCHECK(hipFree(thdRankTestD));
|
||||
HIPCHECK(hipFree(syncTestD));
|
||||
HIPCHECK(hipFree(groupIndexTestD));
|
||||
HIPCHECK(hipFree(thdIndexTestD));
|
||||
|
||||
//Free host memory
|
||||
HIPCHECK(hipHostFree(sizeTestH));
|
||||
HIPCHECK(hipHostFree(thdRankTestH));
|
||||
HIPCHECK(hipHostFree(syncTestH));
|
||||
HIPCHECK(hipHostFree(groupIndexTestH));
|
||||
HIPCHECK(hipHostFree(thdIndexTestH));
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipCGThreadBlockType") {
|
||||
// Use default device for validating the test
|
||||
int deviceId;
|
||||
hipDeviceProp_t deviceProperties;
|
||||
HIPCHECK(hipGetDevice(&deviceId));
|
||||
HIPCHECK(hipGetDeviceProperties(&deviceProperties, deviceId));
|
||||
|
||||
if (!deviceProperties.cooperativeLaunch) {
|
||||
HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
|
||||
return;
|
||||
}
|
||||
|
||||
// Test for blockSizes in powers of 2
|
||||
int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
|
||||
for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) {
|
||||
test_cg_thread_block_type(blockSize);
|
||||
}
|
||||
|
||||
// Test for random blockSizes, but the sequence is the same every execution
|
||||
srand(0);
|
||||
for (int i = 0; i < 10; i++) {
|
||||
// Test fails for only 1 thread per block
|
||||
test_cg_thread_block_type(max(2, rand() % maxThreadsPerBlock));
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,136 @@
|
||||
/*
|
||||
Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include "hip/hip_cooperative_groups.h"
|
||||
#include <cstdlib>
|
||||
|
||||
#define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs)
|
||||
|
||||
using namespace cooperative_groups;
|
||||
|
||||
static __global__
|
||||
void kernel_cg_thread_block_type_via_base_type(int *sizeTestD,
|
||||
int *thdRankTestD,
|
||||
int *syncTestD)
|
||||
{
|
||||
thread_group tg = this_thread_block();
|
||||
int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;
|
||||
|
||||
// Test size
|
||||
sizeTestD[gIdx] = tg.size();
|
||||
|
||||
// Test thread_rank
|
||||
thdRankTestD[gIdx] = tg.thread_rank();
|
||||
|
||||
// Test sync
|
||||
__shared__ int sm[2];
|
||||
if (threadIdx.x == 0)
|
||||
sm[0] = 10;
|
||||
else if (threadIdx.x == 1)
|
||||
sm[1] = 20;
|
||||
tg.sync();
|
||||
syncTestD[gIdx] = sm[1] * sm[0];
|
||||
}
|
||||
|
||||
static void test_cg_thread_block_type_via_base_type(int blockSize)
|
||||
{
|
||||
int nBytes = sizeof(int) * 2 * blockSize;
|
||||
int *sizeTestD, *sizeTestH;
|
||||
int *thdRankTestD, *thdRankTestH;
|
||||
int *syncTestD, *syncTestH;
|
||||
|
||||
// Allocate device memory
|
||||
HIPCHECK(hipMalloc(&sizeTestD, nBytes));
|
||||
HIPCHECK(hipMalloc(&thdRankTestD, nBytes));
|
||||
HIPCHECK(hipMalloc(&syncTestD, nBytes));
|
||||
|
||||
// Allocate host memory
|
||||
HIPCHECK(hipHostMalloc(&sizeTestH, nBytes));
|
||||
HIPCHECK(hipHostMalloc(&thdRankTestH, nBytes));
|
||||
HIPCHECK(hipHostMalloc(&syncTestH, nBytes));
|
||||
|
||||
// Launch Kernel
|
||||
hipLaunchKernelGGL(kernel_cg_thread_block_type_via_base_type,
|
||||
2,
|
||||
blockSize,
|
||||
0,
|
||||
0,
|
||||
sizeTestD,
|
||||
thdRankTestD,
|
||||
syncTestD);
|
||||
|
||||
// Copy result from device to host
|
||||
HIPCHECK(hipMemcpy(sizeTestH, sizeTestD, nBytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(thdRankTestH, thdRankTestD, nBytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(syncTestH, syncTestD, nBytes, hipMemcpyDeviceToHost));
|
||||
|
||||
// Validate results for both blocks together
|
||||
for (int i = 0; i < 2 * blockSize; ++i) {
|
||||
ASSERT_EQUAL(sizeTestH[i], blockSize);
|
||||
ASSERT_EQUAL(thdRankTestH[i], i % blockSize);
|
||||
ASSERT_EQUAL(syncTestH[i], 200);
|
||||
}
|
||||
|
||||
// Free device memory
|
||||
HIPCHECK(hipFree(sizeTestD));
|
||||
HIPCHECK(hipFree(thdRankTestD));
|
||||
HIPCHECK(hipFree(syncTestD));
|
||||
|
||||
//Free host memory
|
||||
HIPCHECK(hipHostFree(sizeTestH));
|
||||
HIPCHECK(hipHostFree(thdRankTestH));
|
||||
HIPCHECK(hipHostFree(syncTestH));
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipCGThreadBlockType_BaseType") {
|
||||
// Use default device for validating the test
|
||||
int deviceId;
|
||||
hipDeviceProp_t deviceProperties;
|
||||
HIPCHECK(hipGetDevice(&deviceId));
|
||||
HIPCHECK(hipGetDeviceProperties(&deviceProperties, deviceId));
|
||||
|
||||
if (!deviceProperties.cooperativeLaunch) {
|
||||
HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
|
||||
return;
|
||||
}
|
||||
|
||||
// Test for blockSizes in powers of 2
|
||||
int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
|
||||
for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) {
|
||||
test_cg_thread_block_type_via_base_type(blockSize);
|
||||
}
|
||||
|
||||
// Test for random blockSizes, but the sequence is the same every execution
|
||||
srand(0);
|
||||
for (int i = 0; i < 10; i++) {
|
||||
// Test fails for only 1 thread per block
|
||||
test_cg_thread_block_type_via_base_type(max(2, rand() % maxThreadsPerBlock));
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,136 @@
|
||||
/*
|
||||
Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include "hip/hip_cooperative_groups.h"
|
||||
#include <cstdlib>
|
||||
|
||||
#define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs)
|
||||
|
||||
using namespace cooperative_groups;
|
||||
|
||||
static __global__
|
||||
void kernel_cg_thread_block_type_via_public_api(int *sizeTestD,
|
||||
int *thdRankTestD,
|
||||
int *syncTestD)
|
||||
{
|
||||
thread_block tb = this_thread_block();
|
||||
int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;
|
||||
|
||||
// Test group_size api
|
||||
sizeTestD[gIdx] = group_size(tb);
|
||||
|
||||
// Test thread_rank api
|
||||
thdRankTestD[gIdx] = thread_rank(tb);
|
||||
|
||||
// Test sync api
|
||||
__shared__ int sm[2];
|
||||
if (threadIdx.x == 0)
|
||||
sm[0] = 10;
|
||||
else if (threadIdx.x == 1)
|
||||
sm[1] = 20;
|
||||
sync(tb);
|
||||
syncTestD[gIdx] = sm[1] * sm[0];
|
||||
}
|
||||
|
||||
static void test_cg_thread_block_type_via_public_api(int blockSize)
|
||||
{
|
||||
int nBytes = sizeof(int) * 2 * blockSize;
|
||||
int *sizeTestD, *sizeTestH;
|
||||
int *thdRankTestD, *thdRankTestH;
|
||||
int *syncTestD, *syncTestH;
|
||||
|
||||
// Allocate device memory
|
||||
HIPCHECK(hipMalloc(&sizeTestD, nBytes));
|
||||
HIPCHECK(hipMalloc(&thdRankTestD, nBytes));
|
||||
HIPCHECK(hipMalloc(&syncTestD, nBytes));
|
||||
|
||||
// Allocate host memory
|
||||
HIPCHECK(hipHostMalloc(&sizeTestH, nBytes));
|
||||
HIPCHECK(hipHostMalloc(&thdRankTestH, nBytes));
|
||||
HIPCHECK(hipHostMalloc(&syncTestH, nBytes));
|
||||
|
||||
// Launch Kernel
|
||||
hipLaunchKernelGGL(kernel_cg_thread_block_type_via_public_api,
|
||||
2,
|
||||
blockSize,
|
||||
0,
|
||||
0,
|
||||
sizeTestD,
|
||||
thdRankTestD,
|
||||
syncTestD);
|
||||
|
||||
// Copy result from device to host
|
||||
HIPCHECK(hipMemcpy(sizeTestH, sizeTestD, nBytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(thdRankTestH, thdRankTestD, nBytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(syncTestH, syncTestD, nBytes, hipMemcpyDeviceToHost));
|
||||
|
||||
// Validate results for both blocks together
|
||||
for (int i = 0; i < 2 * blockSize; ++i) {
|
||||
ASSERT_EQUAL(sizeTestH[i], blockSize);
|
||||
ASSERT_EQUAL(thdRankTestH[i], i % blockSize);
|
||||
ASSERT_EQUAL(syncTestH[i], 200);
|
||||
}
|
||||
|
||||
// Free device memory
|
||||
HIPCHECK(hipFree(sizeTestD));
|
||||
HIPCHECK(hipFree(thdRankTestD));
|
||||
HIPCHECK(hipFree(syncTestD));
|
||||
|
||||
//Free host memory
|
||||
HIPCHECK(hipHostFree(sizeTestH));
|
||||
HIPCHECK(hipHostFree(thdRankTestH));
|
||||
HIPCHECK(hipHostFree(syncTestH));
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipCGThreadBlockType_PublicApi") {
|
||||
// Use default device for validating the test
|
||||
int deviceId;
|
||||
hipDeviceProp_t deviceProperties;
|
||||
HIPCHECK(hipGetDevice(&deviceId));
|
||||
HIPCHECK(hipGetDeviceProperties(&deviceProperties, deviceId));
|
||||
|
||||
if (!deviceProperties.cooperativeLaunch) {
|
||||
HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
|
||||
return;
|
||||
}
|
||||
|
||||
// Test for blockSizes in powers of 2
|
||||
int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock;
|
||||
for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) {
|
||||
test_cg_thread_block_type_via_public_api(blockSize);
|
||||
}
|
||||
|
||||
// Test for random blockSizes, but the sequence is the same every execution
|
||||
srand(0);
|
||||
for (int i = 0; i < 10; i++) {
|
||||
// Test fails for only 1 thread per block
|
||||
test_cg_thread_block_type_via_public_api(max(2, rand() % maxThreadsPerBlock));
|
||||
}
|
||||
}
|
||||
新しいイシューから参照
ユーザーをブロックする