SWDEV-1 - Merge github PRs to amd-staging
- https://github.com/ROCm/hip-tests/pull/194
- https://github.com/ROCm/hip-tests/pull/36
- https://github.com/ROCm/hip-tests/pull/44
- https://github.com/ROCm/hip-tests/pull/47
- https://github.com/ROCm/hip-tests/pull/62
- https://github.com/ROCm/hip-tests/pull/63
- https://github.com/ROCm/hip-tests/pull/64
- https://github.com/ROCm/hip-tests/pull/65
- https://github.com/ROCm/hip-tests/pull/66
- https://github.com/ROCm/hip-tests/pull/67
- https://github.com/ROCm/hip-tests/pull/68
- https://github.com/ROCm/hip-tests/pull/69
- https://github.com/ROCm/hip-tests/pull/142
- https://github.com/ROCm/hip-tests/pull/196
- https://github.com/ROCm/hip-tests/pull/238
Change-Id: I74f7fef76d7d536b1cf89dad3e527c92d1cd21b5
[ROCm/hip-tests commit: 6429ef1b60]
이 커밋은 다음에 포함됨:
@@ -54,6 +54,8 @@
|
||||
"Unit_hipFuncSetAttribute_Positive_MaxDynamicSharedMemorySize_Not_Supported",
|
||||
"Unit_hipFuncSetAttribute_Positive_PreferredSharedMemoryCarveout_Not_Supported",
|
||||
"Unit_hipOccupancyMaxActiveBlocksPerMultiprocessor_Negative_Parameters",
|
||||
"Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_Negative_Parameters",
|
||||
"Unit_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_Negative_Parameters",
|
||||
"Unit_hipGraphMemcpyNodeSetParamsToSymbol_Positive_Basic",
|
||||
"Unit_hipGraphExecMemcpyNodeSetParamsToSymbol_Positive_Basic",
|
||||
"Unit_hipGraphExecMemcpyNodeSetParamsFromSymbol_Positive_Basic",
|
||||
@@ -183,6 +185,62 @@
|
||||
"Unit_hipMemUnmap_negative",
|
||||
"=== SWDEV-432556,SWDEV-434211:Below test randomly failing in stress test ===",
|
||||
"Unit_hipDeviceGetUuid_From_RocmInfo",
|
||||
"=== SWDEV-434171: Below tests took long time to complete in stress test on 17/11/23 ===",
|
||||
"Unit_Warp_Shfl_Positive_Basic - int",
|
||||
"Unit_Warp_Shfl_Positive_Basic - unsigned int",
|
||||
"Unit_Warp_Shfl_Positive_Basic - long",
|
||||
"Unit_Warp_Shfl_Positive_Basic - unsigned long",
|
||||
"Unit_Warp_Shfl_Positive_Basic - long long",
|
||||
"Unit_Warp_Shfl_Positive_Basic - unsigned long long",
|
||||
"Unit_Warp_Shfl_Positive_Basic - float",
|
||||
"Unit_Warp_Shfl_Positive_Basic - double",
|
||||
"Unit_Warp_Shfl_XOR_Positive_Basic - int",
|
||||
"Unit_Warp_Shfl_XOR_Positive_Basic - unsigned int",
|
||||
"Unit_Warp_Shfl_XOR_Positive_Basic - long",
|
||||
"Unit_Warp_Shfl_XOR_Positive_Basic - unsigned long",
|
||||
"Unit_Warp_Shfl_XOR_Positive_Basic - long long",
|
||||
"Unit_Warp_Shfl_XOR_Positive_Basic - unsigned long long",
|
||||
"Unit_Warp_Shfl_XOR_Positive_Basic - float",
|
||||
"Unit_Warp_Shfl_XOR_Positive_Basic - double",
|
||||
"=== SWDEV-434878: Below tests failed in stress test on 24/11/23 ===",
|
||||
"Unit_hipGraphUpload_Negative_Parameters",
|
||||
"Unit_hipModuleOccupancyMaxPotentialBlockSize_Negative_Parameters",
|
||||
"Unit_hipModuleOccupancyMaxPotentialBlockSize_Positive_RangeValidation",
|
||||
"Unit_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_Positive_RangeValidation",
|
||||
"Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_Negative_Parameters",
|
||||
"Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_Positive_RangeValidation",
|
||||
"Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_Positive_RangeValidation",
|
||||
"=== SWDEV-435667: Below tests failing randomly in stress test on 01/12/23 ===",
|
||||
"Unit_atomicExch_Positive_Same_Address_Compile_Time - int",
|
||||
"Unit_atomicExch_Positive_Same_Address_Compile_Time - unsigned int",
|
||||
"Unit_atomicExch_Positive_Same_Address_Compile_Time - unsigned long",
|
||||
"Unit_atomicExch_Positive_Same_Address_Compile_Time - unsigned long long",
|
||||
"Unit_atomicExch_Positive_Same_Address_Compile_Time - float",
|
||||
"Unit_atomicExch_Positive_Same_Address_Compile_Time - double",
|
||||
"Unit_atomicExch_Positive_Multi_Kernel - int",
|
||||
"Unit_atomicExch_Positive_Multi_Kernel - unsigned int",
|
||||
"Unit_atomicExch_Positive_Multi_Kernel - unsigned long",
|
||||
"Unit_atomicExch_Positive_Multi_Kernel - unsigned long long",
|
||||
"Unit_atomicExch_Positive_Multi_Kernel - float",
|
||||
"Unit_atomicExch_Positive_Multi_Kernel - double",
|
||||
"Unit_atomicExch_system_Positive_Peer_GPUs - int",
|
||||
"Unit_atomicExch_system_Positive_Peer_GPUs - unsigned int",
|
||||
"Unit_atomicExch_system_Positive_Peer_GPUs - unsigned long",
|
||||
"Unit_atomicExch_system_Positive_Peer_GPUs - unsigned long long",
|
||||
"Unit_atomicExch_system_Positive_Peer_GPUs - float",
|
||||
"Unit_atomicExch_system_Positive_Peer_GPUs - double",
|
||||
"Unit_atomicExch_system_Positive_Host_And_GPU - int",
|
||||
"Unit_atomicExch_system_Positive_Host_And_GPU - unsigned int",
|
||||
"Unit_atomicExch_system_Positive_Host_And_GPU - unsigned long",
|
||||
"Unit_atomicExch_system_Positive_Host_And_GPU - unsigned long long",
|
||||
"Unit_atomicExch_system_Positive_Host_And_GPU - float",
|
||||
"Unit_atomicExch_system_Positive_Host_And_GPU - double",
|
||||
"Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - int",
|
||||
"Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - unsigned int",
|
||||
"Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - unsigned long",
|
||||
"Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - unsigned long long",
|
||||
"Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - float",
|
||||
"Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - double",
|
||||
#endif
|
||||
#if defined VEGA20
|
||||
"=== SWDEV-419112 Below tests fail in stress test on 29/08/23 ===",
|
||||
|
||||
@@ -119,6 +119,8 @@
|
||||
"Unit_hipFuncSetAttribute_Positive_MaxDynamicSharedMemorySize_Not_Supported",
|
||||
"Unit_hipFuncSetAttribute_Positive_PreferredSharedMemoryCarveout_Not_Supported",
|
||||
"Unit_hipOccupancyMaxActiveBlocksPerMultiprocessor_Negative_Parameters",
|
||||
"Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_Negative_Parameters",
|
||||
"Unit_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_Negative_Parameters",
|
||||
"Unit_hipGraphMemcpyNodeSetParamsToSymbol_Positive_Basic",
|
||||
"Unit_hipGraphExecMemcpyNodeSetParamsToSymbol_Positive_Basic",
|
||||
"Unit_hipGraphMemcpyNodeSetParamsFromSymbol_Positive_Basic",
|
||||
@@ -282,6 +284,62 @@
|
||||
"Unit_hipMemSetAccess_MultiProc",
|
||||
"Unit_hipMemSetAccess_negative",
|
||||
"Unit_hipMemUnmap_negative",
|
||||
"=== SWDEV-434171: Below tests took long time to complete in stress test on 17/11/23 ===",
|
||||
"Unit_Warp_Shfl_Positive_Basic - int",
|
||||
"Unit_Warp_Shfl_Positive_Basic - unsigned int",
|
||||
"Unit_Warp_Shfl_Positive_Basic - long",
|
||||
"Unit_Warp_Shfl_Positive_Basic - unsigned long",
|
||||
"Unit_Warp_Shfl_Positive_Basic - long long",
|
||||
"Unit_Warp_Shfl_Positive_Basic - unsigned long long",
|
||||
"Unit_Warp_Shfl_Positive_Basic - float",
|
||||
"Unit_Warp_Shfl_Positive_Basic - double",
|
||||
"Unit_Warp_Shfl_XOR_Positive_Basic - int",
|
||||
"Unit_Warp_Shfl_XOR_Positive_Basic - unsigned int",
|
||||
"Unit_Warp_Shfl_XOR_Positive_Basic - long",
|
||||
"Unit_Warp_Shfl_XOR_Positive_Basic - unsigned long",
|
||||
"Unit_Warp_Shfl_XOR_Positive_Basic - long long",
|
||||
"Unit_Warp_Shfl_XOR_Positive_Basic - unsigned long long",
|
||||
"Unit_Warp_Shfl_XOR_Positive_Basic - float",
|
||||
"Unit_Warp_Shfl_XOR_Positive_Basic - double",
|
||||
"=== SWDEV-434878: Below tests failed in stress test on 24/11/23 ===",
|
||||
"Unit_hipGraphUpload_Negative_Parameters",
|
||||
"Unit_hipModuleOccupancyMaxPotentialBlockSize_Negative_Parameters",
|
||||
"Unit_hipModuleOccupancyMaxPotentialBlockSize_Positive_RangeValidation",
|
||||
"Unit_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_Positive_RangeValidation",
|
||||
"Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_Negative_Parameters",
|
||||
"Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_Positive_RangeValidation",
|
||||
"Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_Positive_RangeValidation",
|
||||
"=== SWDEV-435667: Below tests failing randomly in stress test on 01/12/23 ===",
|
||||
"Unit_atomicExch_Positive_Same_Address_Compile_Time - int",
|
||||
"Unit_atomicExch_Positive_Same_Address_Compile_Time - unsigned int",
|
||||
"Unit_atomicExch_Positive_Same_Address_Compile_Time - unsigned long",
|
||||
"Unit_atomicExch_Positive_Same_Address_Compile_Time - unsigned long long",
|
||||
"Unit_atomicExch_Positive_Same_Address_Compile_Time - float",
|
||||
"Unit_atomicExch_Positive_Same_Address_Compile_Time - double",
|
||||
"Unit_atomicExch_Positive_Multi_Kernel - int",
|
||||
"Unit_atomicExch_Positive_Multi_Kernel - unsigned int",
|
||||
"Unit_atomicExch_Positive_Multi_Kernel - unsigned long",
|
||||
"Unit_atomicExch_Positive_Multi_Kernel - unsigned long long",
|
||||
"Unit_atomicExch_Positive_Multi_Kernel - float",
|
||||
"Unit_atomicExch_Positive_Multi_Kernel - double",
|
||||
"Unit_atomicExch_system_Positive_Peer_GPUs - int",
|
||||
"Unit_atomicExch_system_Positive_Peer_GPUs - unsigned int",
|
||||
"Unit_atomicExch_system_Positive_Peer_GPUs - unsigned long",
|
||||
"Unit_atomicExch_system_Positive_Peer_GPUs - unsigned long long",
|
||||
"Unit_atomicExch_system_Positive_Peer_GPUs - float",
|
||||
"Unit_atomicExch_system_Positive_Peer_GPUs - double",
|
||||
"Unit_atomicExch_system_Positive_Host_And_GPU - int",
|
||||
"Unit_atomicExch_system_Positive_Host_And_GPU - unsigned int",
|
||||
"Unit_atomicExch_system_Positive_Host_And_GPU - unsigned long",
|
||||
"Unit_atomicExch_system_Positive_Host_And_GPU - unsigned long long",
|
||||
"Unit_atomicExch_system_Positive_Host_And_GPU - float",
|
||||
"Unit_atomicExch_system_Positive_Host_And_GPU - double",
|
||||
"Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - int",
|
||||
"Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - unsigned int",
|
||||
"Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - unsigned long",
|
||||
"Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - unsigned long long",
|
||||
"Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - float",
|
||||
"Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - double",
|
||||
#endif
|
||||
"End of json"
|
||||
]
|
||||
|
||||
@@ -44,6 +44,14 @@
|
||||
"Grid_Group_Getters_Via_Non_Member_Functions_Positive_Basic",
|
||||
"Grid_Group_Sync_Positive_Basic",
|
||||
"dynamic_loading_device_kernels_from_library",
|
||||
"Unit_tiled_partition"
|
||||
"Unit_tiled_partition",
|
||||
"Unit_atomicExch_Positive_Same_Address_Compile_Time - int",
|
||||
"Unit_atomicExch_Positive_Same_Address_Compile_Time - unsigned int",
|
||||
"Unit_atomicExch_Positive_Same_Address_Compile_Time - unsigned long long",
|
||||
"Unit_atomicExch_Positive_Same_Address_Compile_Time - float",
|
||||
"Unit_atomicExch_system_Positive_Host_And_GPU - int",
|
||||
"Unit_atomicExch_system_Positive_Host_And_GPU - unsigned int",
|
||||
"Unit_atomicExch_system_Positive_Host_And_GPU - unsigned long long",
|
||||
"Unit_atomicExch_system_Positive_Host_And_GPU - float"
|
||||
]
|
||||
}
|
||||
|
||||
@@ -30,9 +30,9 @@ int main(int argc, char** argv) {
|
||||
| Opt(cmd_options.progress)
|
||||
["-P"]["--progress"]
|
||||
("Show progress bar when running performance tests")
|
||||
| Opt(cmd_options.extended_run)
|
||||
["-E"]["--extended-run"]
|
||||
("TODO: Description goes here")
|
||||
| Opt(cmd_options.cg_extended_run, "cg_extened_run")
|
||||
["-E"]["--cg-extended-run"]
|
||||
("Number of iterations used for cooperative groups sync tests (default: 5)")
|
||||
;
|
||||
// clang-format on
|
||||
|
||||
|
||||
@@ -23,11 +23,11 @@ THE SOFTWARE.
|
||||
#pragma once
|
||||
|
||||
struct CmdOptions {
|
||||
int iterations = 1000;
|
||||
int iterations = 10;
|
||||
int warmups = 100;
|
||||
int cg_extended_run = 5;
|
||||
bool no_display = false;
|
||||
bool progress = false;
|
||||
bool extended_run = false;
|
||||
};
|
||||
|
||||
extern CmdOptions cmd_options;
|
||||
@@ -78,6 +78,7 @@ struct CPUGrid {
|
||||
unsigned int thread_count_;
|
||||
};
|
||||
|
||||
/* Generate dimensions for 1D, 2D and 3D blocks of threads */
|
||||
inline dim3 GenerateThreadDimensions() {
|
||||
hipDeviceProp_t props;
|
||||
HIP_CHECK(hipGetDeviceProperties(&props, 0));
|
||||
@@ -99,6 +100,7 @@ inline dim3 GenerateThreadDimensions() {
|
||||
dim3(props.warpSize + 1, 3, 3));
|
||||
}
|
||||
|
||||
/* Generate dimensions for 1D, 2D and 3D grids of blocks */
|
||||
inline dim3 GenerateBlockDimensions() {
|
||||
hipDeviceProp_t props;
|
||||
HIP_CHECK(hipGetDeviceProperties(&props, 0));
|
||||
@@ -116,6 +118,7 @@ inline dim3 GenerateBlockDimensions() {
|
||||
dim3(5, 5, 5));
|
||||
}
|
||||
|
||||
/* Generate dimensions for 1D, 2D and 3D blocks of threads - reduced set */
|
||||
inline dim3 GenerateThreadDimensionsForShuffle() {
|
||||
hipDeviceProp_t props;
|
||||
HIP_CHECK(hipGetDeviceProperties(&props, 0));
|
||||
@@ -136,6 +139,7 @@ inline dim3 GenerateThreadDimensionsForShuffle() {
|
||||
dim3(props.warpSize + 1, 3, 3));
|
||||
}
|
||||
|
||||
/* Generate dimensions for 1D, 2D and 3D grids of blocks - reduced set */
|
||||
inline dim3 GenerateBlockDimensionsForShuffle() {
|
||||
hipDeviceProp_t props;
|
||||
HIP_CHECK(hipGetDeviceProperties(&props, 0));
|
||||
|
||||
@@ -102,6 +102,19 @@ THE SOFTWARE.
|
||||
} \
|
||||
}
|
||||
|
||||
// Evaluates errorExpr once, stores the result in localError, and REQUIREs that it equals
// expectedError. Before the REQUIRE, an INFO message is streamed containing the expected and
// actual error strings (via hiprtcGetErrorString) and codes, the stringified expression, and the
// file/line of the macro use.
// Both macro parameters are parenthesized on expansion so that a compound argument cannot change
// the meaning of the initialization or of the comparison.
#define HIPRTC_CHECK_ERROR(errorExpr, expectedError)                                               \
  {                                                                                                \
    auto localError = (errorExpr);                                                                 \
    INFO("Matching Errors: "                                                                       \
         << "\n Expected Error: " << hiprtcGetErrorString((expectedError))                         \
         << "\n Expected Code: " << (expectedError) << '\n'                                        \
         << " Actual Error: " << hiprtcGetErrorString(localError)                                  \
         << "\n Actual Code: " << localError << "\nStr: " << #errorExpr                            \
         << "\n In File: " << __FILE__ << "\n At line: " << __LINE__);                             \
    REQUIRE(localError == (expectedError));                                                        \
  }
|
||||
|
||||
// Despite the "assert" in its name, this check is evaluated at runtime: the condition is simply
// forwarded to REQUIRE.
#define HIP_ASSERT(x) { REQUIRE((x)); }
|
||||
|
||||
@@ -20,6 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
|
||||
// Test groups are named after the group names in hip_api_runtime.h, with a "Test" suffix added
|
||||
|
||||
/**
|
||||
@@ -95,8 +97,46 @@ THE SOFTWARE.
|
||||
|
||||
/**
|
||||
* @defgroup KernelTest Kernel Functions Management
|
||||
* @{
|
||||
* This section describes the various kernel functions invocation.
|
||||
* @}
|
||||
*/
|
||||
|
||||
/**
|
||||
* @defgroup AtomicsTest Device Atomics
|
||||
* @{
|
||||
* This section describes the various kernel functions invocation.
|
||||
* This section describes tests for the Device Atomic APIs.
|
||||
* @}
|
||||
*/
|
||||
|
||||
/**
|
||||
* @addtogroup atomicExch atomicExch
|
||||
* @{
|
||||
* @ingroup AtomicsTest
|
||||
*/
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Compiles atomicExch with invalid parameters.
|
||||
* - Compiles the source with specialized Python tool.
|
||||
* -# Utilizes sub-process to invoke compilation of faulty source.
|
||||
* -# Performs post-processing of compiler output and counts errors.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - unit/atomics/CMakeLists.txt
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
// The body is empty: per the description above, the negative-parameter check is a compilation
// test whose source is listed as unit/atomics/CMakeLists.txt.
// NOTE(review): presumably this test case exists only to carry that documentation - confirm.
TEST_CASE("Unit_atomicExch_Negative_Parameters") {}
|
||||
/**
|
||||
* End doxygen group atomicExch.
|
||||
* @}
|
||||
*/
|
||||
|
||||
/**
|
||||
* End doxygen group AtomicsTest.
|
||||
* @}
|
||||
*/
|
||||
|
||||
@@ -115,7 +155,14 @@ THE SOFTWARE.
|
||||
* @}
|
||||
*/
|
||||
|
||||
/**
|
||||
/**
|
||||
* @defgroup PerformanceTest Performance tests
|
||||
* @{
|
||||
* This section describes performance tests for the target API groups and use-cases.
|
||||
* @}
|
||||
*/
|
||||
|
||||
/**
|
||||
* @defgroup ShflTest warp shuffle function Management
|
||||
* @{
|
||||
* This section describes the warp shuffle types & functions of HIP runtime API.
|
||||
|
||||
@@ -34,6 +34,7 @@ THE SOFTWARE.
|
||||
#include <resource_guards.hh>
|
||||
|
||||
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
|
||||
#pragma clang diagnostic ignored "-Wunused-parameter"
|
||||
#pragma clang diagnostic ignored "-Wunused-function"
|
||||
|
||||
#if defined(_WIN32)
|
||||
|
||||
@@ -29,10 +29,30 @@ enum class LinearAllocs {
|
||||
hipHostMalloc,
|
||||
hipMalloc,
|
||||
hipMallocManaged,
|
||||
noAlloc
|
||||
};
|
||||
|
||||
inline std::string to_string(const LinearAllocs allocation_type) {
|
||||
switch (allocation_type) {
|
||||
case LinearAllocs::malloc:
|
||||
return "host pageable";
|
||||
case LinearAllocs::mallocAndRegister:
|
||||
return "registered";
|
||||
case LinearAllocs::hipHostMalloc:
|
||||
return "host pinned";
|
||||
case LinearAllocs::hipMalloc:
|
||||
return "device malloc";
|
||||
case LinearAllocs::hipMallocManaged:
|
||||
return "managed";
|
||||
default:
|
||||
return "unknown alloc type";
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T> class LinearAllocGuard {
|
||||
public:
|
||||
LinearAllocGuard() = default;
|
||||
|
||||
LinearAllocGuard(const LinearAllocs allocation_type, const size_t size,
|
||||
const unsigned int flags = 0u)
|
||||
: allocation_type_{allocation_type} {
|
||||
@@ -55,15 +75,36 @@ template <typename T> class LinearAllocGuard {
|
||||
case LinearAllocs::hipMallocManaged:
|
||||
HIP_CHECK(hipMallocManaged(reinterpret_cast<void**>(&ptr_), size, flags ? flags : 1u));
|
||||
host_ptr_ = ptr_;
|
||||
break;
|
||||
case LinearAllocs::noAlloc:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
LinearAllocGuard(const LinearAllocGuard&) = delete;
|
||||
LinearAllocGuard(LinearAllocGuard&&) = delete;
|
||||
|
||||
// Move constructor: copies the allocation kind and both pointers out of `o`, then resets `o` to
// the noAlloc/nullptr state so that only the new guard refers to the allocation.
// Marked noexcept because it only copies an enum value and raw pointers.
LinearAllocGuard(LinearAllocGuard&& o) noexcept
    : allocation_type_{o.allocation_type_}, ptr_{o.ptr_}, host_ptr_{o.host_ptr_} {
  // With allocation_type_ == noAlloc the destructor's switch takes the no-op branch.
  o.allocation_type_ = LinearAllocs::noAlloc;
  o.ptr_ = nullptr;
  o.host_ptr_ = nullptr;
}
|
||||
|
||||
// Move assignment: takes over the allocation kind and pointers tracked by `o` and resets `o` to
// the noAlloc/nullptr state. Returns *this.
// Self-assignment is a no-op; without the guard the object would copy its own members and then
// null them, losing track of its allocation.
// NOTE(review): an allocation already held by *this is overwritten without being released here -
// confirm that callers only assign into empty (default-constructed or moved-from) guards.
LinearAllocGuard& operator=(LinearAllocGuard&& o) noexcept {
  if (this != &o) {
    allocation_type_ = o.allocation_type_;
    ptr_ = o.ptr_;
    host_ptr_ = o.host_ptr_;

    o.allocation_type_ = LinearAllocs::noAlloc;
    o.ptr_ = nullptr;
    o.host_ptr_ = nullptr;
  }

  // Previously control fell off the end of this reference-returning function, which is
  // undefined behavior.
  return *this;
}
|
||||
|
||||
~LinearAllocGuard() {
|
||||
// No Catch macros, don't want to possibly throw in the destructor
|
||||
switch (allocation_type_) {
|
||||
case LinearAllocs::noAlloc:
|
||||
break;
|
||||
case LinearAllocs::malloc:
|
||||
free(ptr_);
|
||||
break;
|
||||
@@ -85,7 +126,7 @@ template <typename T> class LinearAllocGuard {
|
||||
T* host_ptr() const { return host_ptr_; }
|
||||
|
||||
private:
|
||||
const LinearAllocs allocation_type_;
|
||||
LinearAllocs allocation_type_ = LinearAllocs::noAlloc;
|
||||
T* ptr_ = nullptr;
|
||||
T* host_ptr_ = nullptr;
|
||||
};
|
||||
@@ -200,7 +241,10 @@ enum class Streams { nullstream, perThread, created, withFlags, withPriority };
|
||||
|
||||
class StreamGuard {
|
||||
public:
|
||||
StreamGuard(const Streams stream_type, unsigned int flags = hipStreamDefault, int priority = 0) : stream_type_{stream_type}, flags_{flags}, priority_{priority} {
|
||||
StreamGuard() = default;
|
||||
|
||||
StreamGuard(const Streams stream_type, unsigned int flags = hipStreamDefault, int priority = 0)
|
||||
: stream_type_{stream_type}, flags_{flags}, priority_{priority} {
|
||||
switch (stream_type_) {
|
||||
case Streams::nullstream:
|
||||
stream_ = nullptr;
|
||||
@@ -219,7 +263,28 @@ class StreamGuard {
|
||||
}
|
||||
|
||||
StreamGuard(const StreamGuard&) = delete;
|
||||
StreamGuard(StreamGuard&&) = delete;
|
||||
|
||||
// Move constructor: copies the stream type, flags, priority and stream handle out of `o`, then
// resets `o` to nullstream / 0 / 0 / nullptr so that only the new guard refers to the stream.
// Marked noexcept because it only copies plain values and a handle.
StreamGuard(StreamGuard&& o) noexcept
    : stream_type_{o.stream_type_},
      flags_{o.flags_},
      priority_{o.priority_},
      stream_{o.stream_} {
  o.stream_type_ = Streams::nullstream;
  o.flags_ = 0u;
  o.priority_ = 0;
  o.stream_ = nullptr;
}
|
||||
|
||||
// Move assignment: takes over the stream type, flags, priority and stream handle of `o`, resets
// `o` to nullstream / 0 / 0 / nullptr, and returns *this.
// Self-assignment is a no-op; without the guard the object would copy its own members and then
// reset them, dropping its stream handle.
// NOTE(review): a stream already held by *this is overwritten without being destroyed here -
// confirm that callers only assign into empty (default-constructed or moved-from) guards.
StreamGuard& operator=(StreamGuard&& o) noexcept {
  if (this != &o) {
    stream_type_ = o.stream_type_;
    flags_ = o.flags_;
    priority_ = o.priority_;
    stream_ = o.stream_;

    o.stream_type_ = Streams::nullstream;
    o.flags_ = 0u;
    o.priority_ = 0;
    o.stream_ = nullptr;
  }

  return *this;
}
|
||||
|
||||
~StreamGuard() {
|
||||
if (stream_type_ == Streams::created) {
|
||||
@@ -230,23 +295,23 @@ class StreamGuard {
|
||||
hipStream_t stream() const { return stream_; }
|
||||
|
||||
private:
|
||||
const Streams stream_type_;
|
||||
unsigned int flags_;
|
||||
int priority_;
|
||||
hipStream_t stream_;
|
||||
Streams stream_type_ = Streams::nullstream;
|
||||
unsigned int flags_ = 0u;
|
||||
int priority_ = 0;
|
||||
hipStream_t stream_ = nullptr;
|
||||
};
|
||||
|
||||
class EventsGuard {
|
||||
public:
|
||||
public:
|
||||
EventsGuard(size_t N) : events_(N) {
|
||||
for (auto &e : events_) HIP_CHECK(hipEventCreate(&e));
|
||||
for (auto& e : events_) HIP_CHECK(hipEventCreate(&e));
|
||||
}
|
||||
|
||||
EventsGuard(const EventsGuard&) = delete;
|
||||
EventsGuard(EventsGuard&&) = delete;
|
||||
|
||||
~EventsGuard() {
|
||||
for (auto &e : events_) static_cast<void>(hipEventDestroy(e));
|
||||
for (auto& e : events_) static_cast<void>(hipEventDestroy(e));
|
||||
}
|
||||
|
||||
hipEvent_t& operator[](int index) { return events_[index]; }
|
||||
@@ -255,21 +320,21 @@ public:
|
||||
|
||||
std::vector<hipEvent_t>& event_list() { return events_; }
|
||||
|
||||
private:
|
||||
private:
|
||||
std::vector<hipEvent_t> events_;
|
||||
};
|
||||
|
||||
class StreamsGuard {
|
||||
public:
|
||||
public:
|
||||
StreamsGuard(size_t N) : streams_(N) {
|
||||
for (auto &s : streams_) HIP_CHECK(hipStreamCreate(&s));
|
||||
for (auto& s : streams_) HIP_CHECK(hipStreamCreate(&s));
|
||||
}
|
||||
|
||||
StreamsGuard(const StreamsGuard&) = delete;
|
||||
StreamsGuard(StreamsGuard&&) = delete;
|
||||
|
||||
~StreamsGuard() {
|
||||
for (auto &s : streams_) static_cast<void>(hipStreamDestroy(s));
|
||||
for (auto& s : streams_) static_cast<void>(hipStreamDestroy(s));
|
||||
}
|
||||
|
||||
hipStream_t& operator[](int index) { return streams_[index]; }
|
||||
@@ -278,6 +343,6 @@ public:
|
||||
|
||||
std::vector<hipStream_t>& stream_list() { return streams_; }
|
||||
|
||||
private:
|
||||
private:
|
||||
std::vector<hipStream_t> streams_;
|
||||
};
|
||||
|
||||
@@ -18,5 +18,6 @@
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
add_subdirectory(stream)
|
||||
add_subdirectory(event)
|
||||
add_subdirectory(example)
|
||||
|
||||
@@ -0,0 +1,63 @@
|
||||
# Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
# Source list for the stream performance tests.
# The AMD list contains everything in the non-AMD list plus the CU-mask extension benchmarks
# (hipExtStream*) and the memory-pool benchmarks (hipMemPool*, hipMallocFromPoolAsync).
if(HIP_PLATFORM MATCHES "amd")
  set(TEST_SRC
      hipStreamWaitEvent.cc
      hipStreamGetFlags.cc
      hipStreamGetPriority.cc
      hipExtStreamCreateWithCUMask.cc
      hipExtStreamGetCUMask.cc
      hipStreamAddCallback.cc
      hipStreamWaitValue.cc
      hipStreamWriteValue.cc
      hipMallocAsync.cc
      hipFreeAsync.cc
      hipMemPoolCreate.cc
      hipMemPoolDestroy.cc
      hipMemPoolTrimTo.cc
      hipMemPoolSetAttribute.cc
      hipMemPoolGetAttribute.cc
      hipMemPoolSetAccess.cc
      hipMallocFromPoolAsync.cc
      hipMemPoolExportToShareableHandle.cc
      hipMemPoolImportFromShareableHandle.cc
      hipMemPoolExportPointer.cc
      hipMemPoolImportPointer.cc
      hipStreamBasic.cc)
else()
  set(TEST_SRC
      hipStreamWaitEvent.cc
      hipStreamGetFlags.cc
      hipStreamGetPriority.cc
      hipStreamAddCallback.cc
      hipStreamWaitValue.cc
      hipStreamWriteValue.cc
      hipMallocAsync.cc
      hipFreeAsync.cc
      hipStreamBasic.cc)
endif()

# Register the StreamPerformance executable with the build_tests target, compiled as C++17.
hip_add_exe_to_target(
  NAME StreamPerformance
  TEST_SRC ${TEST_SRC}
  TEST_TARGET_NAME build_tests
  COMPILE_OPTIONS -std=c++17)
|
||||
@@ -0,0 +1,65 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <performance_common.hh>
|
||||
|
||||
/**
|
||||
* @addtogroup stream stream
|
||||
* @{
|
||||
* @ingroup PerformanceTest
|
||||
* Contains performance tests for all stream management HIP APIs.
|
||||
*/
|
||||
|
||||
// Benchmark body for hipExtStreamCreateWithCUMask.
// Builds a zero-initialized CU mask with one 32-bit entry per multiprocessor reported for
// device 0, creates a stream with that mask inside TIMED_SECTION(kTimerTypeCpu), and destroys
// the stream afterwards.
class ExtStreamCreateWithCUMaskBenchmark : public Benchmark<ExtStreamCreateWithCUMaskBenchmark> {
 public:
  void operator()() {
    hipDeviceProp_t device_props;
    HIP_CHECK(hipGetDeviceProperties(&device_props, 0));

    hipStream_t masked_stream{};
    std::vector<uint32_t> mask_words(device_props.multiProcessorCount, 0);

    // Only the stream creation sits inside the timed section.
    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipExtStreamCreateWithCUMask(&masked_stream, mask_words.size(), mask_words.data()));
    }

    // The stream is destroyed after the timed section.
    HIP_CHECK(hipStreamDestroy(masked_stream));
  }
};
|
||||
|
||||
static void RunBenchmark() {
|
||||
ExtStreamCreateWithCUMaskBenchmark benchmark;
|
||||
benchmark.Run();
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes `hipExtStreamCreateWithCUMask`.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - performance/stream/hipExtStreamCreateWithCUMask.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - Platform specific (AMD)
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Performance_hipExtStreamCreateWithCUMask") {
  // Thin wrapper: the measurement is implemented in the file-local RunBenchmark().
  RunBenchmark();
}
|
||||
@@ -0,0 +1,67 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <performance_common.hh>
|
||||
|
||||
/**
|
||||
* @addtogroup stream stream
|
||||
* @{
|
||||
* @ingroup PerformanceTest
|
||||
*/
|
||||
|
||||
// Benchmark body for hipExtStreamGetCUMask.
// Creates a stream from a zero-initialized CU mask (one 32-bit entry per multiprocessor reported
// for device 0), reads the mask back into a second buffer of the same length inside
// TIMED_SECTION(kTimerTypeCpu), and destroys the stream afterwards.
class ExtStreamGetCUMaskBenchmark : public Benchmark<ExtStreamGetCUMaskBenchmark> {
 public:
  void operator()() {
    hipDeviceProp_t device_props;
    HIP_CHECK(hipGetDeviceProperties(&device_props, 0));

    // Setup before the timed section: the stream to query and the buffer receiving its mask.
    std::vector<uint32_t> requested_mask(device_props.multiProcessorCount, 0);
    hipStream_t masked_stream{};
    HIP_CHECK(hipExtStreamCreateWithCUMask(&masked_stream, requested_mask.size(),
                                           requested_mask.data()));
    std::vector<uint32_t> queried_mask(requested_mask.size(), 0);

    // Only the mask query sits inside the timed section.
    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipExtStreamGetCUMask(masked_stream, queried_mask.size(), queried_mask.data()));
    }

    HIP_CHECK(hipStreamDestroy(masked_stream));
  }
};
|
||||
|
||||
static void RunBenchmark() {
|
||||
ExtStreamGetCUMaskBenchmark benchmark;
|
||||
benchmark.Run();
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes `hipExtStreamGetCUMask`.
|
||||
* - Creates a stream with a basic (all-zero) CU mask and reads the mask back into a new buffer.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - performance/stream/hipExtStreamGetCUMask.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - Platform specific (AMD)
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Performance_hipExtStreamGetCUMask") {
  // Thin wrapper: the measurement is implemented in the file-local RunBenchmark().
  RunBenchmark();
}
|
||||
@@ -0,0 +1,69 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <performance_common.hh>
|
||||
|
||||
/**
|
||||
* @addtogroup stream stream
|
||||
* @{
|
||||
* @ingroup PerformanceTest
|
||||
*/
|
||||
|
||||
/**
 * Benchmark functor that times `hipFreeAsync` on a created stream.
 * The allocation is made with `hipMallocAsync` outside the timed section so
 * that only the free call is enclosed by the timer.
 */
class FreeAsyncBenchmark : public Benchmark<FreeAsyncBenchmark> {
 public:
  /// @param array_size number of `float` elements to allocate and then free.
  void operator()(const size_t array_size) {
    const StreamGuard guard{Streams::created};
    const hipStream_t stream = guard.stream();

    const size_t byte_count = array_size * sizeof(float);
    float* device_buffer{nullptr};
    HIP_CHECK(hipMallocAsync(reinterpret_cast<void**>(&device_buffer), byte_count, stream));

    // Only the asynchronous free is measured.
    TIMED_SECTION_STREAM(kTimerTypeEvent, stream) {
      HIP_CHECK(hipFreeAsync(device_buffer, stream));
    }

    // Drain the stream before the guard goes out of scope.
    HIP_CHECK(hipStreamSynchronize(stream));
  }
};
|
||||
|
||||
static void RunBenchmark(const size_t array_size) {
|
||||
FreeAsyncBenchmark benchmark;
|
||||
benchmark.AddSectionName(std::to_string(array_size));
|
||||
benchmark.Run(array_size);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes `hipFreeAsync` with created stream:
|
||||
* -# Allocation size:
|
||||
* - 4 KB
|
||||
* - 4 MB
|
||||
* - 16 MB
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - performance/stream/hipFreeAsync.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Performance_hipFreeAsync") {
  // One benchmark run per generated allocation size.
  const size_t element_count = GENERATE(4_KB, 4_MB, 16_MB);
  RunBenchmark(element_count);
}
|
||||
@@ -0,0 +1,68 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <performance_common.hh>
|
||||
|
||||
/**
|
||||
* @addtogroup stream stream
|
||||
* @{
|
||||
* @ingroup PerformanceTest
|
||||
*/
|
||||
|
||||
/**
 * Benchmark functor that times `hipMallocAsync` on a created stream.
 * The allocation is released with `hipFree` after the stream is synchronized,
 * outside the timed section.
 */
class MallocAsyncBenchmark : public Benchmark<MallocAsyncBenchmark> {
 public:
  /// @param array_size number of `float` elements to allocate.
  void operator()(const size_t array_size) {
    const StreamGuard guard{Streams::created};
    const hipStream_t stream = guard.stream();

    const size_t byte_count = array_size * sizeof(float);
    float* device_buffer{nullptr};

    // Only the asynchronous allocation is measured.
    TIMED_SECTION_STREAM(kTimerTypeEvent, stream) {
      HIP_CHECK(hipMallocAsync(reinterpret_cast<void**>(&device_buffer), byte_count, stream));
    }

    HIP_CHECK(hipStreamSynchronize(stream));
    HIP_CHECK(hipFree(device_buffer));
  }
};
|
||||
|
||||
static void RunBenchmark(const size_t array_size) {
|
||||
MallocAsyncBenchmark benchmark;
|
||||
benchmark.AddSectionName(std::to_string(array_size));
|
||||
benchmark.Run(array_size);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes `hipMallocAsync` with created stream:
|
||||
* -# Allocation size:
|
||||
* - 4 KB
|
||||
* - 4 MB
|
||||
* - 16 MB
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - performance/stream/hipMallocAsync.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Performance_hipMallocAsync") {
  // One benchmark run per generated allocation size.
  const size_t element_count = GENERATE(4_KB, 4_MB, 16_MB);
  RunBenchmark(element_count);
}
|
||||
@@ -0,0 +1,82 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "mem_pools_performance_common.hh"
|
||||
|
||||
/**
|
||||
* @addtogroup stream stream
|
||||
* @{
|
||||
* @ingroup PerformanceTest
|
||||
*/
|
||||
|
||||
/**
 * Benchmark functor that times `hipMallocFromPoolAsync` on a created stream,
 * allocating from a freshly created memory pool on device 0.
 */
class MallocFromPoolAsyncBenchmark : public Benchmark<MallocFromPoolAsyncBenchmark> {
 public:
  /// @param array_size number of `float` elements to allocate from the pool.
  void operator()(const size_t array_size) {
    const StreamGuard guard{Streams::created};
    const hipStream_t stream = guard.stream();

    float* pool_allocation{nullptr};
    const size_t byte_count = array_size * sizeof(float);

    // Pool setup is outside the timed section.
    hipMemPool_t pool{nullptr};
    hipMemPoolProps props = CreateMemPoolProps(0, hipMemHandleTypeNone);
    HIP_CHECK(hipMemPoolCreate(&pool, &props));

    TIMED_SECTION_STREAM(kTimerTypeEvent, stream) {
      HIP_CHECK(hipMallocFromPoolAsync(&pool_allocation, byte_count, pool, stream));
    }

    REQUIRE(pool_allocation != nullptr);

    // Release the allocation and wait for it before destroying the pool.
    HIP_CHECK(hipFreeAsync(pool_allocation, stream));
    HIP_CHECK(hipStreamSynchronize(stream));
    HIP_CHECK(hipMemPoolDestroy(pool));
  }
};
|
||||
|
||||
static void RunBenchmark(const size_t array_size) {
|
||||
MallocFromPoolAsyncBenchmark benchmark;
|
||||
benchmark.AddSectionName(std::to_string(array_size));
|
||||
benchmark.Run(array_size);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes `hipMallocFromPoolAsync`:
|
||||
* -# Allocation size:
|
||||
* - 4 KB
|
||||
* - 4 MB
|
||||
* - 16 MB
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - performance/stream/hipMallocFromPoolAsync.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - Device supports memory pools
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Performance_hipMallocFromPoolAsync") {
  // Skip when device 0 reports no memory pool support.
  const bool pools_supported = AreMemPoolsSupported(0);
  if (!pools_supported) {
    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
                           "attribute. Hence skipping the testing with Pass result.\n");
    return;
  }

  // One benchmark run per generated allocation size.
  const size_t element_count = GENERATE(4_KB, 4_MB, 16_MB);
  RunBenchmark(element_count);
}
|
||||
@@ -0,0 +1,71 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "mem_pools_performance_common.hh"
|
||||
|
||||
/**
|
||||
* @addtogroup stream stream
|
||||
* @{
|
||||
* @ingroup PerformanceTest
|
||||
*/
|
||||
|
||||
/**
 * Benchmark functor that times `hipMemPoolCreate` for a pool on device 0.
 * The created pool is destroyed outside the timed section.
 */
class MemPoolCreateBenchmark : public Benchmark<MemPoolCreateBenchmark> {
 public:
  void operator()() {
    hipMemPoolProps props = CreateMemPoolProps(0, hipMemHandleTypeNone);
    hipMemPool_t pool{nullptr};

    // Only pool creation is measured.
    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipMemPoolCreate(&pool, &props));
    }

    REQUIRE(pool != nullptr);
    HIP_CHECK(hipMemPoolDestroy(pool));
  }
};
|
||||
|
||||
static void RunBenchmark() {
|
||||
MemPoolCreateBenchmark benchmark;
|
||||
benchmark.Run();
|
||||
}
|
||||
|
||||
/**
|
||||
* @warning **MemPool APIs are not fully implemented within current version
|
||||
* of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
|
||||
* Therefore, all tests related to MemPool APIs are implemented without formal
|
||||
* verification and will be verified once HIP fully supports MemPool APIs.**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes `hipMemPoolCreate`.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - performance/stream/hipMemPoolCreate.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - Device supports memory pools
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Performance_hipMemPoolCreate") {
  // Skip when device 0 reports no memory pool support.
  const bool pools_supported = AreMemPoolsSupported(0);
  if (!pools_supported) {
    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
                           "attribute. Hence skipping the testing with Pass result.\n");
    return;
  }

  RunBenchmark();
}
|
||||
@@ -0,0 +1,70 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "mem_pools_performance_common.hh"
|
||||
|
||||
/**
|
||||
* @addtogroup stream stream
|
||||
* @{
|
||||
* @ingroup PerformanceTest
|
||||
*/
|
||||
|
||||
/**
 * Benchmark functor that times `hipMemPoolDestroy`.
 * A pool on device 0 is created outside the timed section and then destroyed
 * inside it.
 */
class MemPoolDestroyBenchmark : public Benchmark<MemPoolDestroyBenchmark> {
 public:
  void operator()() {
    hipMemPoolProps props = CreateMemPoolProps(0, hipMemHandleTypeNone);
    hipMemPool_t pool{nullptr};
    HIP_CHECK(hipMemPoolCreate(&pool, &props));

    // Only pool destruction is measured.
    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipMemPoolDestroy(pool));
    }
  }
};
|
||||
|
||||
static void RunBenchmark() {
|
||||
MemPoolDestroyBenchmark benchmark;
|
||||
benchmark.Run();
|
||||
}
|
||||
|
||||
/**
|
||||
* @warning **MemPool APIs are not fully implemented within current version
|
||||
* of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
|
||||
* Therefore, all tests related to MemPool APIs are implemented without formal
|
||||
* verification and will be verified once HIP fully supports MemPool APIs.**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Creates new mem pool.
|
||||
* - Executes `hipMemPoolDestroy`.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - performance/stream/hipMemPoolDestroy.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - Device supports memory pools
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Performance_hipMemPoolDestroy") {
  // Skip when device 0 reports no memory pool support.
  const bool pools_supported = AreMemPoolsSupported(0);
  if (!pools_supported) {
    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
                           "attribute. Hence skipping the testing with Pass result.\n");
    return;
  }

  RunBenchmark();
}
|
||||
@@ -0,0 +1,84 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "mem_pools_performance_common.hh"
|
||||
|
||||
/**
|
||||
* @addtogroup stream stream
|
||||
* @{
|
||||
* @ingroup PerformanceTest
|
||||
*/
|
||||
|
||||
/**
 * Benchmark functor that times `hipMemPoolExportPointer`.
 * A pool is created with `kHandleType`, an allocation is taken from it on the
 * null stream, and only the export of that allocation is timed.
 */
class MemPoolExportPointerBenchmark : public Benchmark<MemPoolExportPointerBenchmark> {
 public:
  /// @param array_size number of `float` elements in the exported allocation.
  void operator()(const size_t array_size) {
    hipMemPoolPtrExportData export_data;
    hipMemPool_t pool{nullptr};
    float* pool_allocation{nullptr};
    const size_t byte_count = array_size * sizeof(float);

    // Setup: pool creation and allocation, completed before timing starts.
    hipMemPoolProps pool_props = CreateMemPoolProps(0, kHandleType);
    HIP_CHECK(hipMemPoolCreate(&pool, &pool_props));
    HIP_CHECK(hipMallocFromPoolAsync(&pool_allocation, byte_count, pool, nullptr));
    HIP_CHECK(hipStreamSynchronize(nullptr));

    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipMemPoolExportPointer(&export_data, pool_allocation));
    }

    // Teardown.
    HIP_CHECK(hipFreeAsync(pool_allocation, nullptr));
    HIP_CHECK(hipMemPoolDestroy(pool));
  }
};
|
||||
|
||||
static void RunBenchmark(const size_t array_size) {
|
||||
MemPoolExportPointerBenchmark benchmark;
|
||||
benchmark.AddSectionName(std::to_string(array_size));
|
||||
benchmark.Run(array_size);
|
||||
}
|
||||
|
||||
/**
|
||||
* @warning **MemPool APIs are not fully implemented within current version
|
||||
* of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
|
||||
* Therefore, all tests related to MemPool APIs are implemented without formal
|
||||
* verification and will be verified once HIP fully supports MemPool APIs.**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes `hipMemPoolExportPointer`:
|
||||
* -# Allocation size:
|
||||
* - 4 KB
|
||||
* - 4 MB
|
||||
* - 16 MB
|
||||
* - Uses the same process for import and export operations.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - performance/stream/hipMemPoolExportPointer.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - Device supports memory pools
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Performance_hipMemPoolExportPointer") {
  // Skip when device 0 reports no memory pool support.
  const bool pools_supported = AreMemPoolsSupported(0);
  if (!pools_supported) {
    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
                           "attribute. Hence skipping the testing with Pass result.\n");
    return;
  }

  // One benchmark run per generated allocation size.
  const size_t element_count = GENERATE(4_KB, 4_MB, 16_MB);
  RunBenchmark(element_count);
}
|
||||
@@ -0,0 +1,74 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "mem_pools_performance_common.hh"
|
||||
|
||||
/**
|
||||
* @addtogroup stream stream
|
||||
* @{
|
||||
* @ingroup PerformanceTest
|
||||
*/
|
||||
|
||||
/**
 * Benchmark functor that times `hipMemPoolExportToShareableHandle` for a pool
 * created on device 0 with `kHandleType`.
 */
class MemPoolExportToShareableHandleBenchmark
    : public Benchmark<MemPoolExportToShareableHandleBenchmark> {
 public:
  void operator()() {
    int shareable_handle;  // written by the export call below
    hipMemPool_t pool{nullptr};

    // Pool creation is outside the timed section.
    hipMemPoolProps pool_props = CreateMemPoolProps(0, kHandleType);
    HIP_CHECK(hipMemPoolCreate(&pool, &pool_props));

    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipMemPoolExportToShareableHandle(&shareable_handle, pool, kHandleType, 0));
    }

    HIP_CHECK(hipMemPoolDestroy(pool));
  }
};
|
||||
|
||||
static void RunBenchmark() {
|
||||
MemPoolExportToShareableHandleBenchmark benchmark;
|
||||
benchmark.Run();
|
||||
}
|
||||
|
||||
/**
|
||||
* @warning **MemPool APIs are not fully implemented within current version
|
||||
* of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
|
||||
* Therefore, all tests related to MemPool APIs are implemented without formal
|
||||
* verification and will be verified once HIP fully supports MemPool APIs.**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes `hipMemPoolExportToShareableHandle`.
|
||||
* - Uses the same process for import and export operations.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - performance/stream/hipMemPoolExportToShareableHandle.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - Device supports memory pools
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Performance_hipMemPoolExportToShareableHandle") {
  // Skip when device 0 reports no memory pool support.
  const bool pools_supported = AreMemPoolsSupported(0);
  if (!pools_supported) {
    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
                           "attribute. Hence skipping the testing with Pass result.\n");
    return;
  }

  RunBenchmark();
}
|
||||
@@ -0,0 +1,76 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "mem_pools_performance_common.hh"
|
||||
|
||||
/**
|
||||
* @addtogroup stream stream
|
||||
* @{
|
||||
* @ingroup PerformanceTest
|
||||
*/
|
||||
|
||||
/**
 * Benchmark functor that times `hipMemPoolGetAccess`, querying the access
 * flags of a freshly created pool for device 0.
 */
class MemPoolGetAccessBenchmark : public Benchmark<MemPoolGetAccessBenchmark> {
 public:
  void operator()() {
    // Pool creation is outside the timed section.
    hipMemPoolProps props = CreateMemPoolProps(0, hipMemHandleTypeNone);
    hipMemPool_t pool{nullptr};
    HIP_CHECK(hipMemPoolCreate(&pool, &props));

    // Query target: device-type location with id 0.
    hipMemLocation device_location = {hipMemLocationTypeDevice, 0};
    hipMemAccessFlags access_flags = hipMemAccessFlagsProtNone;

    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipMemPoolGetAccess(&access_flags, pool, device_location));
    }

    HIP_CHECK(hipMemPoolDestroy(pool));
  }
};
|
||||
|
||||
static void RunBenchmark() {
|
||||
MemPoolGetAccessBenchmark benchmark;
|
||||
benchmark.Run();
|
||||
}
|
||||
|
||||
/**
|
||||
* @warning **MemPool APIs are not fully implemented within current version
|
||||
* of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
|
||||
* Therefore, all tests related to MemPool APIs are implemented without formal
|
||||
* verification and will be verified once HIP fully supports MemPool APIs.**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes `hipMemPoolGetAccess`.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - performance/stream/hipMemPoolGetAccess.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - Device supports memory pools
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Performance_hipMemPoolGetAccess") {
  // Skip when device 0 reports no memory pool support.
  const bool pools_supported = AreMemPoolsSupported(0);
  if (!pools_supported) {
    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
                           "attribute. Hence skipping the testing with Pass result.\n");
    return;
  }

  RunBenchmark();
}
|
||||
@@ -0,0 +1,83 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "mem_pools_performance_common.hh"
|
||||
|
||||
/**
|
||||
* @addtogroup stream stream
|
||||
* @{
|
||||
* @ingroup PerformanceTest
|
||||
*/
|
||||
|
||||
/**
 * Benchmark functor that times `hipMemPoolGetAttribute` on a freshly created
 * pool for device 0.
 */
class MemPoolGetAttributeBenchmark : public Benchmark<MemPoolGetAttributeBenchmark> {
 public:
  /// @param attribute pool attribute whose value is queried in the timed section.
  void operator()(const hipMemPoolAttr attribute) {
    // Pool creation is outside the timed section.
    hipMemPoolProps props = CreateMemPoolProps(0, hipMemHandleTypeNone);
    hipMemPool_t pool{nullptr};
    HIP_CHECK(hipMemPoolCreate(&pool, &props));

    uint64_t attribute_value{0};

    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipMemPoolGetAttribute(pool, attribute, &attribute_value));
    }

    HIP_CHECK(hipMemPoolDestroy(pool));
  }
};
|
||||
|
||||
static void RunBenchmark(const hipMemPoolAttr attribute) {
|
||||
MemPoolGetAttributeBenchmark benchmark;
|
||||
benchmark.AddSectionName(GetMemPoolAttrSectionName(attribute));
|
||||
benchmark.Run(attribute);
|
||||
}
|
||||
|
||||
/**
|
||||
* @warning **MemPool APIs are not fully implemented within current version
|
||||
* of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
|
||||
* Therefore, all tests related to MemPool APIs are implemented without formal
|
||||
* verification and will be verified once HIP fully supports MemPool APIs.**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes `hipMemPoolGetAttribute`:
|
||||
* -# Supported attributes:
|
||||
* - `hipMemPoolAttrReleaseThreshold`
|
||||
* - `hipMemPoolReuseFollowEventDependencies`
|
||||
* - `hipMemPoolReuseAllowOpportunistic`
|
||||
* - `hipMemPoolReuseAllowInternalDependencies`
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - performance/stream/hipMemPoolGetAttribute.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - Device supports memory pools
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Performance_hipMemPoolGetAttribute") {
  // Skip when device 0 reports no memory pool support.
  const bool pools_supported = AreMemPoolsSupported(0);
  if (!pools_supported) {
    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
                           "attribute. Hence skipping the testing with Pass result.\n");
    return;
  }

  // One benchmark run per generated attribute.
  const hipMemPoolAttr pool_attribute = GENERATE(hipMemPoolAttrReleaseThreshold,
                                                 hipMemPoolReuseFollowEventDependencies,
                                                 hipMemPoolReuseAllowOpportunistic,
                                                 hipMemPoolReuseAllowInternalDependencies);
  RunBenchmark(pool_attribute);
}
|
||||
@@ -0,0 +1,75 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "mem_pools_performance_common.hh"
|
||||
|
||||
/**
|
||||
* @addtogroup stream stream
|
||||
* @{
|
||||
* @ingroup PerformanceTest
|
||||
*/
|
||||
|
||||
/**
 * Benchmark functor that times `hipMemPoolImportFromShareableHandle`.
 * A pool is created with `kHandleType` and exported to a shareable handle
 * outside the timed section; only the import of that handle is timed.
 */
class MemPoolImportFromShareableHandleBenchmark
    : public Benchmark<MemPoolImportFromShareableHandleBenchmark> {
 public:
  void operator()() {
    hipMemPool_t mem_pool{nullptr};
    // Separate handle for the imported pool. Importing into `mem_pool` itself
    // would overwrite the only handle to the pool created below, so that pool
    // could never be destroyed.
    hipMemPool_t imported_pool{nullptr};
    int share_handle{};

    hipMemPoolProps props = CreateMemPoolProps(0, kHandleType);
    HIP_CHECK(hipMemPoolCreate(&mem_pool, &props));
    HIP_CHECK(hipMemPoolExportToShareableHandle(&share_handle, mem_pool, kHandleType, 0));

    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipMemPoolImportFromShareableHandle(&imported_pool, &share_handle, kHandleType, 0));
    }

    // NOTE(review): assumes the import yields a pool object distinct from the
    // exported one, so each is destroyed once -- confirm against the runtime.
    if (imported_pool != mem_pool) {
      HIP_CHECK(hipMemPoolDestroy(imported_pool));
    }
    HIP_CHECK(hipMemPoolDestroy(mem_pool));
  }
};
|
||||
|
||||
static void RunBenchmark() {
|
||||
MemPoolImportFromShareableHandleBenchmark benchmark;
|
||||
benchmark.Run();
|
||||
}
|
||||
|
||||
/**
|
||||
* @warning **MemPool APIs are not fully implemented within current version
|
||||
* of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
|
||||
* Therefore, all tests related to MemPool APIs are implemented without formal
|
||||
* verification and will be verified once HIP fully supports MemPool APIs.**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes `hipMemPoolImportFromShareableHandle`.
|
||||
* - Uses the same process for import and export operations.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - performance/stream/hipMemPoolImportFromShareableHandle.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - Device supports memory pools
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Performance_hipMemPoolImportFromShareableHandle") {
  // Skip when device 0 reports no memory pool support.
  const bool pools_supported = AreMemPoolsSupported(0);
  if (!pools_supported) {
    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
                           "attribute. Hence skipping the testing with Pass result.\n");
    return;
  }

  RunBenchmark();
}
|
||||
@@ -0,0 +1,87 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "mem_pools_performance_common.hh"
|
||||
|
||||
/**
|
||||
* @addtogroup stream stream
|
||||
* @{
|
||||
* @ingroup PerformanceTest
|
||||
*/
|
||||
|
||||
// Benchmark functor for hipMemPoolImportPointer.
//
// Each invocation creates a memory pool, allocates `array_size` floats from it,
// exports the allocation and then imports it back inside the CPU-timed section.
// All resources are released before returning.
class MemPoolImportPointerBenchmark : public Benchmark<MemPoolImportPointerBenchmark> {
 public:
  // array_size: number of float elements allocated from the pool.
  void operator()(const size_t array_size) {
    float* device_ptr{nullptr};
    float* device_ptr_import{nullptr};
    hipMemPool_t mem_pool{nullptr};
    hipMemPoolPtrExportData exp_data;

    hipMemPoolProps props = CreateMemPoolProps(0, kHandleType);
    HIP_CHECK(hipMemPoolCreate(&mem_pool, &props));
    HIP_CHECK(hipMallocFromPoolAsync(&device_ptr, array_size * sizeof(float), mem_pool, nullptr));
    // Make sure the allocation has completed before it is exported.
    HIP_CHECK(hipStreamSynchronize(nullptr));
    HIP_CHECK(hipMemPoolExportPointer(&exp_data, device_ptr));

    TIMED_SECTION(kTimerTypeCpu) {
      // The API writes the imported address through its first argument, so the
      // ADDRESS of device_ptr_import must be passed. Casting the (null) pointer
      // value itself would hand the API a null output location.
      HIP_CHECK(hipMemPoolImportPointer(reinterpret_cast<void**>(&device_ptr_import), mem_pool,
                                        &exp_data));
    }

    HIP_CHECK(hipFree(device_ptr));
    HIP_CHECK(hipFree(device_ptr_import));
    HIP_CHECK(hipMemPoolDestroy(mem_pool));
  }
};
|
||||
|
||||
static void RunBenchmark(const size_t array_size) {
|
||||
MemPoolImportPointerBenchmark benchmark;
|
||||
benchmark.AddSectionName(std::to_string(array_size));
|
||||
benchmark.Run(array_size);
|
||||
}
|
||||
|
||||
/**
|
||||
* @warning **MemPool APIs are not fully implemented within current version
|
||||
* of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
|
||||
* Therefore, all tests related to MemPool APIs are implemented without formal
|
||||
* verification and will be verified once HIP fully supports MemPool APIs.**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes `hipMemPoolImportPointer`:
|
||||
* -# Allocation size:
|
||||
* - 4 KB
|
||||
* - 4 MB
|
||||
* - 16 MB
|
||||
* - Uses the same process for import and export operations.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - performance/stream/hipMemPoolImportPointer.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - Device supports memory pools
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Performance_hipMemPoolImportPointer") {
  // Sizes are generated only after the support check passes, so an unsupported
  // device produces a single skipped run.
  if (AreMemPoolsSupported(0)) {
    const size_t array_size = GENERATE(4_KB, 4_MB, 16_MB);
    RunBenchmark(array_size);
    return;
  }

  HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
                         "attribute. Hence skipping the testing with Pass result.\n");
}
|
||||
@@ -0,0 +1,79 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "mem_pools_performance_common.hh"
|
||||
|
||||
/**
|
||||
* @addtogroup stream stream
|
||||
* @{
|
||||
* @ingroup PerformanceTest
|
||||
*/
|
||||
|
||||
// Benchmark functor for hipMemPoolSetAccess: a fresh pool is created, a single
// read/write access descriptor for device 0 is applied inside the CPU-timed
// section, and the pool is destroyed afterwards.
class MemPoolSetAccessBenchmark : public Benchmark<MemPoolSetAccessBenchmark> {
 public:
  void operator()() {
    // Descriptor: location = {device, id 0}, flags = read/write.
    hipMemAccessDesc desc_list{};
    desc_list.location.type = hipMemLocationTypeDevice;
    desc_list.location.id = 0;
    desc_list.flags = hipMemAccessFlagsProtReadWrite;

    hipMemPool_t mem_pool{nullptr};
    hipMemPoolProps pool_props = CreateMemPoolProps(0, hipMemHandleTypeNone);
    HIP_CHECK(hipMemPoolCreate(&mem_pool, &pool_props));

    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipMemPoolSetAccess(mem_pool, &desc_list, 1));
    }

    HIP_CHECK(hipMemPoolDestroy(mem_pool));
  }
};
|
||||
|
||||
static void RunBenchmark() {
|
||||
MemPoolSetAccessBenchmark benchmark;
|
||||
benchmark.Run();
|
||||
}
|
||||
|
||||
/**
|
||||
* @warning **MemPool APIs are not fully implemented within current version
|
||||
* of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
|
||||
* Therefore, all tests related to MemPool APIs are implemented without formal
|
||||
* verification and will be verified once HIP fully supports MemPool APIs.**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes `hipMemPoolSetAccess` with `hipMemAccessFlagsProtReadWrite`.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - performance/stream/hipMemPoolSetAccess.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - Device supports memory pools
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Performance_hipMemPoolSetAccess") {
  // Benchmark only when device 0 reports memory pool support; otherwise skip.
  if (AreMemPoolsSupported(0)) {
    RunBenchmark();
    return;
  }

  HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
                         "attribute. Hence skipping the testing with Pass result.\n");
}
|
||||
@@ -0,0 +1,83 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "mem_pools_performance_common.hh"
|
||||
|
||||
/**
|
||||
* @addtogroup stream stream
|
||||
* @{
|
||||
* @ingroup PerformanceTest
|
||||
*/
|
||||
|
||||
// Benchmark functor for hipMemPoolSetAttribute.
//
// Each invocation creates a pool, sets `attribute` to zero inside the CPU-timed
// section and destroys the pool again.
class MemPoolSetAttributeBenchmark : public Benchmark<MemPoolSetAttributeBenchmark> {
 public:
  // attribute: the memory pool attribute to set.
  void operator()(const hipMemPoolAttr attribute) {
    hipMemPool_t mem_pool{nullptr};
    hipMemPoolProps pool_props = CreateMemPoolProps(0, hipMemHandleTypeNone);
    HIP_CHECK(hipMemPoolCreate(&mem_pool, &pool_props));

    // hipMemPoolAttrReleaseThreshold is documented as a 64-bit value while the
    // reuse-policy attributes are plain ints. Backing the argument with a
    // zero-filled 64-bit object keeps the read in bounds for both widths: an
    // all-zero buffer reads as 0 whichever width the runtime uses.
    unsigned long long value{0};

    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipMemPoolSetAttribute(mem_pool, attribute, &value));
    }

    HIP_CHECK(hipMemPoolDestroy(mem_pool));
  }
};
|
||||
|
||||
static void RunBenchmark(const hipMemPoolAttr attribute) {
|
||||
MemPoolSetAttributeBenchmark benchmark;
|
||||
benchmark.AddSectionName(GetMemPoolAttrSectionName(attribute));
|
||||
benchmark.Run(attribute);
|
||||
}
|
||||
|
||||
/**
|
||||
* @warning **MemPool APIs are not fully implemented within current version
|
||||
* of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
|
||||
* Therefore, all tests related to MemPool APIs are implemented without formal
|
||||
* verification and will be verified once HIP fully supports MemPool APIs.**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes `hipMemPoolSetAttribute`:
|
||||
* -# Supported attributes:
|
||||
* - `hipMemPoolAttrReleaseThreshold`
|
||||
* - `hipMemPoolReuseFollowEventDependencies`
|
||||
* - `hipMemPoolReuseAllowOpportunistic`
|
||||
* - `hipMemPoolReuseAllowInternalDependencies`
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - performance/stream/hipMemPoolSetAttribute.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - Device supports memory pools
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Performance_hipMemPoolSetAttribute") {
  // Attributes are generated only after the support check passes, so an
  // unsupported device produces a single skipped run.
  if (AreMemPoolsSupported(0)) {
    const hipMemPoolAttr attribute =
        GENERATE(hipMemPoolAttrReleaseThreshold, hipMemPoolReuseFollowEventDependencies,
                 hipMemPoolReuseAllowOpportunistic, hipMemPoolReuseAllowInternalDependencies);
    RunBenchmark(attribute);
    return;
  }

  HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
                         "attribute. Hence skipping the testing with Pass result.\n");
}
|
||||
@@ -0,0 +1,77 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "mem_pools_performance_common.hh"
|
||||
|
||||
/**
|
||||
* @addtogroup stream stream
|
||||
* @{
|
||||
* @ingroup PerformanceTest
|
||||
*/
|
||||
|
||||
// Benchmark functor for hipMemPoolTrimTo: a fresh pool is created, trimmed to
// `min_bytes_to_hold` inside the CPU-timed section, then destroyed.
class MemPoolTrimToBenchmark : public Benchmark<MemPoolTrimToBenchmark> {
 public:
  // min_bytes_to_hold: value forwarded unchanged to hipMemPoolTrimTo.
  void operator()(const size_t min_bytes_to_hold) {
    hipMemPoolProps pool_props = CreateMemPoolProps(0, hipMemHandleTypeNone);
    hipMemPool_t mem_pool{nullptr};
    HIP_CHECK(hipMemPoolCreate(&mem_pool, &pool_props));

    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipMemPoolTrimTo(mem_pool, min_bytes_to_hold));
    }

    HIP_CHECK(hipMemPoolDestroy(mem_pool));
  }
};
|
||||
|
||||
static void RunBenchmark(const size_t min_bytes_to_hold) {
|
||||
MemPoolTrimToBenchmark benchmark;
|
||||
benchmark.AddSectionName(std::to_string(min_bytes_to_hold));
|
||||
benchmark.Run(min_bytes_to_hold);
|
||||
}
|
||||
|
||||
/**
|
||||
* @warning **MemPool APIs are not fully implemented within current version
|
||||
* of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
|
||||
* Therefore, all tests related to MemPool APIs are implemented without formal
|
||||
* verification and will be verified once HIP fully supports MemPool APIs.**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes `hipMemPoolTrimTo`:
|
||||
* -# Minimum bytes to hold:
|
||||
* - 4 KB
|
||||
* - 4 MB
|
||||
* - 16 MB
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - performance/stream/hipMemPoolTrimTo.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - Device supports memory pools
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Performance_hipMemPoolTrimTo") {
  // Thresholds are generated only after the support check passes, so an
  // unsupported device produces a single skipped run.
  if (AreMemPoolsSupported(0)) {
    const size_t min_bytes_to_hold = GENERATE(4_KB, 4_MB, 16_MB);
    RunBenchmark(min_bytes_to_hold);
    return;
  }

  HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
                         "attribute. Hence skipping the testing with Pass result.\n");
}
|
||||
@@ -0,0 +1,61 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <performance_common.hh>
|
||||
|
||||
/**
|
||||
* @addtogroup stream stream
|
||||
* @{
|
||||
* @ingroup PerformanceTest
|
||||
*/
|
||||
|
||||
// Intentionally empty stream callback: the benchmark only needs something to
// register, so none of the arguments are used.
void Callback(hipStream_t /*stream*/, hipError_t /*status*/, void* /*user_data*/) {
}
|
||||
|
||||
// Benchmark functor for hipStreamAddCallback: registers the empty Callback on a
// guard-owned stream of kind Streams::created inside the CPU-timed section.
class StreamAddCallbackBenchmark : public Benchmark<StreamAddCallbackBenchmark> {
 public:
  void operator()() {
    // The guard owns the stream for the duration of this call.
    const StreamGuard stream_guard{Streams::created};
    const hipStream_t stream = stream_guard.stream();

    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipStreamAddCallback(stream, Callback, nullptr, 0));
    }
  }
};
|
||||
|
||||
static void RunBenchmark() {
|
||||
StreamAddCallbackBenchmark benchmark;
|
||||
benchmark.Run();
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes `hipStreamAddCallback` on the created stream.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - performance/stream/hipStreamAddCallback.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Performance_hipStreamAddCallback") {
  // No parameters to vary: a single benchmark run.
  RunBenchmark();
}
|
||||
@@ -0,0 +1,269 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <performance_common.hh>
|
||||
#include <resource_guards.hh>
|
||||
|
||||
/**
|
||||
* @addtogroup stream stream
|
||||
* @{
|
||||
* @ingroup PerformanceTest
|
||||
* Contains performance tests for all hipStream related APIs
|
||||
*/
|
||||
|
||||
// Benchmark functor for hipDeviceGetStreamPriorityRange: the query is the only
// work performed and it sits inside the CPU-timed section.
class HipDeviceGetStreamPriorityRangeBenchmark
    : public Benchmark<HipDeviceGetStreamPriorityRangeBenchmark> {
 public:
  void operator()() {
    // Output slots for the query; the values are not inspected afterwards.
    int priority_min{};
    int priority_max{};

    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipDeviceGetStreamPriorityRange(&priority_min, &priority_max));
    }
  }
};
|
||||
|
||||
// Benchmark functor for hipStreamQuery.
//
// A stream is created for each invocation. When `perform_work` is true an async
// allocation is enqueued first so the query runs against a stream that has had
// work submitted. Only the hipStreamQuery call itself is inside the timed
// section.
class HipStreamQueryBenchmark : public Benchmark<HipStreamQueryBenchmark> {
 public:
  // perform_work: enqueue an async allocation on the stream before querying.
  void operator()(bool perform_work) {
    hipError_t error{hipSuccess};
    hipStream_t stream{nullptr};
    HIP_CHECK(hipStreamCreate(&stream));
    void* dptr{nullptr};

    if (perform_work) {
      HIP_CHECK(hipMallocAsync(&dptr, 2048 * 4, stream));
    }

    // HIP_CHECK is deliberately not used here: hipErrorNotReady is a legitimate
    // answer while work is still pending on the stream.
    TIMED_SECTION(kTimerTypeCpu) { error = hipStreamQuery(stream); }

    // Validate the result outside the timed region so that an unexpected
    // failure is not silently measured as if it were a successful query.
    const bool query_ok = (error == hipSuccess) || (error == hipErrorNotReady);
    REQUIRE(query_ok);

    if (perform_work) {
      HIP_CHECK(hipFreeAsync(dptr, stream));
      HIP_CHECK(hipStreamSynchronize(stream));
    }

    HIP_CHECK(hipStreamDestroy(stream));
  }
};
|
||||
|
||||
// Benchmark functor for hipStreamSynchronize on a freshly created stream with
// no work submitted. The stream is created before and destroyed after the
// CPU-timed section.
class HipStreamSynchronizeBenchmark : public Benchmark<HipStreamSynchronizeBenchmark> {
 public:
  void operator()() {
    hipStream_t stream{nullptr};
    HIP_CHECK(hipStreamCreate(&stream));

    // The result is checked (as in the other stream benchmarks) instead of
    // being stored in a variable that was never read.
    TIMED_SECTION(kTimerTypeCpu) { HIP_CHECK(hipStreamSynchronize(stream)); }

    HIP_CHECK(hipStreamDestroy(stream));
  }
};
|
||||
|
||||
// Benchmark functor for hipStreamDestroy: stream creation happens outside the
// CPU-timed section, destruction inside it.
class HipStreamDestroyBenchmark : public Benchmark<HipStreamDestroyBenchmark> {
 public:
  void operator()() {
    hipStream_t stream{};
    HIP_CHECK(hipStreamCreate(&stream));

    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipStreamDestroy(stream));
    }
  }
};
|
||||
|
||||
// Benchmark functor for hipStreamCreate: creation is inside the CPU-timed
// section, the matching destroy is outside it.
class HipStreamCreateBenchmark : public Benchmark<HipStreamCreateBenchmark> {
 public:
  void operator()() {
    hipStream_t stream{};

    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipStreamCreate(&stream));
    }

    HIP_CHECK(hipStreamDestroy(stream));
  }
};
|
||||
|
||||
// Benchmark functor for hipStreamCreateWithPriority. The priority used is the
// integer midpoint of the range reported by hipDeviceGetStreamPriorityRange;
// only the stream creation is inside the CPU-timed section.
class HipStreamCreateWithPriorityBenchmark
    : public Benchmark<HipStreamCreateWithPriorityBenchmark> {
 public:
  // flag: stream creation flag forwarded to hipStreamCreateWithPriority.
  void operator()(unsigned int flag) {
    int priority_min{};
    int priority_max{};
    HIP_CHECK(hipDeviceGetStreamPriorityRange(&priority_min, &priority_max));

    const int priority_mid = (priority_max + priority_min) / 2;
    hipStream_t stream{};

    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipStreamCreateWithPriority(&stream, flag, priority_mid));
    }

    HIP_CHECK(hipStreamDestroy(stream));
  }
};
|
||||
|
||||
|
||||
|
||||
static std::string GetStreamCreateFlagName(unsigned flag) {
|
||||
switch (flag) {
|
||||
case hipStreamDefault:
|
||||
return "hipStreamDefault";
|
||||
case hipStreamNonBlocking:
|
||||
return "hipStreamNonBlocking";
|
||||
default:
|
||||
return "flag combination";
|
||||
}
|
||||
}
|
||||
|
||||
// Benchmark functor for hipStreamCreateWithFlags: creation is inside the
// CPU-timed section, the matching destroy is outside it.
class HipStreamCreateWithFlagsBenchmark : public Benchmark<HipStreamCreateWithFlagsBenchmark> {
 public:
  // flag: stream creation flag forwarded to hipStreamCreateWithFlags.
  void operator()(unsigned int flag) {
    hipStream_t stream{};

    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipStreamCreateWithFlags(&stream, flag));
    }

    HIP_CHECK(hipStreamDestroy(stream));
  }
};
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes `hipStreamCreate`:
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - performance/stream/hipStreamBasic.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Performance_hipStreamCreate") {
  // No parameters to vary: a single benchmark run.
  HipStreamCreateBenchmark create_benchmark;
  create_benchmark.Run();
}
|
||||
|
||||
static void RunBenchmark(unsigned flag) {
|
||||
HipStreamCreateWithFlagsBenchmark benchmark;
|
||||
benchmark.AddSectionName(GetStreamCreateFlagName(flag));
|
||||
benchmark.Run(flag);
|
||||
}
|
||||
|
||||
static void RunBenchmarkWithPriority(unsigned flag) {
|
||||
HipStreamCreateWithPriorityBenchmark benchmark;
|
||||
benchmark.AddSectionName(GetStreamCreateFlagName(flag));
|
||||
benchmark.Run(flag);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes `hipStreamCreateWithFlags` with all flags:
|
||||
* -# Flags
|
||||
* - hipStreamDefault
|
||||
* - hipStreamNonBlocking
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - performance/stream/hipStreamBasic.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Performance_hipStreamCreateWithFlags") {
  // One run per generated creation flag.
  RunBenchmark(GENERATE(hipStreamDefault, hipStreamNonBlocking));
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes `hipStreamCreateWithPriority` with all flags:
|
||||
* -# Flags
|
||||
* - hipStreamDefault
|
||||
* - hipStreamNonBlocking
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - performance/stream/hipStreamBasic.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Performance_hipStreamCreateWithPriority") {
  // One run per generated creation flag.
  RunBenchmarkWithPriority(GENERATE(hipStreamDefault, hipStreamNonBlocking));
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes `hipStreamDestroy`:
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - performance/stream/hipStreamBasic.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Performance_hipStreamDestroy") {
  // No parameters to vary: a single benchmark run.
  HipStreamDestroyBenchmark destroy_benchmark;
  destroy_benchmark.Run();
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes `hipDeviceGetStreamPriorityRange`:
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - performance/stream/hipStreamBasic.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Performance_hipDeviceGetStreamPriorityRange") {
  // No parameters to vary: a single benchmark run.
  HipDeviceGetStreamPriorityRangeBenchmark priority_range_benchmark;
  priority_range_benchmark.Run();
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes `hipStreamQuery`:
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - performance/stream/hipStreamBasic.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Performance_hipStreamQuery") {
  // Two variants: query a stream with or without previously submitted work.
  const auto perform_work = GENERATE(true, false);

  HipStreamQueryBenchmark benchmark;
  benchmark.AddSectionName(perform_work ? "stream with work" : "stream without work");
  benchmark.Run(perform_work);
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes `hipStreamSynchronize`:
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - performance/stream/hipStreamBasic.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Performance_hipStreamSynchronize") {
  // No parameters to vary: a single benchmark run.
  HipStreamSynchronizeBenchmark synchronize_benchmark;
  synchronize_benchmark.Run();
}
|
||||
@@ -0,0 +1,75 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <performance_common.hh>
|
||||
|
||||
/**
|
||||
* @addtogroup stream stream
|
||||
* @{
|
||||
* @ingroup PerformanceTest
|
||||
*/
|
||||
|
||||
// Benchmark functor for hipStreamGetFlags: a stream is created with
// `expected_flag`, its flags are read back inside the CPU-timed section, and
// the stream is destroyed afterwards.
class StreamGetFlagsBenchmark : public Benchmark<StreamGetFlagsBenchmark> {
 public:
  // expected_flag: creation flag passed to hipStreamCreateWithFlags.
  void operator()(unsigned int expected_flag) {
    unsigned int returned_flags{};
    hipStream_t stream;

    HIP_CHECK(hipStreamCreateWithFlags(&stream, expected_flag));
    TIMED_SECTION(kTimerTypeCpu) {
      // Terminated with ';' like every other HIP_CHECK call, so the statement
      // does not depend on the macro expanding to a braced block.
      HIP_CHECK(hipStreamGetFlags(stream, &returned_flags));
    }
    HIP_CHECK(hipStreamDestroy(stream));
  }
};
|
||||
|
||||
static void RunBenchmark(unsigned int expected_flag) {
|
||||
StreamGetFlagsBenchmark benchmark;
|
||||
switch (expected_flag) {
|
||||
case hipStreamDefault:
|
||||
benchmark.AddSectionName("hipStreamDefault");
|
||||
break;
|
||||
case hipStreamNonBlocking:
|
||||
benchmark.AddSectionName("hipStreamNonBlocking");
|
||||
break;
|
||||
default:
|
||||
benchmark.AddSectionName("unknown flag type");
|
||||
}
|
||||
benchmark.Run(expected_flag);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes `hipStreamGetFlags`:
|
||||
* -# Flags:
|
||||
* - `hipStreamDefault`
|
||||
* - `hipStreamNonBlocking`
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - performance/stream/hipStreamGetFlags.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Performance_hipStreamGetFlags") {
  // One run per generated creation flag.
  const unsigned int expected_flag = GENERATE(hipStreamDefault, hipStreamNonBlocking);
  RunBenchmark(expected_flag);
}
|
||||
@@ -0,0 +1,74 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <performance_common.hh>
|
||||
|
||||
/**
|
||||
* @addtogroup stream stream
|
||||
* @{
|
||||
* @ingroup PerformanceTest
|
||||
*/
|
||||
|
||||
// Benchmark functor for hipStreamGetPriority: the stream is obtained from a
// StreamGuard of the requested kind and its priority is read inside the
// CPU-timed section.
class StreamGetPriorityBenchmark : public Benchmark<StreamGetPriorityBenchmark> {
 public:
  // stream_type: kind of stream the StreamGuard is constructed with.
  void operator()(Streams stream_type) {
    int priority{};

    // The guard owns the stream for the duration of this call.
    const StreamGuard stream_guard{stream_type};
    const hipStream_t stream = stream_guard.stream();

    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipStreamGetPriority(stream, &priority));
    }
  }
};
|
||||
|
||||
// Runs the get-priority benchmark for one stream kind. The section name is
// "null stream", "created", or "per thread stream" for any other value.
static void RunBenchmark(Streams stream_type) {
  const char* section_name = "per thread stream";
  if (stream_type == Streams::nullstream) {
    section_name = "null stream";
  } else if (stream_type == Streams::created) {
    section_name = "created";
  }

  StreamGetPriorityBenchmark benchmark;
  benchmark.AddSectionName(section_name);
  benchmark.Run(stream_type);
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes `hipStreamGetPriority`:
|
||||
* -# Stream types:
|
||||
* - `null`
|
||||
* - created
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - performance/stream/hipStreamGetPriority.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Performance_hipStreamGetPriority") {
  // One run per generated stream kind.
  const Streams stream_type = GENERATE(Streams::nullstream, Streams::created);
  RunBenchmark(stream_type);
}
|
||||
@@ -0,0 +1,80 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <performance_common.hh>
|
||||
|
||||
/**
|
||||
* @addtogroup stream stream
|
||||
* @{
|
||||
* @ingroup PerformanceTest
|
||||
*/
|
||||
|
||||
// Benchmark functor for hipStreamWaitEvent: an event is created and recorded on
// the stream outside the timed region; the CPU-timed section covers the wait
// call followed by a stream synchronize. The event is destroyed afterwards.
class StreamWaitEventBenchmark : public Benchmark<StreamWaitEventBenchmark> {
 public:
  // stream_type: kind of stream the StreamGuard is constructed with.
  void operator()(Streams stream_type) {
    // The guard owns the stream for the duration of this call.
    const StreamGuard stream_guard{stream_type};
    const hipStream_t stream = stream_guard.stream();

    hipEvent_t wait_event{nullptr};
    HIP_CHECK(hipEventCreate(&wait_event));
    REQUIRE(wait_event != nullptr);
    HIP_CHECK(hipEventRecord(wait_event, stream));

    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipStreamWaitEvent(stream, wait_event, 0));
      HIP_CHECK(hipStreamSynchronize(stream));
    }

    HIP_CHECK(hipEventDestroy(wait_event));
  }
};
|
||||
|
||||
/**
 * Labels a StreamWaitEventBenchmark with the stream kind and runs it.
 *
 * @param stream_type stream kind to benchmark; any value other than `nullstream` or
 *                    `created` is labelled "per thread stream".
 */
static void RunBenchmark(Streams stream_type) {
  StreamWaitEventBenchmark benchmark{};

  if (stream_type == Streams::nullstream) {
    benchmark.AddSectionName("null stream");
  } else if (stream_type == Streams::created) {
    benchmark.AddSectionName("created");
  } else {
    benchmark.AddSectionName("per thread stream");
  }

  benchmark.Run(stream_type);
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes `hipStreamWaitEvent`:
|
||||
* -# Stream types:
|
||||
* - `null`
|
||||
* - created
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - performance/stream/hipStreamWaitEvent.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Performance_hipStreamWaitEvent") {
  // One benchmark run per generated stream kind (null stream, created stream).
  const Streams stream_kind = GENERATE(Streams::nullstream, Streams::created);
  RunBenchmark(stream_kind);
}
|
||||
@@ -0,0 +1,172 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <performance_common.hh>
|
||||
|
||||
/**
|
||||
* @addtogroup stream stream
|
||||
* @{
|
||||
* @ingroup PerformanceTest
|
||||
*/
|
||||
|
||||
/**
 * Queries whether the given device reports support for stream wait-value operations.
 *
 * @param device_id device to query.
 * @return the attribute value written by the runtime (0 if nothing was written).
 */
static int IsStreamWaitValueSupported(int device_id) {
  int supported = 0;
#if HT_AMD
  HIP_CHECK(
      hipDeviceGetAttribute(&supported, hipDeviceAttributeCanUseStreamWaitValue, device_id));
#else
  // NOTE(review): the status returned by cuDeviceGetAttribute is not checked here.
  cuDeviceGetAttribute(&supported, CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS, device_id);
#endif
  return supported;
}
|
||||
|
||||
/**
 * Benchmark functor that times the host-side call to `hipStreamWaitValue32` on a
 * created stream.
 */
class StreamWaitValue32Benchmark : public Benchmark<StreamWaitValue32Benchmark> {
 public:
  /**
   * Executes one measured iteration.
   *
   * @param array_size number of uint32_t elements to allocate on the device.
   * @param flag       wait condition passed to `hipStreamWaitValue32`.
   */
  void operator()(const size_t array_size, unsigned int flag) {
    const StreamGuard guard{Streams::created};
    const hipStream_t stream = guard.stream();

    // The AND condition is exercised with 1; every other flag uses 0.
    const uint32_t wait_value = (flag == hipStreamWaitValueAnd) ? 1 : 0;
    const size_t alloc_bytes = sizeof(uint32_t) * array_size;

    // Untimed setup.
    // NOTE(review): hipMemset presumably fills per byte, so with wait_value == 1 each
    // element would hold 0x01010101 rather than 1 - confirm this is intended.
    uint32_t* device_values;
    HIP_CHECK(hipMalloc(&device_values, alloc_bytes));
    HIP_CHECK(hipMemset(device_values, wait_value, alloc_bytes));

    // Timed region (CPU timer): only the wait-value call itself.
    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipStreamWaitValue32(stream, device_values, wait_value, flag));
    }

    HIP_CHECK(hipFree(device_values));
  }
};
|
||||
|
||||
/**
 * Benchmark functor that times the host-side call to `hipStreamWaitValue64` on a
 * created stream.
 */
class StreamWaitValue64Benchmark : public Benchmark<StreamWaitValue64Benchmark> {
 public:
  /**
   * Executes one measured iteration.
   *
   * @param array_size number of uint64_t elements to allocate on the device.
   * @param flag       wait condition passed to `hipStreamWaitValue64`.
   */
  void operator()(const size_t array_size, unsigned int flag) {
    const StreamGuard guard{Streams::created};
    const hipStream_t stream = guard.stream();

    // The AND condition is exercised with 1; every other flag uses 0.
    const uint64_t wait_value = (flag == hipStreamWaitValueAnd) ? 1 : 0;
    const size_t alloc_bytes = sizeof(uint64_t) * array_size;

    // Untimed setup.
    // NOTE(review): hipMemset presumably fills per byte, so with wait_value == 1 each
    // element would hold 0x0101...01 rather than 1 - confirm this is intended.
    uint64_t* device_values;
    HIP_CHECK(hipMalloc(&device_values, alloc_bytes));
    HIP_CHECK(hipMemset(device_values, wait_value, alloc_bytes));

    // Timed region (CPU timer): only the wait-value call itself.
    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipStreamWaitValue64(stream, device_values, wait_value, flag));
    }

    HIP_CHECK(hipFree(device_values));
  }
};
|
||||
|
||||
template <typename WaitValueBenchmark>
|
||||
static void RunBenchmark(const size_t array_size, unsigned int flag) {
|
||||
WaitValueBenchmark benchmark;
|
||||
benchmark.AddSectionName(std::to_string(array_size));
|
||||
switch (flag) {
|
||||
case hipStreamWaitValueGte:
|
||||
benchmark.AddSectionName("greater than or equal");
|
||||
break;
|
||||
case hipStreamWaitValueEq:
|
||||
benchmark.AddSectionName("equal");
|
||||
break;
|
||||
case hipStreamWaitValueAnd:
|
||||
benchmark.AddSectionName("logical and");
|
||||
break;
|
||||
case hipStreamWaitValueNor:
|
||||
benchmark.AddSectionName("logical nor");
|
||||
break;
|
||||
default:
|
||||
benchmark.AddSectionName("unknown flag");
|
||||
}
|
||||
benchmark.Run(array_size, flag);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes `hipStreamWaitValue32` for different array sizes:
|
||||
* -# 4 KB
|
||||
* -# 4 MB
|
||||
* -# 16 MB
|
||||
* - Uses different flag types for wait criteria:
|
||||
* -# Greater than or equal
|
||||
* -# Equal
|
||||
* -# Logical AND
|
||||
* -# Logical NOR
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - performance/stream/hipStreamWaitValue.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - Device supports Stream Wait Value operations
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Performance_hipStreamWaitValue32") {
  // The whole body is compiled only for the AMD platform.
#if HT_AMD
  const bool supported = IsStreamWaitValueSupported(0) != 0;
  if (!supported) {
    HipTest::HIP_SKIP_TEST(
        "GPU 0 doesn't support hipStreamWaitValue32() function. "
        "Hence skipping the testing with Pass result.\n");
    return;
  }

  // Every array size is combined with every wait condition.
  const size_t array_size = GENERATE(4_KB, 4_MB, 16_MB);
  const unsigned int flag = GENERATE(hipStreamWaitValueGte, hipStreamWaitValueEq,
                                     hipStreamWaitValueAnd, hipStreamWaitValueNor);
  RunBenchmark<StreamWaitValue32Benchmark>(array_size, flag);
#endif
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes `hipStreamWaitValue64`:
|
||||
* -# Allocation size:
|
||||
* - 4 KB
|
||||
* - 4 MB
|
||||
* - 16 MB
|
||||
* -# Wait type:
|
||||
* - Greater than or equal
|
||||
* - Equal
|
||||
* - Logical AND
|
||||
* - Logical NOR
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - performance/stream/hipStreamWaitValue.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - Device supports Stream Wait Value operations
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Performance_hipStreamWaitValue64") {
  const bool supported = IsStreamWaitValueSupported(0) != 0;
  if (!supported) {
    HipTest::HIP_SKIP_TEST(
        "GPU 0 doesn't support hipStreamWaitValue64() function. "
        "Hence skipping the testing with Pass result.\n");
    return;
  }

  // Every array size is combined with every wait condition.
  const size_t array_size = GENERATE(4_KB, 4_MB, 16_MB);
  const unsigned int flag = GENERATE(hipStreamWaitValueGte, hipStreamWaitValueEq,
                                     hipStreamWaitValueAnd, hipStreamWaitValueNor);
  RunBenchmark<StreamWaitValue64Benchmark>(array_size, flag);
}
|
||||
@@ -0,0 +1,123 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <performance_common.hh>
|
||||
|
||||
/**
|
||||
* @addtogroup stream stream
|
||||
* @{
|
||||
* @ingroup PerformanceTest
|
||||
*/
|
||||
|
||||
#if HT_NVIDIA
/**
 * Queries the 64-bit stream memory-operations attribute of the given device.
 * Only compiled for the NVIDIA platform.
 *
 * @param device_id device to query.
 * @return the attribute value written by the driver (0 if nothing was written).
 */
static int IsStreamWriteValueSupported(int device_id) {
  int supported = 0;
  // NOTE(review): the status returned by cuDeviceGetAttribute is not checked here.
  cuDeviceGetAttribute(&supported, CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS, device_id);
  return supported;
}
#endif
|
||||
|
||||
/**
 * Benchmark functor that times the host-side call to `hipStreamWriteValue32` on a
 * created stream.
 */
class StreamWriteValue32Benchmark : public Benchmark<StreamWriteValue32Benchmark> {
 public:
  /**
   * Executes one measured iteration.
   *
   * @param array_size number of uint32_t elements to allocate on the device.
   */
  void operator()(const size_t array_size) {
    const StreamGuard guard{Streams::created};
    const hipStream_t stream = guard.stream();

    const uint32_t written_value = 0;
    const size_t alloc_bytes = sizeof(uint32_t) * array_size;

    // Untimed setup: allocate and clear the destination buffer.
    uint32_t* device_values;
    HIP_CHECK(hipMalloc(&device_values, alloc_bytes));
    HIP_CHECK(hipMemset(device_values, written_value, alloc_bytes));

    // Timed region (CPU timer): only the write-value call itself.
    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipStreamWriteValue32(stream, device_values, written_value, 0));
    }

    HIP_CHECK(hipFree(device_values));
  }
};
|
||||
|
||||
/**
 * Benchmark functor that times the host-side call to `hipStreamWriteValue64` on a
 * created stream.
 */
class StreamWriteValue64Benchmark : public Benchmark<StreamWriteValue64Benchmark> {
 public:
  /**
   * Executes one measured iteration.
   *
   * @param array_size number of uint64_t elements to allocate on the device.
   */
  void operator()(const size_t array_size) {
    const StreamGuard guard{Streams::created};
    const hipStream_t stream = guard.stream();

    const uint64_t written_value = 0;
    const size_t alloc_bytes = sizeof(uint64_t) * array_size;

    // Untimed setup: allocate and clear the destination buffer.
    uint64_t* device_values;
    HIP_CHECK(hipMalloc(&device_values, alloc_bytes));
    HIP_CHECK(hipMemset(device_values, written_value, alloc_bytes));

    // Timed region (CPU timer): only the write-value call itself.
    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipStreamWriteValue64(stream, device_values, written_value, 0));
    }

    HIP_CHECK(hipFree(device_values));
  }
};
|
||||
|
||||
/**
 * Labels a write-value benchmark with the array size and runs it.
 *
 * @tparam WriteValueBenchmark benchmark functor type to instantiate.
 * @param array_size number of elements forwarded to the benchmark.
 */
template <typename WriteValueBenchmark>
static void RunBenchmark(const size_t array_size) {
  WriteValueBenchmark benchmark;
  // The element count doubles as the section label in the report.
  benchmark.AddSectionName(std::to_string(array_size));
  benchmark.Run(array_size);
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes `hipStreamWriteValue32`:
|
||||
* -# Allocation size:
|
||||
* - 4 KB
|
||||
* - 4 MB
|
||||
* - 16 MB
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - performance/stream/hipStreamWriteValue.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Performance_hipStreamWriteValue32") {
  // The whole body is compiled only for the AMD platform.
#if HT_AMD
  const size_t array_size = GENERATE(4_KB, 4_MB, 16_MB);
  RunBenchmark<StreamWriteValue32Benchmark>(array_size);
#endif
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes `hipStreamWriteValue64`:
|
||||
* -# Allocation size:
|
||||
* - 4 KB
|
||||
* - 4 MB
|
||||
* - 16 MB
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - performance/stream/hipStreamWriteValue.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Performance_hipStreamWriteValue64") {
  // The capability check is compiled only for the NVIDIA platform.
#if HT_NVIDIA
  const bool supported = IsStreamWriteValueSupported(0) != 0;
  if (!supported) {
    HipTest::HIP_SKIP_TEST(
        "GPU 0 doesn't support hipStreamWriteValue64() function. "
        "Hence skipping the testing with Pass result.\n");
    return;
  }
#endif
  const size_t array_size = GENERATE(4_KB, 4_MB, 16_MB);
  RunBenchmark<StreamWriteValue64Benchmark>(array_size);
}
|
||||
@@ -0,0 +1,74 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <performance_common.hh>
|
||||
|
||||
#if __linux__
|
||||
static const hipMemAllocationHandleType kHandleType = hipMemHandleTypePosixFileDescriptor;
|
||||
#else
|
||||
static const hipMemAllocationHandleType kHandleType = hipMemHandleTypeWin32;
|
||||
#endif
|
||||
|
||||
/**
 * Queries whether the given device reports support for memory pools.
 *
 * @param device_id device to query.
 * @return value of `hipDeviceAttributeMemoryPoolsSupported` for that device
 *         (non-zero when supported).
 */
static int AreMemPoolsSupported(int device_id) {
  int mem_pools_supported = 0;
  // Query the requested device; the previous code always queried device 0 and
  // silently ignored `device_id`.
  HIP_CHECK(hipDeviceGetAttribute(&mem_pools_supported, hipDeviceAttributeMemoryPoolsSupported,
                                  device_id));
  return mem_pools_supported;
}
|
||||
|
||||
/**
 * Builds memory pool properties for a pinned allocation located on a device.
 *
 * @param device_id   id stored in the location entry of the properties.
 * @param handle_type handle type stored in the properties.
 * @return the initialized properties, by value.
 */
static hipMemPoolProps CreateMemPoolProps(const int device_id,
                                          const hipMemAllocationHandleType handle_type) {
  // Positional aggregate initialization; members not listed are zero-initialized.
  hipMemPoolProps props = {hipMemAllocationTypePinned,
                           handle_type,
                           {hipMemLocationTypeDevice, device_id},
                           nullptr,
                           {0}};
  return props;
}
|
||||
|
||||
static std::string GetMemPoolAttrSectionName(const hipMemPoolAttr attribute) {
|
||||
switch (attribute) {
|
||||
case hipMemPoolReuseFollowEventDependencies:
|
||||
return "ReuseFollowEventDependencies";
|
||||
case hipMemPoolReuseAllowOpportunistic:
|
||||
return "ReuseAllowOpportunistic";
|
||||
case hipMemPoolReuseAllowInternalDependencies:
|
||||
return "ReuseAllowInternalDependencies";
|
||||
case hipMemPoolAttrReleaseThreshold:
|
||||
return "AttrReleaseThreshold";
|
||||
case hipMemPoolAttrReservedMemCurrent:
|
||||
return "AttrReservedMemCurrent";
|
||||
case hipMemPoolAttrReservedMemHigh:
|
||||
return "AttrReservedMemHigh";
|
||||
case hipMemPoolAttrUsedMemCurrent:
|
||||
return "AttrUsedMemCurrent";
|
||||
case hipMemPoolAttrUsedMemHigh:
|
||||
return "AttrUsedMemHigh";
|
||||
default:
|
||||
return "unknown attribute";
|
||||
}
|
||||
}
|
||||
@@ -36,11 +36,14 @@ add_subdirectory(compiler)
|
||||
add_subdirectory(errorHandling)
|
||||
add_subdirectory(cooperativeGrps)
|
||||
add_subdirectory(context)
|
||||
add_subdirectory(warp)
|
||||
add_subdirectory(dynamicLoading)
|
||||
add_subdirectory(g++)
|
||||
add_subdirectory(module)
|
||||
add_subdirectory(channelDescriptor)
|
||||
add_subdirectory(executionControl)
|
||||
add_subdirectory(vector_types)
|
||||
add_subdirectory(atomics)
|
||||
add_subdirectory(p2p)
|
||||
add_subdirectory(gcc)
|
||||
|
||||
@@ -49,5 +52,5 @@ add_subdirectory(callback)
|
||||
add_subdirectory(clock)
|
||||
# Vulkan interop APIs currently undefined for Nvidia
|
||||
add_subdirectory(vulkan_interop)
|
||||
add_subdirectory(gl_interop) # Disabled on NVIDIA due to defect - EXSWHTEC-246
|
||||
endif()
|
||||
add_subdirectory(vector_types)
|
||||
|
||||
@@ -0,0 +1,48 @@
|
||||
# Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
# Sources compiled into the AtomicsTest executable.
set(TEST_SRC
    atomicExch.cc
    atomicExch_system.cc
)

if(HIP_PLATFORM MATCHES "nvidia")
  # atomicExch_system.cc gets relocatable device code and explicit sm_60/sm_70/sm_80
  # code generation; the same flags are repeated on the link line next to nvrtc.
  set_source_files_properties(atomicExch_system.cc PROPERTIES COMPILE_FLAGS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
  hip_add_exe_to_target(NAME AtomicsTest
                        TEST_SRC ${TEST_SRC}
                        TEST_TARGET_NAME build_tests
                        LINKER_LIBS "nvrtc -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
elseif(HIP_PLATFORM MATCHES "amd")
  # The AMD build only needs hiprtc on the link line.
  hip_add_exe_to_target(NAME AtomicsTest
                        TEST_SRC ${TEST_SRC}
                        TEST_TARGET_NAME build_tests
                        LINKER_LIBS hiprtc)
endif()
|
||||
|
||||
# SWDEV-435667: Below 2 tests failed in stress test on 01/12/23
|
||||
#add_test(NAME Unit_atomicExch_Negative_Parameters
|
||||
# COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../compileAndCaptureOutput.py
|
||||
# ${CMAKE_CURRENT_SOURCE_DIR} ${HIP_PLATFORM} ${HIP_PATH}
|
||||
# atomicExch_negative_kernels.cc 40)
|
||||
#
|
||||
#add_test(NAME Unit_atomicExch_system_Negative_Parameters
|
||||
# COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../compileAndCaptureOutput.py
|
||||
# ${CMAKE_CURRENT_SOURCE_DIR} ${HIP_PLATFORM} ${HIP_PATH}
|
||||
# atomicExch_system_negative_kernels.cc 40)
|
||||
@@ -0,0 +1,213 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "atomicExch_common.hh"
|
||||
#include "atomicExch_negative_kernels_rtc.hh"
|
||||
|
||||
/**
|
||||
* @addtogroup atomicExch atomicExch
|
||||
* @{
|
||||
* @ingroup AtomicsTest
|
||||
*/
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes a kernel wherein all threads will perform an atomic exchange in the same(compile
|
||||
* time deducible) memory location. Each thread will exchange its own grid wide linear index + 1
|
||||
* into the memory location, storing the return value into a separate output array slot
|
||||
* corresponding to it. Once complete, the union of output array and exchange memory is validated to
|
||||
* contain all values in the range [0, number_of_threads].
|
||||
*
|
||||
* - The test is run for:
|
||||
* - All overloads of atomicExch
|
||||
* - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated exchange memory
|
||||
* - Exchange memory located in shared memory
|
||||
* - Several grid and block dimension combinations(only one block is used for shared memory)
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - unit/atomics/atomicExch.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
#if HT_NVIDIA
|
||||
TEMPLATE_TEST_CASE("Unit_atomicExch_Positive_Same_Address_Compile_Time", "", int, unsigned int,
|
||||
unsigned long long, float) {
|
||||
#else
|
||||
TEMPLATE_TEST_CASE("Unit_atomicExch_Positive_Same_Address_Compile_Time", "", int, unsigned int,
|
||||
unsigned long, unsigned long long, float, double) {
|
||||
#endif // HT_NVIDIA
|
||||
for (auto current = 0; current < cmd_options.iterations; ++current) {
|
||||
AtomicExchSameAddressTest<TestType, AtomicScopes::device>();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes a single kernel on a single device wherein all threads will perform an atomic
|
||||
* exchange into a runtime determined memory location. Each thread will exchange its own grid wide
|
||||
* linear index + offset into the memory location, storing the return value into a separate output
|
||||
* array slot corresponding to it. Once complete, the union of output array and exchange memory is
|
||||
* validated to contain all values in the range [0, number_of_threads +
|
||||
* number_of_exchange_memory_slots). Several memory access patterns are tested:
|
||||
* -# All threads exchange to a single memory location
|
||||
* -# Each thread exchanges into an array containing warp_size elements, using tid % warp_size
|
||||
* for indexing
|
||||
* -# Same as the above, but the exchange elements are spread out by L1 cache line size bytes.
|
||||
*
|
||||
* - The test is run for:
|
||||
* - All overloads of atomicExch
|
||||
* - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated exchange memory
|
||||
* - Exchange memory located in shared memory
|
||||
* - Several grid and block dimension combinations(only one block is used for shared memory)
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - unit/atomics/atomicExch.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
#if HT_NVIDIA
|
||||
TEMPLATE_TEST_CASE("Unit_atomicExch_Positive", "", int, unsigned int,
|
||||
unsigned long long, float) {
|
||||
#else
|
||||
TEMPLATE_TEST_CASE("Unit_atomicExch_Positive", "", int, unsigned int, unsigned long,
|
||||
unsigned long long, float, double) {
|
||||
#endif // HT_NVIDIA
|
||||
int warp_size = 0;
|
||||
HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
|
||||
const auto cache_line_size = 128u;
|
||||
|
||||
for (auto current = 0; current < cmd_options.iterations; ++current) {
|
||||
DYNAMIC_SECTION("Same address " << current) {
|
||||
AtomicExchSingleDeviceSingleKernelTest<TestType, AtomicScopes::device>(1, sizeof(TestType));
|
||||
}
|
||||
|
||||
DYNAMIC_SECTION("Adjacent addresses " << current) {
|
||||
AtomicExchSingleDeviceSingleKernelTest<TestType, AtomicScopes::device>(warp_size,
|
||||
sizeof(TestType));
|
||||
}
|
||||
|
||||
DYNAMIC_SECTION("Scattered addresses " << current) {
|
||||
AtomicExchSingleDeviceSingleKernelTest<TestType, AtomicScopes::device>(warp_size,
|
||||
cache_line_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes a kernel two times concurrently on a single device wherein all threads will perform
|
||||
* an atomic exchange into a runtime determined memory location. Each thread will exchange its own
|
||||
* grid wide linear index + offset into the memory location, storing the return value into a
|
||||
* separate output array slot corresponding to it. Once complete, the union of output array and
|
||||
* exchange memory is validated to contain all values in the range [0, number_of_threads +
|
||||
* number_of_exchange_memory_slots). Several memory access patterns are tested:
|
||||
* -# All threads exchange to a single memory location
|
||||
* -# Each thread exchanges into an array containing warp_size elements, using tid % warp_size
|
||||
* for indexing
|
||||
* -# Same as the above, but the exchange elements are spread out by L1 cache line size bytes.
|
||||
*
|
||||
* - The test is run for:
|
||||
* - All overloads of atomicExch
|
||||
* - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated exchange memory
|
||||
* - Several grid and block dimension combinations
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - unit/atomics/atomicExch.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
#if HT_NVIDIA
|
||||
TEMPLATE_TEST_CASE("Unit_atomicExch_Positive_Multi_Kernel", "", int, unsigned int,
|
||||
unsigned long long, float) {
|
||||
#else
|
||||
TEMPLATE_TEST_CASE("Unit_atomicExch_Positive_Multi_Kernel", "", int, unsigned int, unsigned long,
|
||||
unsigned long long, float, double) {
|
||||
#endif // HT_NVIDIA
|
||||
int warp_size = 0;
|
||||
HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
|
||||
const auto cache_line_size = 128u;
|
||||
|
||||
for (auto current = 0; current < cmd_options.iterations; ++current) {
|
||||
DYNAMIC_SECTION("Same address " << current) {
|
||||
AtomicExchSingleDeviceMultipleKernelTest<TestType, AtomicScopes::device>(2, 1,
|
||||
sizeof(TestType));
|
||||
}
|
||||
|
||||
DYNAMIC_SECTION("Adjacent addresses " << current) {
|
||||
AtomicExchSingleDeviceMultipleKernelTest<TestType, AtomicScopes::device>(2, warp_size,
|
||||
sizeof(TestType));
|
||||
}
|
||||
|
||||
DYNAMIC_SECTION("Scattered addresses " << current) {
|
||||
AtomicExchSingleDeviceMultipleKernelTest<TestType, AtomicScopes::device>(2, warp_size,
|
||||
cache_line_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - RTCs kernels that pass combinations of arguments of invalid types for all overloads of
|
||||
* atomicExch
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - unit/atomics/atomicExch.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Unit_atomicExch_Negative_Parameters_RTC") {
  hiprtcProgram program{};

  // One run per kernel source string.
  const auto program_source = GENERATE(kAtomicExchInt, kAtomicExchUnsignedInt, kAtomicExchULL,
                                       kAtomicExchFloat, kAtomicExchDouble);
  HIPRTC_CHECK(
      hiprtcCreateProgram(&program, program_source, "atomicExch_negative.cc", 0, nullptr, nullptr));
  // Compilation is expected to fail; the result is verified after the log is read.
  hiprtcResult compile_result{hiprtcCompileProgram(program, 0, nullptr)};

  // Fetch the compile log.
  size_t log_length{};
  HIPRTC_CHECK(hiprtcGetProgramLogSize(program, &log_length));
  std::string compile_log(log_length, ' ');
  HIPRTC_CHECK(hiprtcGetProgramLog(program, compile_log.data()));

  // Count occurrences of the error marker in the log.
  const std::string marker{"error:"};
  const int expected_error_count{8};
  int error_count{0};
  for (size_t pos = compile_log.find(marker, 0); pos != std::string::npos;
       pos = compile_log.find(marker, pos + 1)) {
    ++error_count;
  }

  HIPRTC_CHECK(hiprtcDestroyProgram(&program));
  HIPRTC_CHECK_ERROR(compile_result, HIPRTC_ERROR_COMPILATION);
  REQUIRE(error_count == expected_error_count);
}
|
||||
@@ -0,0 +1,381 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <numeric>
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <resource_guards.hh>
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
#include <cmd_options.hh>
|
||||
|
||||
// Selects which atomic exchange variant the test kernels call.
enum class AtomicScopes {
  device,  // kernels call atomicExch
  system   // kernels call atomicExch_system
};
|
||||
|
||||
/**
 * Performs an atomic exchange with the variant selected at compile time.
 *
 * @tparam T     value type being exchanged.
 * @tparam scope AtomicScopes::device calls atomicExch, AtomicScopes::system calls
 *               atomicExch_system.
 * @param address location to exchange into.
 * @param val     value to store.
 * @return the value returned by the selected atomic exchange call.
 */
template <typename T, AtomicScopes scope> __device__ T perform_atomic_exch(T* address, T val) {
  if constexpr (scope == AtomicScopes::device) {
    return atomicExch(address, val);
  } else {
    // Previously this was a plain runtime `if` with no else, which left a path that
    // fell off the end of a non-void function. Every instantiation now returns, and an
    // unsupported scope is rejected at compile time.
    static_assert(scope == AtomicScopes::system, "Unsupported atomic scope");
    return atomicExch_system(address, val);
  }
}
|
||||
|
||||
template <typename T, bool use_shared_mem, AtomicScopes scope>
|
||||
__global__ void atomic_exch_kernel_compile_time(T* const global_mem, T* const old_vals) {
|
||||
__shared__ T shared_mem;
|
||||
|
||||
const auto tid = cooperative_groups::this_grid().thread_rank();
|
||||
|
||||
T* const mem = use_shared_mem ? &shared_mem : global_mem;
|
||||
|
||||
if constexpr (use_shared_mem) {
|
||||
if (tid == 0) mem[0] = global_mem[0];
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
old_vals[tid] = perform_atomic_exch<T, scope>(mem, static_cast<T>(tid + 1));
|
||||
|
||||
if constexpr (use_shared_mem) {
|
||||
__syncthreads();
|
||||
if (tid == 0) global_mem[0] = mem[0];
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Returns the address of element `idx` in an array whose elements start `pitch` bytes apart.
 *
 * @param ptr   base address of the array.
 * @param pitch distance in bytes between consecutive elements.
 * @param idx   element index.
 * @return `ptr` advanced by `idx * pitch` bytes, reinterpreted as `T*`.
 */
template <typename T>
__host__ __device__ T* pitched_offset(T* const ptr, const unsigned int pitch,
                                      const unsigned int idx) {
  uint8_t* const base = reinterpret_cast<uint8_t*>(ptr);
  // The offset is computed in unsigned int arithmetic, exactly as the operands are declared.
  const auto byte_offset = idx * pitch;
  return reinterpret_cast<T*>(base + byte_offset);
}
|
||||
|
||||
template <typename T, bool use_shared_mem, AtomicScopes scope>
|
||||
__global__ void atomic_exch_kernel(T* const global_mem, T* const old_vals, const unsigned int width,
|
||||
const unsigned pitch, const T base_val = 0) {
|
||||
extern __shared__ uint8_t shared_mem[];
|
||||
|
||||
const auto tid = cooperative_groups::this_grid().thread_rank();
|
||||
|
||||
T* const mem = use_shared_mem ? reinterpret_cast<T*>(shared_mem) : global_mem;
|
||||
|
||||
if constexpr (use_shared_mem) {
|
||||
if (tid < width) {
|
||||
const auto target = pitched_offset(mem, pitch, tid);
|
||||
*target = *pitched_offset(global_mem, pitch, tid);
|
||||
};
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
old_vals[tid] = perform_atomic_exch<T, scope>(pitched_offset(mem, pitch, tid % width),
|
||||
base_val + static_cast<T>(tid + width));
|
||||
|
||||
if constexpr (use_shared_mem) {
|
||||
__syncthreads();
|
||||
if (tid < width) {
|
||||
const auto target = pitched_offset(global_mem, pitch, tid);
|
||||
*target = *pitched_offset(mem, pitch, tid);
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename TestType, bool use_shared_mem, AtomicScopes scope>
|
||||
void AtomicExchSameAddress(const dim3 blocks, const dim3 threads, const LinearAllocs alloc_type) {
|
||||
LinearAllocGuard<TestType> mem_dev(alloc_type, sizeof(TestType));
|
||||
|
||||
const auto thread_count = blocks.x * blocks.y * blocks.z * threads.x * threads.y * threads.z;
|
||||
const auto old_vals_alloc_size = thread_count * sizeof(TestType);
|
||||
LinearAllocGuard<TestType> old_vals_dev(LinearAllocs::hipMalloc, old_vals_alloc_size);
|
||||
std::vector<TestType> old_vals(thread_count + 1);
|
||||
|
||||
|
||||
HIP_CHECK(hipMemset(mem_dev.ptr(), 0, sizeof(TestType)));
|
||||
atomic_exch_kernel_compile_time<TestType, use_shared_mem, scope>
|
||||
<<<blocks, threads>>>(mem_dev.ptr(), old_vals_dev.ptr());
|
||||
HIP_CHECK(
|
||||
hipMemcpy(old_vals.data(), old_vals_dev.ptr(), old_vals_alloc_size, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipMemcpy(old_vals.data() + thread_count, mem_dev.ptr(), sizeof(TestType),
|
||||
hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
|
||||
// Every thread will exchange its grid-wide linear id into a target location within mem_dev,
|
||||
// receiving back the value previously present therein. This previous value is written to
|
||||
// old_vals_dev.
|
||||
// old_vals_dev will not contain values that the final scheduled warp exchanged into mem_dev, but
|
||||
// mem_dev obviously will.
|
||||
// Given that mem_dev initially contains values in the range [0, width) and that the maximum value
|
||||
// the final thread shall write is thread_count + width - 1, presuming correct operation of
|
||||
// atomicExch, the union of mem_dev and old_vals_dev shall contain values in the range
|
||||
//[0, thread_count + width)
|
||||
std::sort(old_vals.begin(), old_vals.end());
|
||||
for (auto i = 0u; i < old_vals.size(); ++i) {
|
||||
REQUIRE(i == old_vals[i]);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename TestType, AtomicScopes scope> void AtomicExchSameAddressTest() {
|
||||
const auto threads = GENERATE(dim3(1024), dim3(1023), dim3(511), dim3(17), dim3(31));
|
||||
|
||||
SECTION("Global memory") {
|
||||
const auto blocks = GENERATE(dim3(20));
|
||||
using LA = LinearAllocs;
|
||||
const auto allocation_type =
|
||||
GENERATE(LA::hipMalloc, LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister);
|
||||
AtomicExchSameAddress<TestType, false, AtomicScopes::device>(blocks, threads, allocation_type);
|
||||
}
|
||||
|
||||
SECTION("Shared memory") {
|
||||
const auto blocks = dim3(1);
|
||||
AtomicExchSameAddress<TestType, true, AtomicScopes::device>(blocks, threads,
|
||||
LinearAllocs::hipMalloc);
|
||||
}
|
||||
}
|
||||
|
||||
struct AtomicExchParams {
|
||||
dim3 blocks;
|
||||
dim3 threads;
|
||||
unsigned int num_devices = 1u;
|
||||
unsigned int kernel_count = 1u;
|
||||
unsigned int width = 1u;
|
||||
unsigned int pitch = 0u;
|
||||
unsigned int host_thread_count = 0u;
|
||||
LinearAllocs alloc_type;
|
||||
};
|
||||
|
||||
|
||||
template <typename Derived, typename T, bool use_shared_mem, AtomicScopes scope>
|
||||
class AtomicExchCRTP {
|
||||
public:
|
||||
void run(const AtomicExchParams& p) const {
|
||||
const auto thread_count =
|
||||
p.blocks.x * p.blocks.y * p.blocks.z * p.threads.x * p.threads.y * p.threads.z;
|
||||
|
||||
const auto old_vals_alloc_size = p.kernel_count * thread_count * sizeof(T);
|
||||
std::vector<LinearAllocGuard<T>> old_vals_devs;
|
||||
std::vector<StreamGuard> streams;
|
||||
for (auto i = 0; i < p.num_devices; ++i) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
old_vals_devs.emplace_back(LinearAllocs::hipMalloc, old_vals_alloc_size);
|
||||
for (auto j = 0; j < p.kernel_count; ++j) {
|
||||
streams.emplace_back(Streams::created);
|
||||
}
|
||||
}
|
||||
|
||||
const auto mem_alloc_size = p.width * p.pitch;
|
||||
LinearAllocGuard<T> mem_dev(p.alloc_type, mem_alloc_size);
|
||||
|
||||
const auto host_iters_per_thread =
|
||||
std::max(p.num_devices * p.kernel_count * thread_count / 20, p.width);
|
||||
|
||||
std::vector<T> old_vals(p.num_devices * p.kernel_count * thread_count + p.width +
|
||||
p.host_thread_count * host_iters_per_thread);
|
||||
std::iota(old_vals.begin(), old_vals.begin() + p.width, 0);
|
||||
|
||||
HIP_CHECK(hipMemcpy2D(mem_dev.ptr(), p.pitch, old_vals.data(), sizeof(T), sizeof(T), p.width,
|
||||
hipMemcpyHostToDevice));
|
||||
|
||||
const auto shared_mem_size = use_shared_mem ? mem_alloc_size : 0u;
|
||||
for (auto i = 0u; i < p.num_devices; ++i) {
|
||||
const auto device_offset = i * p.kernel_count * thread_count;
|
||||
for (auto j = 0u; j < p.kernel_count; ++j) {
|
||||
const auto& stream = streams[i * p.kernel_count + j].stream();
|
||||
const auto kern_offset = j * thread_count;
|
||||
const auto old_vals = old_vals_devs[i].ptr() + kern_offset;
|
||||
CastToDerived().LaunchKernel(shared_mem_size, stream, mem_dev.ptr(), old_vals,
|
||||
device_offset + kern_offset, p);
|
||||
}
|
||||
}
|
||||
|
||||
PerformHostAtomicExchange(p.host_thread_count, host_iters_per_thread, mem_dev.host_ptr(),
|
||||
old_vals.data(), p);
|
||||
|
||||
for (auto i = 0u; i < p.num_devices; ++i) {
|
||||
const auto device_offset = i * p.kernel_count * thread_count;
|
||||
HIP_CHECK(hipMemcpy(old_vals.data() + device_offset, old_vals_devs[i].ptr(),
|
||||
old_vals_alloc_size, hipMemcpyDeviceToHost));
|
||||
}
|
||||
HIP_CHECK(hipMemcpy2D(old_vals.data() + p.num_devices * p.kernel_count * thread_count,
|
||||
sizeof(T), mem_dev.ptr(), p.pitch, sizeof(T), p.width,
|
||||
hipMemcpyDeviceToHost));
|
||||
|
||||
CastToDerived().ValidateResults(old_vals);
|
||||
}
|
||||
|
||||
private:
|
||||
const Derived& CastToDerived() const { return static_cast<const Derived&>(*this); }
|
||||
|
||||
static void HostAtomicExchange(const unsigned int iterations, T* mem, T* const old_vals,
|
||||
const unsigned int width, const unsigned pitch, T base_val) {
|
||||
for (auto i = 0u; i < iterations; ++i) {
|
||||
T new_val = base_val + static_cast<T>(i);
|
||||
T old_val;
|
||||
__atomic_exchange(pitched_offset(mem, pitch, i % width), &new_val, &old_val,
|
||||
__ATOMIC_RELAXED);
|
||||
old_vals[i] = old_val;
|
||||
}
|
||||
}
|
||||
|
||||
void PerformHostAtomicExchange(const unsigned int thread_count, const unsigned int iterations,
|
||||
T* mem, T* const old_vals, const AtomicExchParams& p) const {
|
||||
if (thread_count == 0) {
|
||||
return;
|
||||
}
|
||||
const auto dev_threads =
|
||||
p.blocks.x * p.blocks.y * p.blocks.z * p.threads.x * p.threads.y * p.threads.z;
|
||||
const auto host_base_val = p.num_devices * p.kernel_count * dev_threads + p.width;
|
||||
|
||||
std::vector<std::thread> threads;
|
||||
for (auto i = 0u; i < thread_count; ++i) {
|
||||
const auto thread_base_val = host_base_val + i * iterations;
|
||||
threads.push_back(std::thread(HostAtomicExchange, iterations, mem, old_vals + thread_base_val,
|
||||
p.width, p.pitch, thread_base_val));
|
||||
}
|
||||
|
||||
for (auto& th : threads) {
|
||||
th.join();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, bool use_shared_mem, AtomicScopes scope>
|
||||
class AtomicExch
|
||||
: public AtomicExchCRTP<AtomicExch<T, use_shared_mem, scope>, T, use_shared_mem, scope> {
|
||||
public:
|
||||
void LaunchKernel(const unsigned int shared_mem_size, const hipStream_t stream, T* const mem,
|
||||
T* const old_vals, const T base_val, const AtomicExchParams& p) const {
|
||||
atomic_exch_kernel<T, use_shared_mem, scope><<<p.blocks, p.threads, shared_mem_size, stream>>>(
|
||||
mem, old_vals, p.width, p.pitch, base_val);
|
||||
}
|
||||
|
||||
void ValidateResults(std::vector<T>& old_vals) const {
|
||||
std::sort(old_vals.begin(), old_vals.end());
|
||||
for (auto i = 0u; i < old_vals.size(); ++i) {
|
||||
REQUIRE(i == old_vals[i]);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Catch2 generator for the block dimensions under test: 16 and 1024 threads per block.
inline dim3 GenerateAtomicExchThreadDimensions() {
  const dim3 threads = GENERATE(dim3(16), dim3(1024));
  return threads;
}
|
||||
|
||||
// Catch2 generator for the grid dimensions under test: one block per multiprocessor of device 0,
// and one and a half times that many.
inline dim3 GenerateAtomicExchBlockDimensions() {
  int multiprocessors = 0;
  HIP_CHECK(hipDeviceGetAttribute(&multiprocessors, hipDeviceAttributeMultiprocessorCount, 0));
  const dim3 exact_fit(multiprocessors);
  const dim3 oversubscribed(multiprocessors + multiprocessors / 2);
  return GENERATE_COPY(exact_fit, oversubscribed);
}
|
||||
|
||||
template <typename TestType, AtomicScopes scope>
|
||||
void AtomicExchSingleDeviceSingleKernelTest(const unsigned int width, const unsigned int pitch) {
|
||||
AtomicExchParams params;
|
||||
params.num_devices = 1;
|
||||
params.kernel_count = 1;
|
||||
params.threads = GenerateAtomicExchThreadDimensions();
|
||||
params.width = width;
|
||||
params.pitch = pitch;
|
||||
|
||||
SECTION("Global memory") {
|
||||
params.blocks = GenerateAtomicExchBlockDimensions();
|
||||
using LA = LinearAllocs;
|
||||
for (const auto alloc_type :
|
||||
{LA::hipMalloc, LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister}) {
|
||||
params.alloc_type = alloc_type;
|
||||
DYNAMIC_SECTION("Allocation type: " << to_string(alloc_type)) {
|
||||
AtomicExch<TestType, false, scope>().run(params);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
SECTION("Shared memory") {
|
||||
params.blocks = dim3(1);
|
||||
params.alloc_type = LinearAllocs::hipMalloc;
|
||||
AtomicExch<TestType, true, scope>().run(params);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename TestType, AtomicScopes scope>
|
||||
void AtomicExchSingleDeviceMultipleKernelTest(const unsigned int kernel_count,
|
||||
const unsigned int width, const unsigned int pitch) {
|
||||
int concurrent_kernels = 0;
|
||||
HIP_CHECK(hipDeviceGetAttribute(&concurrent_kernels, hipDeviceAttributeConcurrentKernels, 0));
|
||||
if (!concurrent_kernels) {
|
||||
HipTest::HIP_SKIP_TEST("Test requires support for concurrent kernel execution");
|
||||
return;
|
||||
}
|
||||
|
||||
AtomicExchParams params;
|
||||
params.num_devices = 1;
|
||||
params.kernel_count = kernel_count;
|
||||
params.blocks = GenerateAtomicExchBlockDimensions();
|
||||
params.threads = GenerateAtomicExchThreadDimensions();
|
||||
params.width = width;
|
||||
params.pitch = pitch;
|
||||
|
||||
using LA = LinearAllocs;
|
||||
for (const auto alloc_type :
|
||||
{LA::hipMalloc, LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister}) {
|
||||
params.alloc_type = alloc_type;
|
||||
DYNAMIC_SECTION("Allocation type: " << to_string(alloc_type)) {
|
||||
AtomicExch<TestType, false, scope>().run(params);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename TestType>
|
||||
void AtomicExchMultipleDeviceMultipleKernelAndHostTest(const unsigned int num_devices,
|
||||
const unsigned int kernel_count,
|
||||
const unsigned int width,
|
||||
const unsigned int pitch,
|
||||
const unsigned int host_thread_count = 0u) {
|
||||
if (num_devices > 1) {
|
||||
if (HipTest::getDeviceCount() < num_devices) {
|
||||
std::string msg = std::to_string(num_devices) + " devices are required";
|
||||
HipTest::HIP_SKIP_TEST(msg.c_str());
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (kernel_count > 1) {
|
||||
for (auto i = 0u; i < num_devices; ++i) {
|
||||
int concurrent_kernels = 0;
|
||||
HIP_CHECK(hipDeviceGetAttribute(&concurrent_kernels, hipDeviceAttributeConcurrentKernels, i));
|
||||
if (!concurrent_kernels) {
|
||||
HipTest::HIP_SKIP_TEST("Test requires support for concurrent kernel execution");
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
AtomicExchParams params;
|
||||
params.num_devices = num_devices;
|
||||
params.kernel_count = kernel_count;
|
||||
params.blocks = GenerateAtomicExchBlockDimensions();
|
||||
params.threads = GenerateAtomicExchThreadDimensions();
|
||||
params.width = width;
|
||||
params.pitch = pitch;
|
||||
params.host_thread_count = host_thread_count;
|
||||
|
||||
using LA = LinearAllocs;
|
||||
for (const auto alloc_type : {LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister}) {
|
||||
params.alloc_type = alloc_type;
|
||||
DYNAMIC_SECTION("Allocation type: " << to_string(alloc_type)) {
|
||||
AtomicExch<TestType, false, AtomicScopes::system>().run(params);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,94 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
|
||||
// Class type with user-provided device-side constructor and destructor. The kernels below pass it
// (or a pointer to it) to atomicExch as an argument of a deliberately unsuitable type.
struct Dummy {
|
||||
__device__ Dummy() {}
|
||||
__device__ ~Dummy() {}
|
||||
};
|
||||
|
||||
/*int atomicExch(int*, int)*/
|
||||
// Mistyped calls for the int overload: n1 passes the pointer as the value, n2 passes the address
// of the pointer parameter, n3-n7 pass a pointer to a type other than int, n8 passes a Dummy as
// the value.
// NOTE(review): presumably every call is expected to be rejected at compile time - confirm
// against the test that consumes this file.
__global__ void atomicExch_int_n1(int* p, int v) { atomicExch(p, p); }
|
||||
__global__ void atomicExch_int_n2(int* p, int v) { atomicExch(&p, v); }
|
||||
__global__ void atomicExch_int_n3(char* p, int v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_int_n4(short* p, int v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_int_n5(long* p, int v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_int_n6(long long* p, int v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_int_n7(Dummy* p, int v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_int_n8(int* p, Dummy v) { atomicExch(p, v); }
|
||||
|
||||
/*unsigned int atomicExch(unsigned int*, unsigned int)*/
|
||||
// Mistyped calls for the unsigned int overload: n1 passes the pointer as the value, n2 passes the
// address of the pointer parameter, n3-n7 pass a pointer to a type other than unsigned int, n8
// passes a Dummy as the value.
// NOTE(review): presumably every call is expected to be rejected at compile time - confirm
// against the test that consumes this file.
__global__ void atomicExch_unsigned_int_n1(unsigned int* p, unsigned int v) { atomicExch(p, p); }
|
||||
__global__ void atomicExch_unsigned_int_n2(unsigned int* p, unsigned int v) { atomicExch(&p, v); }
|
||||
__global__ void atomicExch_unsigned_int_n3(char* p, unsigned int v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_unsigned_int_n4(short* p, unsigned int v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_unsigned_int_n5(long* p, unsigned int v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_unsigned_int_n6(long long* p, unsigned int v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_unsigned_int_n7(Dummy* p, unsigned int v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_unsigned_int_n8(unsigned int* p, Dummy v) { atomicExch(p, v); }
|
||||
|
||||
/*unsigned long long atomicExch(unsigned long long*, unsigned long long)*/
|
||||
// Mistyped calls for the unsigned long long overload: n1 passes the pointer as the value, n2
// passes the address of the pointer parameter, n3-n7 pass a pointer to a type other than unsigned
// long long, n8 passes a Dummy as the value.
// NOTE(review): presumably every call is expected to be rejected at compile time - confirm
// against the test that consumes this file.
__global__ void atomicExch_unsigned_long_long_n1(unsigned long long* p, unsigned long long v) {
|
||||
atomicExch(p, p);
|
||||
}
|
||||
__global__ void atomicExch_unsigned_long_long_n2(unsigned long long* p, unsigned long long v) {
|
||||
atomicExch(&p, v);
|
||||
}
|
||||
__global__ void atomicExch_unsigned_long_long_n3(char* p, unsigned long long v) {
|
||||
atomicExch(p, v);
|
||||
}
|
||||
__global__ void atomicExch_unsigned_long_long_n4(short* p, unsigned long long v) {
|
||||
atomicExch(p, v);
|
||||
}
|
||||
__global__ void atomicExch_unsigned_long_long_n5(long* p, unsigned long long v) {
|
||||
atomicExch(p, v);
|
||||
}
|
||||
__global__ void atomicExch_unsigned_long_long_n6(long long* p, unsigned long long v) {
|
||||
atomicExch(p, v);
|
||||
}
|
||||
__global__ void atomicExch_unsigned_long_long_n7(Dummy* p, unsigned long long v) {
|
||||
atomicExch(p, v);
|
||||
}
|
||||
__global__ void atomicExch_unsigned_long_long_n8(unsigned long long* p, Dummy v) {
|
||||
atomicExch(p, v);
|
||||
}
|
||||
|
||||
/*float atomicExch(float*, float)*/
|
||||
// Mistyped calls for the float overload: n1 passes the pointer as the value, n2 passes the
// address of the pointer parameter, n3-n7 pass a pointer to a type other than float, n8 passes a
// Dummy as the value.
// NOTE(review): presumably every call is expected to be rejected at compile time - confirm
// against the test that consumes this file.
__global__ void atomicExch_float_n1(float* p, float v) { atomicExch(p, p); }
|
||||
__global__ void atomicExch_float_n2(float* p, float v) { atomicExch(&p, v); }
|
||||
__global__ void atomicExch_float_n3(char* p, float v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_float_n4(short* p, float v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_float_n5(long* p, float v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_float_n6(long long* p, float v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_float_n7(Dummy* p, float v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_float_n8(float* p, Dummy v) { atomicExch(p, v); }
|
||||
|
||||
/*double atomicExch(double*, double)*/
|
||||
// Mistyped calls for the double overload: n1 passes the pointer as the value, n2 passes the
// address of the pointer parameter, n3-n7 pass a pointer to a type other than double, n8 passes a
// Dummy as the value.
// NOTE(review): presumably every call is expected to be rejected at compile time - confirm
// against the test that consumes this file.
__global__ void atomicExch_double_n1(double* p, double v) { atomicExch(p, p); }
|
||||
__global__ void atomicExch_double_n2(double* p, double v) { atomicExch(&p, v); }
|
||||
__global__ void atomicExch_double_n3(char* p, double v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_double_n4(short* p, double v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_double_n5(long* p, double v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_double_n6(long long* p, double v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_double_n7(Dummy* p, double v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_double_n8(double* p, Dummy v) { atomicExch(p, v); }
|
||||
@@ -0,0 +1,124 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
// Program source, held as a raw string, that declares Dummy and the kernels
// atomicExch_int_n1..n8; each kernel calls atomicExch with one mistyped argument.
// NOTE(review): presumably compiled at run time by a negative test that expects compile errors -
// confirm against the consuming test.
static constexpr auto kAtomicExchInt{
|
||||
R"(
|
||||
struct Dummy {
|
||||
__device__ Dummy() {}
|
||||
__device__ ~Dummy() {}
|
||||
};
|
||||
|
||||
__global__ void atomicExch_int_n1(int* p, int v) { atomicExch(p, p); }
|
||||
__global__ void atomicExch_int_n2(int* p, int v) { atomicExch(&p, v); }
|
||||
__global__ void atomicExch_int_n3(char* p, int v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_int_n4(short* p, int v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_int_n5(long* p, int v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_int_n6(long long* p, int v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_int_n7(Dummy* p, int v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_int_n8(int* p, Dummy v) { atomicExch(p, v); }
|
||||
)"};
|
||||
|
||||
// Program source, held as a raw string, that declares Dummy and the kernels
// atomicExch_unsigned_int_n1..n8; each kernel calls atomicExch with one mistyped argument.
// NOTE(review): presumably compiled at run time by a negative test that expects compile errors -
// confirm against the consuming test.
static constexpr auto kAtomicExchUnsignedInt{
|
||||
R"(
|
||||
struct Dummy {
|
||||
__device__ Dummy() {}
|
||||
__device__ ~Dummy() {}
|
||||
};
|
||||
|
||||
__global__ void atomicExch_unsigned_int_n1(unsigned int* p, unsigned int v) { atomicExch(p, p); }
|
||||
__global__ void atomicExch_unsigned_int_n2(unsigned int* p, unsigned int v) { atomicExch(&p, v); }
|
||||
__global__ void atomicExch_unsigned_int_n3(char* p, unsigned int v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_unsigned_int_n4(short* p, unsigned int v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_unsigned_int_n5(long* p, unsigned int v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_unsigned_int_n6(long long* p, unsigned int v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_unsigned_int_n7(Dummy* p, unsigned int v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_unsigned_int_n8(unsigned int* p, Dummy v) { atomicExch(p, v); }
|
||||
)"};
|
||||
|
||||
// Program source, held as a raw string, that declares Dummy and the kernels
// atomicExch_unsigned_long_long_n1..n8; each kernel calls atomicExch with one mistyped argument.
// NOTE(review): presumably compiled at run time by a negative test that expects compile errors -
// confirm against the consuming test.
static constexpr auto kAtomicExchULL{
|
||||
R"(
|
||||
struct Dummy {
|
||||
__device__ Dummy() {}
|
||||
__device__ ~Dummy() {}
|
||||
};
|
||||
|
||||
__global__ void atomicExch_unsigned_long_long_n1(unsigned long long* p, unsigned long long v) {
|
||||
atomicExch(p, p);
|
||||
}
|
||||
__global__ void atomicExch_unsigned_long_long_n2(unsigned long long* p, unsigned long long v) {
|
||||
atomicExch(&p, v);
|
||||
}
|
||||
__global__ void atomicExch_unsigned_long_long_n3(char* p, unsigned long long v) {
|
||||
atomicExch(p, v);
|
||||
}
|
||||
__global__ void atomicExch_unsigned_long_long_n4(short* p, unsigned long long v) {
|
||||
atomicExch(p, v);
|
||||
}
|
||||
__global__ void atomicExch_unsigned_long_long_n5(long* p, unsigned long long v) {
|
||||
atomicExch(p, v);
|
||||
}
|
||||
__global__ void atomicExch_unsigned_long_long_n6(long long* p, unsigned long long v) {
|
||||
atomicExch(p, v);
|
||||
}
|
||||
__global__ void atomicExch_unsigned_long_long_n7(Dummy* p, unsigned long long v) {
|
||||
atomicExch(p, v);
|
||||
}
|
||||
__global__ void atomicExch_unsigned_long_long_n8(unsigned long long* p, Dummy v) {
|
||||
atomicExch(p, v);
|
||||
}
|
||||
)"};
|
||||
|
||||
// Program source, held as a raw string, that declares Dummy and the kernels
// atomicExch_float_n1..n8; each kernel calls atomicExch with one mistyped argument.
// NOTE(review): presumably compiled at run time by a negative test that expects compile errors -
// confirm against the consuming test.
static constexpr auto kAtomicExchFloat{
|
||||
R"(
|
||||
struct Dummy {
|
||||
__device__ Dummy() {}
|
||||
__device__ ~Dummy() {}
|
||||
};
|
||||
|
||||
__global__ void atomicExch_float_n1(float* p, float v) { atomicExch(p, p); }
|
||||
__global__ void atomicExch_float_n2(float* p, float v) { atomicExch(&p, v); }
|
||||
__global__ void atomicExch_float_n3(char* p, float v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_float_n4(short* p, float v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_float_n5(long* p, float v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_float_n6(long long* p, float v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_float_n7(Dummy* p, float v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_float_n8(float* p, Dummy v) { atomicExch(p, v); }
|
||||
)"};
|
||||
|
||||
// Program source, held as a raw string, that declares Dummy and the kernels
// atomicExch_double_n1..n8; each kernel calls atomicExch with one mistyped argument.
// NOTE(review): presumably compiled at run time by a negative test that expects compile errors -
// confirm against the consuming test.
static constexpr auto kAtomicExchDouble{
|
||||
R"(
|
||||
struct Dummy {
|
||||
__device__ Dummy() {}
|
||||
__device__ ~Dummy() {}
|
||||
};
|
||||
|
||||
__global__ void atomicExch_double_n1(double* p, double v) { atomicExch(p, p); }
|
||||
__global__ void atomicExch_double_n2(double* p, double v) { atomicExch(&p, v); }
|
||||
__global__ void atomicExch_double_n3(char* p, double v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_double_n4(short* p, double v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_double_n5(long* p, double v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_double_n6(long long* p, double v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_double_n7(Dummy* p, double v) { atomicExch(p, v); }
|
||||
__global__ void atomicExch_double_n8(double* p, Dummy v) { atomicExch(p, v); }
|
||||
)"};
|
||||
@@ -0,0 +1,235 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "atomicExch_common.hh"
|
||||
#include "atomicExch_system_negative_kernels_rtc.hh"
|
||||
|
||||
/**
|
||||
* @addtogroup atomicExch_system atomicExch_system
|
||||
* @{
|
||||
* @ingroup AtomicsTest
|
||||
*/
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes a kernel two times concurrently on two devices wherein all threads will perform
|
||||
* an atomic exchange into a runtime determined memory location. Each thread will exchange its own
|
||||
* grid wide linear index + offset into the memory location, storing the return value into a
|
||||
* separate output array slot corresponding to it. Once complete, the union of output array and
|
||||
* exchange memory is validated to contain all values in the range [0, number_of_threads +
|
||||
* number_of_exchange_memory_slots). Several memory access patterns are tested:
|
||||
* -# All threads exchange to a single memory location
|
||||
* -# Each thread exchanges into an array containing warp_size elements, using tid % warp_size
|
||||
* for indexing
|
||||
* -# Same as the above, but the exchange elements are spread out by L1 cache line size bytes.
|
||||
*
|
||||
* - The test is run for:
|
||||
* - All overloads of atomicExch_system
|
||||
* - hipMallocManaged, hipHostMalloc and hipHostRegister allocated exchange memory
|
||||
* - Several grid and block dimension combinations
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - unit/atomics/atomicExch_system.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
#if HT_NVIDIA
|
||||
TEMPLATE_TEST_CASE("Unit_atomicExch_system_Positive_Peer_GPUs", "", int, unsigned int,
|
||||
unsigned long long, float) {
|
||||
#else
|
||||
TEMPLATE_TEST_CASE("Unit_atomicExch_system_Positive_Peer_GPUs", "", int, unsigned int,
|
||||
unsigned long, unsigned long long, float, double) {
|
||||
#endif // HT_NVIDIA
|
||||
int warp_size = 0;
|
||||
HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
|
||||
const auto cache_line_size = 128u;
|
||||
|
||||
for (auto current = 0; current < cmd_options.iterations; ++current) {
|
||||
DYNAMIC_SECTION("Same address " << current) {
|
||||
AtomicExchMultipleDeviceMultipleKernelAndHostTest<TestType>(2, 2, 1, sizeof(TestType));
|
||||
}
|
||||
|
||||
DYNAMIC_SECTION("Adjacent addresses " << current) {
|
||||
AtomicExchMultipleDeviceMultipleKernelAndHostTest<TestType>(2, 2, warp_size,
|
||||
sizeof(TestType));
|
||||
}
|
||||
|
||||
DYNAMIC_SECTION("Scattered addresses " << current) {
|
||||
AtomicExchMultipleDeviceMultipleKernelAndHostTest<TestType>(2, 2, warp_size, cache_line_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes a kernel on a single device wherein all threads will perform an atomic exchange
|
||||
* into a runtime determined memory location. Each thread will exchange its own grid wide linear
|
||||
* index + offset into the memory location, storing the return value into a separate output array
|
||||
* slot corresponding to it. While the kernel is running, the host performs atomic exchanges, in 4
|
||||
* threads, into the same memory location(s). Once complete, the union of output array, exchange
|
||||
* memory, and host output is validated to contain all values in the range [0, number_of_threads +
|
||||
* number_of_exchange_memory_slots + number_of_host_iterations). Several memory access patterns are
|
||||
* tested:
|
||||
* -# All threads exchange to a single memory location
|
||||
* -# Each thread exchanges into an array containing warp_size elements, using tid % warp_size
|
||||
* for indexing
|
||||
* -# Same as the above, but the exchange elements are spread out by L1 cache line size bytes.
|
||||
*
|
||||
* - The test is run for:
|
||||
* - All overloads of atomicExch_system
|
||||
* - hipMallocManaged, hipHostMalloc and hipHostRegister allocated exchange memory
|
||||
* - Several grid and block dimension combinations
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - unit/atomics/atomicExch_system.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
#if HT_NVIDIA
|
||||
TEMPLATE_TEST_CASE("Unit_atomicExch_system_Positive_Host_And_GPU", "", int, unsigned int,
|
||||
unsigned long long, float) {
|
||||
#else
|
||||
TEMPLATE_TEST_CASE("Unit_atomicExch_system_Positive_Host_And_GPU", "", int, unsigned int,
|
||||
unsigned long, unsigned long long, float, double) {
|
||||
#endif // HT_NVIDIA
|
||||
int warp_size = 0;
|
||||
HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
|
||||
const auto cache_line_size = 128u;
|
||||
|
||||
for (auto current = 0; current < cmd_options.iterations; ++current) {
|
||||
DYNAMIC_SECTION("Same address " << current) {
|
||||
AtomicExchMultipleDeviceMultipleKernelAndHostTest<TestType>(1, 1, 1, sizeof(TestType), 4);
|
||||
}
|
||||
|
||||
DYNAMIC_SECTION("Adjacent addresses " << current) {
|
||||
AtomicExchMultipleDeviceMultipleKernelAndHostTest<TestType>(1, 1, warp_size, sizeof(TestType),
|
||||
4);
|
||||
}
|
||||
|
||||
DYNAMIC_SECTION("Scattered addresses " << current) {
|
||||
AtomicExchMultipleDeviceMultipleKernelAndHostTest<TestType>(1, 1, warp_size, cache_line_size,
|
||||
4);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Executes a kernel two times concurrently on two devices wherein all threads will perform
|
||||
* an atomic exchange into a runtime determined memory location. Each thread will exchange its own
|
||||
* grid wide linear index + offset into the memory location, storing the return value into a
|
||||
* separate output array slot corresponding to it. While the kernels are running, the
|
||||
* host performs atomic exchanges, in 4 threads, into the same memory location(s). Once complete,
|
||||
* the union of output array, exchange memory, and host output is validated to contain all values in
|
||||
* the range [0, number_of_threads + number_of_exchange_memory_slots + number_of_host_iterations).
|
||||
* Several memory access patterns are tested:
|
||||
* -# All threads exchange to a single memory location
|
||||
* -# Each thread exchanges into an array containing warp_size elements, using tid % warp_size
|
||||
* for indexing
|
||||
* -# Same as the above, but the exchange elements are spread out by L1 cache line size bytes.
|
||||
*
|
||||
* - The test is run for:
|
||||
* - All overloads of atomicExch_system
|
||||
* - hipMallocManaged, hipHostMalloc and hipHostRegister allocated exchange memory
|
||||
* - Several grid and block dimension combinations
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - unit/atomics/atomicExch_system.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
#if HT_NVIDIA
|
||||
TEMPLATE_TEST_CASE("Unit_atomicExch_system_Positive_Host_And_Peer_GPUs", "", int, unsigned int,
|
||||
unsigned long long, float) {
|
||||
#else
|
||||
TEMPLATE_TEST_CASE("Unit_atomicExch_system_Positive_Host_And_Peer_GPUs", "", int, unsigned int,
|
||||
unsigned long, unsigned long long, float, double) {
|
||||
#endif // HT_NVIDIA
|
||||
int warp_size = 0;
|
||||
HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
|
||||
const auto cache_line_size = 128u;
|
||||
|
||||
for (auto current = 0; current < cmd_options.iterations; ++current) {
|
||||
DYNAMIC_SECTION("Same address " << current) {
|
||||
AtomicExchMultipleDeviceMultipleKernelAndHostTest<TestType>(2, 2, 1, sizeof(TestType), 4);
|
||||
}
|
||||
|
||||
DYNAMIC_SECTION("Adjacent addresses " << current) {
|
||||
AtomicExchMultipleDeviceMultipleKernelAndHostTest<TestType>(2, 2, warp_size, sizeof(TestType),
|
||||
4);
|
||||
}
|
||||
|
||||
DYNAMIC_SECTION("Scattered addresses " << current) {
|
||||
AtomicExchMultipleDeviceMultipleKernelAndHostTest<TestType>(2, 2, warp_size, cache_line_size,
|
||||
4);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - RTCs kernels that pass combinations of arguments of invalid types for all overloads of
|
||||
* atomicExch_system
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - unit/atomics/atomicExch_system.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Unit_atomicExch_system_Negative_Parameters_RTC") {
|
||||
hiprtcProgram program{};
|
||||
|
||||
const auto program_source =
|
||||
GENERATE(kAtomicExchSystemInt, kAtomicExchSystemUnsignedInt, kAtomicExchSystemULL,
|
||||
kAtomicExchSystemFloat, kAtomicExchSystemDouble);
|
||||
HIPRTC_CHECK(
|
||||
hiprtcCreateProgram(&program, program_source, "atomicExch_negative.cc", 0, nullptr, nullptr));
|
||||
hiprtcResult result{hiprtcCompileProgram(program, 0, nullptr)};
|
||||
|
||||
// Get the compile log and count compiler error messages
|
||||
size_t log_size{};
|
||||
HIPRTC_CHECK(hiprtcGetProgramLogSize(program, &log_size));
|
||||
std::string log(log_size, ' ');
|
||||
HIPRTC_CHECK(hiprtcGetProgramLog(program, log.data()));
|
||||
int error_count{0};
|
||||
|
||||
int expected_error_count{8};
|
||||
std::string error_message{"error:"};
|
||||
|
||||
size_t n_pos = log.find(error_message, 0);
|
||||
while (n_pos != std::string::npos) {
|
||||
++error_count;
|
||||
n_pos = log.find(error_message, n_pos + 1);
|
||||
}
|
||||
|
||||
HIPRTC_CHECK(hiprtcDestroyProgram(&program));
|
||||
HIPRTC_CHECK_ERROR(result, HIPRTC_ERROR_COMPILATION);
|
||||
REQUIRE(error_count == expected_error_count);
|
||||
}
|
||||
@@ -0,0 +1,112 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
|
||||
struct Dummy {
|
||||
__device__ Dummy() {}
|
||||
__device__ ~Dummy() {}
|
||||
};
|
||||
|
||||
/*int atomicExch_system(int*, int)*/
|
||||
__global__ void atomicExch_system_int_n1(int* p, int v) { atomicExch_system(p, p); }
|
||||
__global__ void atomicExch_system_int_n2(int* p, int v) { atomicExch_system(&p, v); }
|
||||
__global__ void atomicExch_system_int_n3(char* p, int v) { atomicExch_system(p, v); }
|
||||
__global__ void atomicExch_system_int_n4(short* p, int v) { atomicExch_system(p, v); }
|
||||
__global__ void atomicExch_system_int_n5(long* p, int v) { atomicExch_system(p, v); }
|
||||
__global__ void atomicExch_system_int_n6(long long* p, int v) { atomicExch_system(p, v); }
|
||||
__global__ void atomicExch_system_int_n7(Dummy* p, int v) { atomicExch_system(p, v); }
|
||||
__global__ void atomicExch_system_int_n8(int* p, Dummy v) { atomicExch_system(p, v); }
|
||||
|
||||
/*unsigned int atomicExch_system(unsigned int*, unsigned int)*/
|
||||
__global__ void atomicExch_system_unsigned_int_n1(unsigned int* p, unsigned int v) {
|
||||
atomicExch_system(p, p);
|
||||
}
|
||||
__global__ void atomicExch_system_unsigned_int_n2(unsigned int* p, unsigned int v) {
|
||||
atomicExch_system(&p, v);
|
||||
}
|
||||
__global__ void atomicExch_system_unsigned_int_n3(char* p, unsigned int v) {
|
||||
atomicExch_system(p, v);
|
||||
}
|
||||
__global__ void atomicExch_system_unsigned_int_n4(short* p, unsigned int v) {
|
||||
atomicExch_system(p, v);
|
||||
}
|
||||
__global__ void atomicExch_system_unsigned_int_n5(long* p, unsigned int v) {
|
||||
atomicExch_system(p, v);
|
||||
}
|
||||
__global__ void atomicExch_system_unsigned_int_n6(long long* p, unsigned int v) {
|
||||
atomicExch_system(p, v);
|
||||
}
|
||||
__global__ void atomicExch_system_unsigned_int_n7(Dummy* p, unsigned int v) {
|
||||
atomicExch_system(p, v);
|
||||
}
|
||||
__global__ void atomicExch_system_unsigned_int_n8(unsigned int* p, Dummy v) {
|
||||
atomicExch_system(p, v);
|
||||
}
|
||||
|
||||
// /*unsigned long long atomicExch_system(unsigned long long*, unsigned long long)*/
|
||||
__global__ void atomicExch_system_unsigned_long_long_n1(unsigned long long* p,
|
||||
unsigned long long v) {
|
||||
atomicExch_system(p, p);
|
||||
}
|
||||
__global__ void atomicExch_system_unsigned_long_long_n2(unsigned long long* p,
|
||||
unsigned long long v) {
|
||||
atomicExch_system(&p, v);
|
||||
}
|
||||
__global__ void atomicExch_system_unsigned_long_long_n3(char* p, unsigned long long v) {
|
||||
atomicExch_system(p, v);
|
||||
}
|
||||
__global__ void atomicExch_system_unsigned_long_long_n4(short* p, unsigned long long v) {
|
||||
atomicExch_system(p, v);
|
||||
}
|
||||
__global__ void atomicExch_system_unsigned_long_long_n5(long* p, unsigned long long v) {
|
||||
atomicExch_system(p, v);
|
||||
}
|
||||
__global__ void atomicExch_system_unsigned_long_long_n6(long long* p, unsigned long long v) {
|
||||
atomicExch_system(p, v);
|
||||
}
|
||||
__global__ void atomicExch_system_unsigned_long_long_n7(Dummy* p, unsigned long long v) {
|
||||
atomicExch_system(p, v);
|
||||
}
|
||||
__global__ void atomicExch_system_unsigned_long_long_n8(unsigned long long* p, Dummy v) {
|
||||
atomicExch_system(p, v);
|
||||
}
|
||||
|
||||
// /*float atomicExch_system(float*, float)*/
|
||||
__global__ void atomicExch_system_float_n1(float* p, float v) { atomicExch_system(p, p); }
|
||||
__global__ void atomicExch_system_float_n2(float* p, float v) { atomicExch_system(&p, v); }
|
||||
__global__ void atomicExch_system_float_n3(char* p, float v) { atomicExch_system(p, v); }
|
||||
__global__ void atomicExch_system_float_n4(short* p, float v) { atomicExch_system(p, v); }
|
||||
__global__ void atomicExch_system_float_n5(long* p, float v) { atomicExch_system(p, v); }
|
||||
__global__ void atomicExch_system_float_n6(long long* p, float v) { atomicExch_system(p, v); }
|
||||
__global__ void atomicExch_system_float_n7(Dummy* p, float v) { atomicExch_system(p, v); }
|
||||
__global__ void atomicExch_system_float_n8(float* p, Dummy v) { atomicExch_system(p, v); }
|
||||
|
||||
// /*double atomicExch_system(double*, double)*/
|
||||
__global__ void atomicExch_system_double_n1(double* p, double v) { atomicExch_system(p, p); }
|
||||
__global__ void atomicExch_system_double_n2(double* p, double v) { atomicExch_system(&p, v); }
|
||||
__global__ void atomicExch_system_double_n3(char* p, double v) { atomicExch_system(p, v); }
|
||||
__global__ void atomicExch_system_double_n4(short* p, double v) { atomicExch_system(p, v); }
|
||||
__global__ void atomicExch_system_double_n5(long* p, double v) { atomicExch_system(p, v); }
|
||||
__global__ void atomicExch_system_double_n6(long long* p, double v) { atomicExch_system(p, v); }
|
||||
__global__ void atomicExch_system_double_n7(Dummy* p, double v) { atomicExch_system(p, v); }
|
||||
__global__ void atomicExch_system_double_n8(double* p, Dummy v) { atomicExch_system(p, v); }
|
||||
@@ -0,0 +1,142 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
// Runtime-compiled source holding eight kernels, each making one call to atomicExch_system with
// an argument combination that does not match the int overload. The literal contains only the
// program text: any stray line inside it would be compiled too and would alter the number of
// diagnostics produced.
static constexpr auto kAtomicExchSystemInt{
    R"(
    struct Dummy {
      __device__ Dummy() {}
      __device__ ~Dummy() {}
    };

    __global__ void atomicExch_system_int_n1(int* p, int v) { atomicExch_system(p, p); }
    __global__ void atomicExch_system_int_n2(int* p, int v) { atomicExch_system(&p, v); }
    __global__ void atomicExch_system_int_n3(char* p, int v) { atomicExch_system(p, v); }
    __global__ void atomicExch_system_int_n4(short* p, int v) { atomicExch_system(p, v); }
    __global__ void atomicExch_system_int_n5(long* p, int v) { atomicExch_system(p, v); }
    __global__ void atomicExch_system_int_n6(long long* p, int v) { atomicExch_system(p, v); }
    __global__ void atomicExch_system_int_n7(Dummy* p, int v) { atomicExch_system(p, v); }
    __global__ void atomicExch_system_int_n8(int* p, Dummy v) { atomicExch_system(p, v); }
  )"};
|
||||
|
||||
// Runtime-compiled source holding eight kernels, each making one call to atomicExch_system with
// an argument combination that does not match the unsigned int overload. The literal contains
// only the program text: any stray line inside it would be compiled too and would alter the
// number of diagnostics produced.
static constexpr auto kAtomicExchSystemUnsignedInt{
    R"(
    struct Dummy {
      __device__ Dummy() {}
      __device__ ~Dummy() {}
    };

    __global__ void atomicExch_system_unsigned_int_n1(unsigned int* p, unsigned int v) {
      atomicExch_system(p, p);
    }
    __global__ void atomicExch_system_unsigned_int_n2(unsigned int* p, unsigned int v) {
      atomicExch_system(&p, v);
    }
    __global__ void atomicExch_system_unsigned_int_n3(char* p, unsigned int v) {
      atomicExch_system(p, v);
    }
    __global__ void atomicExch_system_unsigned_int_n4(short* p, unsigned int v) {
      atomicExch_system(p, v);
    }
    __global__ void atomicExch_system_unsigned_int_n5(long* p, unsigned int v) {
      atomicExch_system(p, v);
    }
    __global__ void atomicExch_system_unsigned_int_n6(long long* p, unsigned int v) {
      atomicExch_system(p, v);
    }
    __global__ void atomicExch_system_unsigned_int_n7(Dummy* p, unsigned int v) {
      atomicExch_system(p, v);
    }
    __global__ void atomicExch_system_unsigned_int_n8(unsigned int* p, Dummy v) {
      atomicExch_system(p, v);
    }
  )"};
|
||||
|
||||
// Runtime-compiled source holding eight kernels, each making one call to atomicExch_system with
// an argument combination that does not match the unsigned long long overload. The literal
// contains only the program text: any stray line inside it would be compiled too and would alter
// the number of diagnostics produced.
static constexpr auto kAtomicExchSystemULL{
    R"(
    struct Dummy {
      __device__ Dummy() {}
      __device__ ~Dummy() {}
    };

    __global__ void atomicExch_system_unsigned_long_long_n1(unsigned long long* p,
                                                            unsigned long long v) {
      atomicExch_system(p, p);
    }
    __global__ void atomicExch_system_unsigned_long_long_n2(unsigned long long* p,
                                                            unsigned long long v) {
      atomicExch_system(&p, v);
    }
    __global__ void atomicExch_system_unsigned_long_long_n3(char* p, unsigned long long v) {
      atomicExch_system(p, v);
    }
    __global__ void atomicExch_system_unsigned_long_long_n4(short* p, unsigned long long v) {
      atomicExch_system(p, v);
    }
    __global__ void atomicExch_system_unsigned_long_long_n5(long* p, unsigned long long v) {
      atomicExch_system(p, v);
    }
    __global__ void atomicExch_system_unsigned_long_long_n6(long long* p, unsigned long long v) {
      atomicExch_system(p, v);
    }
    __global__ void atomicExch_system_unsigned_long_long_n7(Dummy* p, unsigned long long v) {
      atomicExch_system(p, v);
    }
    __global__ void atomicExch_system_unsigned_long_long_n8(unsigned long long* p, Dummy v) {
      atomicExch_system(p, v);
    }
  )"};
|
||||
|
||||
// Runtime-compiled source holding eight kernels, each making one call to atomicExch_system with
// an argument combination that does not match the float overload. The literal contains only the
// program text: any stray line inside it would be compiled too and would alter the number of
// diagnostics produced.
static constexpr auto kAtomicExchSystemFloat{
    R"(
    struct Dummy {
      __device__ Dummy() {}
      __device__ ~Dummy() {}
    };

    __global__ void atomicExch_system_float_n1(float* p, float v) { atomicExch_system(p, p); }
    __global__ void atomicExch_system_float_n2(float* p, float v) { atomicExch_system(&p, v); }
    __global__ void atomicExch_system_float_n3(char* p, float v) { atomicExch_system(p, v); }
    __global__ void atomicExch_system_float_n4(short* p, float v) { atomicExch_system(p, v); }
    __global__ void atomicExch_system_float_n5(long* p, float v) { atomicExch_system(p, v); }
    __global__ void atomicExch_system_float_n6(long long* p, float v) { atomicExch_system(p, v); }
    __global__ void atomicExch_system_float_n7(Dummy* p, float v) { atomicExch_system(p, v); }
    __global__ void atomicExch_system_float_n8(float* p, Dummy v) { atomicExch_system(p, v); }
  )"};
|
||||
|
||||
// Runtime-compiled source holding eight kernels, each making one call to atomicExch_system with
// an argument combination that does not match the double overload. The literal contains only the
// program text: any stray line inside it would be compiled too and would alter the number of
// diagnostics produced.
static constexpr auto kAtomicExchSystemDouble{
    R"(
    struct Dummy {
      __device__ Dummy() {}
      __device__ ~Dummy() {}
    };

    __global__ void atomicExch_system_double_n1(double* p, double v) { atomicExch_system(p, p); }
    __global__ void atomicExch_system_double_n2(double* p, double v) { atomicExch_system(&p, v); }
    __global__ void atomicExch_system_double_n3(char* p, double v) { atomicExch_system(p, v); }
    __global__ void atomicExch_system_double_n4(short* p, double v) { atomicExch_system(p, v); }
    __global__ void atomicExch_system_double_n5(long* p, double v) { atomicExch_system(p, v); }
    __global__ void atomicExch_system_double_n6(long long* p, double v) { atomicExch_system(p, v); }
    __global__ void atomicExch_system_double_n7(Dummy* p, double v) { atomicExch_system(p, v); }
    __global__ void atomicExch_system_double_n8(double* p, Dummy v) { atomicExch_system(p, v); }
  )"};
|
||||
@@ -0,0 +1,107 @@
|
||||
# Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
import subprocess
|
||||
import sys
|
||||
import unittest
|
||||
|
||||
class CompileAndCapture(unittest.TestCase):
|
||||
path = None
|
||||
expected_error_count = 0
|
||||
expected_warning_count = 0
|
||||
hip_path = None
|
||||
file = None
|
||||
error_string = None
|
||||
warning_string = None
|
||||
platform = None
|
||||
|
||||
def setUp(self):
|
||||
self.error_string = 'error:'
|
||||
self.warning_string = 'warning:'
|
||||
self.assertFalse(self.hip_path == None)
|
||||
self.assertFalse(self.path == None)
|
||||
self.assertFalse(self.file == None)
|
||||
self.assertTrue(self.platform == 'amd' or self.platform == 'nvidia')
|
||||
|
||||
def test(self):
|
||||
compiler_args = [
|
||||
self.hip_path + '/bin/hipcc',
|
||||
'-I' + self.path + '/../../external/Catch2',
|
||||
'-I' + self.path + '/../../include',
|
||||
'-I' + self.path + '/../../external/picojson',
|
||||
'-c',
|
||||
self.path + '/' + self.file,
|
||||
]
|
||||
# HIP compiler on AMD platforms has limit of 20 errors, and some negative
|
||||
# test cases expect that more errors are detected.
|
||||
if (self.platform == 'amd'):
|
||||
compiler_args.append('-ferror-limit=100')
|
||||
compiler_output = subprocess.run(compiler_args, stderr=subprocess.PIPE)
|
||||
# Get the compiler output in the stdout if -V flag is raised during ctest invocation.
|
||||
compiler_stderr = compiler_output.stderr.decode('UTF-8')
|
||||
print(compiler_stderr)
|
||||
|
||||
error_count = compiler_stderr.count(self.error_string)
|
||||
if self.expected_error_count < 0:
|
||||
self.assertGreater(error_count, 0)
|
||||
else:
|
||||
self.assertEqual(error_count, self.expected_error_count)
|
||||
|
||||
warning_count = compiler_stderr.count(self.warning_string)
|
||||
if self.expected_warning_count < 0:
|
||||
self.assertGreater(warning_count, 0)
|
||||
else:
|
||||
self.assertEqual(warning_count, self.expected_warning_count)
|
||||
|
||||
if __name__ == '__main__':
    # Positional arguments, in order: path, platform, hip_path, file,
    # expected_error_count, expected_warning_count. A missing string argument
    # becomes None and a missing count becomes 0.
    cli = sys.argv
    supplied = len(cli)

    CompileAndCapture.path = cli[1] if supplied > 1 else None
    CompileAndCapture.platform = cli[2] if supplied > 2 else None
    CompileAndCapture.hip_path = cli[3] if supplied > 3 else None
    CompileAndCapture.file = cli[4] if supplied > 4 else None
    CompileAndCapture.expected_error_count = int(cli[5]) if supplied > 5 else 0
    CompileAndCapture.expected_warning_count = int(cli[6]) if supplied > 6 else 0

    # Unittest looks at the same argv's as the __main__ and doesn't know how
    # to handle arguments other than the executable (0). Therefore passing only
    # executable as the argv for unittest module.
    unittest.main(argv=[cli[0]])
|
||||
@@ -1,26 +1,25 @@
|
||||
# Common Tests - Test independent of all platforms
|
||||
set(TEST_SRC
|
||||
hipCGThreadBlockType.cc
|
||||
hipCGThreadBlockTypeViaBaseType.cc
|
||||
hipCGThreadBlockTypeViaPublicApi.cc
|
||||
hipCGMultiGridGroupType.cc
|
||||
hipCGMultiGridGroupTypeViaBaseType.cc
|
||||
hipCGMultiGridGroupTypeViaPublicApi.cc
|
||||
hipCGThreadBlockType_old.cc
|
||||
hipCGMultiGridGroupType_old.cc
|
||||
hipCGGridGroupType_old.cc
|
||||
hipCGTiledPartitionType_old.cc
|
||||
hipCGThreadBlockTileTypeShfl_old.cc
|
||||
hipCGCoalescedGroups_old.cc
|
||||
hipLaunchCooperativeKernel_old.cc
|
||||
hipLaunchCooperativeKernelMultiDevice_old.cc
|
||||
grid_group.cc
|
||||
coalesced_groups_shfl_down.cc
|
||||
coalesced_groups_shfl_up.cc
|
||||
hipCGTiledPartition.cc
|
||||
hipCGCoalescedGroups.cc
|
||||
coalesced_tiled_groups_metagrp.cc
|
||||
)
|
||||
if(HIP_PLATFORM STREQUAL "nvidia")
|
||||
set_source_files_properties(hipCGMultiGridGroupType.cc PROPERTIES COMPILE_FLAGS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
|
||||
set_source_files_properties(hipCGMultiGridGroupTypeViaBaseType.cc PROPERTIES COMPILE_FLAGS "-D_CG_ABI_EXPERIMENTAL -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
|
||||
set_source_files_properties(hipCGMultiGridGroupTypeViaPublicApi.cc PROPERTIES COMPILE_FLAGS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
|
||||
set_source_files_properties(hipCGMultiGridGroupType_old.cc PROPERTIES COMPILE_FLAGS "-D_CG_ABI_EXPERIMENTAL -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
|
||||
set_source_files_properties(hipLaunchCooperativeKernelMultiDevice_old.cc PROPERTIES COMPILE_FLAGS "-D_CG_ABI_EXPERIMENTAL -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
|
||||
hip_add_exe_to_target(NAME coopGrpTest
|
||||
TEST_SRC ${TEST_SRC}
|
||||
TEST_TARGET_NAME build_tests
|
||||
LINKER_LIBS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
|
||||
LINKER_LIBS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80, -gencode arch=compute_86,code=sm_86, -gencode=arch=compute_86,code=compute_86")
|
||||
else()
|
||||
hip_add_exe_to_target(NAME coopGrpTest
|
||||
TEST_SRC ${TEST_SRC}
|
||||
|
||||
@@ -0,0 +1,496 @@
|
||||
/*
|
||||
Copyright (c) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#include <hip_test_common.hh>
#include <hip/hip_cooperative_groups.h>

#include <vector>

#include "hip_cg_common.hh"
|
||||
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
// Two device-side scratch values used by the sync checks in the kernels below: thread 0 of
// block 0 writes gm[0], thread 0 of block 1 writes gm[1], and after the group sync every thread
// reads both.
static __device__ int gm[2];
|
||||
|
||||
// Records, per thread, what cg::grid_group member functions report: size(), thread_rank() and
// is_valid(). It then checks sync(): the first thread of blocks 0 and 1 each publish a value to
// gm, the grid synchronizes, and every thread stores the product of both values.
// All output arrays are indexed by the thread's linear index along x.
static __global__ void kernel_cg_grid_group_type(int* size_dev, int* thd_rank_dev,
                                                 int* is_valid_dev, int* sync_dev) {
  cg::grid_group grid = cg::this_grid();
  const int global_idx = threadIdx.x + (blockIdx.x * blockDim.x);

  // size, thread_rank and is_valid as seen by this thread
  size_dev[global_idx] = grid.size();
  thd_rank_dev[global_idx] = grid.thread_rank();
  is_valid_dev[global_idx] = grid.is_valid();

  // sync: only the leading thread of block 0 / block 1 writes before the barrier
  if (threadIdx.x == 0) {
    if (blockIdx.x == 0) {
      gm[0] = 10;
    } else if (blockIdx.x == 1) {
      gm[1] = 20;
    }
  }
  grid.sync();
  sync_dev[global_idx] = gm[1] * gm[0];
}
|
||||
|
||||
// Same checks as the grid_group kernel, but the grid group is accessed through a
// cg::thread_group object. Records size(), thread_rank() and is_valid() per thread, then checks
// sync() by publishing two values to gm before the barrier and multiplying them afterwards.
static __global__ void kernel_cg_grid_group_type_via_base_type(int* size_dev, int* thd_rank_dev,
                                                               int* is_valid_dev, int* sync_dev) {
  cg::thread_group group = cg::this_grid();
  const int global_idx = threadIdx.x + (blockIdx.x * blockDim.x);

  // size and thread_rank as seen by this thread
  size_dev[global_idx] = group.size();
  thd_rank_dev[global_idx] = group.thread_rank();

  // is_valid
#ifdef __HIP_PLATFORM_AMD__
  is_valid_dev[global_idx] = group.is_valid();
#else
  // Cuda has no thread_group.is_valid()
  is_valid_dev[global_idx] = true;
#endif

  // sync: only the leading thread of block 0 / block 1 writes before the barrier
  if (threadIdx.x == 0) {
    if (blockIdx.x == 0) {
      gm[0] = 10;
    } else if (blockIdx.x == 1) {
      gm[1] = 20;
    }
  }
  group.sync();
  sync_dev[global_idx] = gm[1] * gm[0];
}
|
||||
|
||||
// Same checks as the grid_group kernel, but size, rank and sync go through the free functions
// cg::group_size(), cg::thread_rank() and cg::sync(); validity is still read via is_valid().
static __global__ void kernel_cg_grid_group_type_via_public_api(int* size_dev, int* thd_rank_dev,
                                                                int* is_valid_dev, int* sync_dev) {
  cg::grid_group grid = cg::this_grid();
  const int global_idx = threadIdx.x + (blockIdx.x * blockDim.x);

  // group_size and thread_rank free functions
  size_dev[global_idx] = cg::group_size(grid);
  thd_rank_dev[global_idx] = cg::thread_rank(grid);

  // is_valid
  is_valid_dev[global_idx] = grid.is_valid();

  // sync: only the leading thread of block 0 / block 1 writes before the barrier
  if (threadIdx.x == 0) {
    if (blockIdx.x == 0) {
      gm[0] = 10;
    } else if (blockIdx.x == 1) {
      gm[1] = 20;
    }
  }
  cg::sync(grid);
  sync_dev[global_idx] = gm[1] * gm[0];
}
|
||||
|
||||
// Repeats, `loops` times, two phases separated by grid-wide barriers:
//   1. second_array[k] += first_array[k]
//   2. first_array[k]  += second_array[array_len - k - 1]
// Each thread handles the elements rank, rank + grid_size, rank + 2 * grid_size, ...
// Counters and offsets are unsigned to match `loops`, `array_len` and the group size, which
// avoids signed/unsigned comparisons and signed overflow for large arrays.
static __global__ void coop_kernel(unsigned int* first_array, unsigned int* second_array,
                                   unsigned int loops, unsigned int array_len) {
  cg::grid_group grid = cg::this_grid();
  const unsigned int rank = grid.thread_rank();
  const unsigned int grid_size = grid.size();

  for (unsigned int i = 0; i < loops; ++i) {
    // The goal of this loop is to directly add in values from
    // array one into array two, on a per-wave basis.
    for (unsigned int offset = rank; offset < array_len; offset += grid_size) {
      second_array[offset] += first_array[offset];
    }

    // Every element of array two must be updated before any thread reads its mirror below.
    grid.sync();

    // The goal of this loop is to pull data the "mirror" lane in
    // array two and add it back into array one. This causes inter-
    // thread swizzling.
    for (unsigned int offset = rank; offset < array_len; offset += grid_size) {
      const unsigned int swizzle_offset = array_len - offset - 1;
      first_array[offset] += second_array[swizzle_offset];
    }

    // Array one must be complete before the next iteration reads it.
    grid.sync();
  }
}
|
||||
|
||||
// Barrier stress kernel. In each of `loops` iterations the last thread of the grid is delayed
// with a clock64() busy-wait, thread 0 of every block stores the value returned by an atomicInc
// on atomic_val[0] into `array`, and the grid synchronizes. Block b writes array[b] in the first
// iteration and advances its slot by gridDim.x per iteration.
// The loop counter and the slot index are unsigned to match `loops`, blockIdx.x and gridDim.x.
static __global__ void test_kernel(unsigned int* atomic_val, unsigned int* array,
                                   unsigned int loops) {
  // Number of clock ticks the last thread waits before it reaches the atomic.
  constexpr long long kDelayTicks = 1000000;

  cg::grid_group grid = cg::this_grid();
  const unsigned int rank = grid.thread_rank();

  unsigned int offset = blockIdx.x;
  for (unsigned int i = 0; i < loops; ++i) {
    // Make the last thread run way behind everyone else.
    // If the barrier below fails, then the other threads may hit the
    // atomicInc instruction many times before the last thread ever gets to it.
    // As such, without the barrier, the last array entry will eventually
    // contain a very large value, defined by however many times the other
    // wavefronts make it through this loop.
    // If the barrier works, then it will likely contain some number
    // near "total number of blocks". It will be the last wavefront to
    // reach the atomicInc, but everyone will have only hit the atomic once.
    if (rank == (grid.size() - 1)) {
      long long time_diff = 0;
      long long last_clock = clock64();
      do {
        const long long cur_clock = clock64();
        if (cur_clock > last_clock) {
          time_diff += (cur_clock - last_clock);
        }
        // If it rolls over, we don't know how much to add to catch up.
        // So just ignore those slipped cycles.
        last_clock = cur_clock;
      } while (time_diff < kDelayTicks);
    }

    if (threadIdx.x == 0) {
      array[offset] = atomicInc(&atomic_val[0], UINT_MAX);
    }
    grid.sync();
    offset += gridDim.x;
  }
}
|
||||
|
||||
// Variant of the barrier stress kernel that measures the delay with wall_clock64(). The body is
// only compiled when HT_AMD is set; otherwise the kernel is empty.
// In each of `loops` iterations the last thread of the grid is delayed with a busy-wait,
// thread 0 of every block stores the value returned by an atomicInc on atomic_val[0] into
// `array`, and the grid synchronizes. Block b writes array[b] in the first iteration and advances
// its slot by gridDim.x per iteration.
// The loop counter and the slot index are unsigned to match `loops`, blockIdx.x and gridDim.x.
__global__ void test_kernel_gfx11(unsigned int* atomic_val, unsigned int* array,
                                  unsigned int loops) {
#if HT_AMD
  // Number of wall-clock ticks the last thread waits before it reaches the atomic.
  constexpr long long kDelayTicks = 1000000;

  cg::grid_group grid = cg::this_grid();
  const unsigned int rank = grid.thread_rank();

  unsigned int offset = blockIdx.x;
  for (unsigned int i = 0; i < loops; ++i) {
    // Make the last thread run way behind everyone else.
    // If the barrier below fails, then the other threads may hit the
    // atomicInc instruction many times before the last thread ever gets
    // to it.
    // As such, without the barrier, the last array entry will eventually
    // contain a very large value, defined by however many times the other
    // wavefronts make it through this loop.
    // If the barrier works, then it will likely contain some number
    // near "total number of blocks". It will be the last wavefront to
    // reach the atomicInc, but everyone will have only hit the atomic once.
    if (rank == (grid.size() - 1)) {
      long long time_diff = 0;
      long long last_clock = wall_clock64();
      do {
        const long long cur_clock = wall_clock64();
        if (cur_clock > last_clock) {
          time_diff += (cur_clock - last_clock);
        }
        // If it rolls over, we don't know how much to add to catch up.
        // So just ignore those slipped cycles.
        last_clock = cur_clock;
      } while (time_diff < kDelayTicks);
    }

    if (threadIdx.x == 0) {
      array[offset] = atomicInc(&atomic_val[0], UINT_MAX);
    }
    grid.sync();
    offset += gridDim.x;
  }
#endif
}
|
||||
|
||||
static void verify_coop_buffers(unsigned int* host_input, unsigned int* first_array,
|
||||
unsigned int* second_array, unsigned int loops,
|
||||
unsigned int array_len) {
|
||||
unsigned int* expected_first_array = host_input;
|
||||
unsigned int* expected_second_array =
|
||||
reinterpret_cast<unsigned int*>(malloc(sizeof(unsigned int) * array_len));
|
||||
memset(expected_second_array, 0, sizeof(unsigned int) * array_len);
|
||||
|
||||
for (int i = 0; i < loops; i++) {
|
||||
for (int offset = 0; offset < array_len; offset++) {
|
||||
expected_second_array[offset] += expected_first_array[offset];
|
||||
}
|
||||
|
||||
for (int offset = 0; offset < array_len; offset++) {
|
||||
unsigned int swizzle_offset = array_len - offset - 1;
|
||||
expected_first_array[offset] += expected_second_array[swizzle_offset];
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < array_len; i++) {
|
||||
REQUIRE(first_array[i] == expected_first_array[i]);
|
||||
REQUIRE(second_array[i] == expected_second_array[i]);
|
||||
}
|
||||
|
||||
free(expected_second_array);
|
||||
}
|
||||
|
||||
static void verify_barrier_buffer(unsigned int loops, unsigned int warps,
|
||||
unsigned int* host_buffer) {
|
||||
unsigned int max_in_this_loop = 0;
|
||||
for (unsigned int i = 0; i < loops; i++) {
|
||||
max_in_this_loop += warps;
|
||||
for (unsigned int j = 0; j < warps; j++) {
|
||||
REQUIRE(host_buffer[i * warps + j] <= max_in_this_loop);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename F> static void test_cg_grid_group_type(F kernel_func, int block_size) {
|
||||
int num_bytes = sizeof(int) * 2 * block_size;
|
||||
int *size_dev, *size_host;
|
||||
int *thd_rank_dev, *thd_rank_host;
|
||||
int *is_valid_dev, *is_valid_host;
|
||||
int *sync_dev, *sync_host;
|
||||
|
||||
// Allocate device memory
|
||||
HIP_CHECK(hipMalloc(&size_dev, num_bytes));
|
||||
HIP_CHECK(hipMalloc(&thd_rank_dev, num_bytes));
|
||||
HIP_CHECK(hipMalloc(&is_valid_dev, num_bytes));
|
||||
HIP_CHECK(hipMalloc(&sync_dev, num_bytes));
|
||||
|
||||
// Allocate host memory
|
||||
HIP_CHECK(hipHostMalloc(&size_host, num_bytes));
|
||||
HIP_CHECK(hipHostMalloc(&thd_rank_host, num_bytes));
|
||||
HIP_CHECK(hipHostMalloc(&is_valid_host, num_bytes));
|
||||
HIP_CHECK(hipHostMalloc(&sync_host, num_bytes));
|
||||
|
||||
// Launch Kernel
|
||||
void* params[4];
|
||||
params[0] = &size_dev;
|
||||
params[1] = &thd_rank_dev;
|
||||
params[2] = &is_valid_dev;
|
||||
params[3] = &sync_dev;
|
||||
HIP_CHECK(hipLaunchCooperativeKernel(kernel_func, 2, block_size, params, 0, 0));
|
||||
|
||||
// Copy result from device to host
|
||||
HIP_CHECK(hipMemcpy(size_host, size_dev, num_bytes, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipMemcpy(thd_rank_host, thd_rank_dev, num_bytes, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipMemcpy(is_valid_host, is_valid_dev, num_bytes, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipMemcpy(sync_host, sync_dev, num_bytes, hipMemcpyDeviceToHost));
|
||||
|
||||
// Validate results for both blocks together
|
||||
for (int i = 0; i < 2 * block_size; ++i) {
|
||||
ASSERT_EQUAL(size_host[i], 2 * block_size);
|
||||
ASSERT_EQUAL(thd_rank_host[i], i);
|
||||
ASSERT_EQUAL(is_valid_host[i], 1);
|
||||
ASSERT_EQUAL(sync_host[i], 200);
|
||||
}
|
||||
|
||||
// Free device memory
|
||||
HIP_CHECK(hipFree(size_dev));
|
||||
HIP_CHECK(hipFree(thd_rank_dev));
|
||||
HIP_CHECK(hipFree(is_valid_dev));
|
||||
HIP_CHECK(hipFree(sync_dev));
|
||||
|
||||
// Free host memory
|
||||
HIP_CHECK(hipHostFree(size_host));
|
||||
HIP_CHECK(hipHostFree(thd_rank_host));
|
||||
HIP_CHECK(hipHostFree(is_valid_host));
|
||||
HIP_CHECK(hipHostFree(sync_host));
|
||||
}
|
||||
|
||||
/**
 * Runs the grid-group checks on the current device for each kernel variant: first with every
 * power-of-two block size up to the device limit, then with ten pseudo-random block sizes.
 */
TEST_CASE("Unit_hipCGGridGroupType_Basic") {
  // Validate against whichever device is currently selected.
  int device;
  HIP_CHECK(hipGetDevice(&device));
  hipDeviceProp_t device_properties;
  HIP_CHECK(hipGetDeviceProperties(&device_properties, device));

  if (!device_properties.cooperativeLaunch) {
    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
    return;
  }

  // Each SECTION below selects a different kernel variant.
  using KernelPtr = void* (*)();
  KernelPtr kernel_func;

  SECTION("Default grid group API test") {
    kernel_func = reinterpret_cast<KernelPtr>(kernel_cg_grid_group_type);
  }
#if HT_AMD
  SECTION("Base type grid group API test") {
    kernel_func = reinterpret_cast<KernelPtr>(kernel_cg_grid_group_type_via_base_type);
  }
#endif

  SECTION("Public API grid group test") {
    kernel_func = reinterpret_cast<KernelPtr>(kernel_cg_grid_group_type_via_public_api);
  }

  // Power-of-two block sizes: 2, 4, 8, ... up to the device maximum.
  int max_threads_per_blk = device_properties.maxThreadsPerBlock;
  for (int block_size = 2; block_size <= max_threads_per_blk; block_size *= 2) {
    test_cg_grid_group_type(kernel_func, block_size);
  }

  // Pseudo-random block sizes; the fixed seed makes the sequence identical on every run.
  srand(0);
  for (int trial = 0; trial < 10; ++trial) {
    // Clamp to at least 2: the test fails with only 1 thread per block.
    const int random_block_size = max(2, rand() % max_threads_per_blk);
    test_cg_grid_group_type(kernel_func, random_block_size);
  }
}
|
||||
|
||||
/**
 * For every device, loop count and buffer width: launches `coop_kernel` cooperatively on two
 * device buffers and compares the results against the host reference computed by
 * verify_coop_buffers.
 */
TEST_CASE("Unit_hipCGGridGroupType_DataSharing") {
  // Repeat the test for every visible device.
  const auto device = GENERATE(range(0, HipTest::getDeviceCount()));
  HIP_CHECK(hipSetDevice(device));

  hipDeviceProp_t device_properties;
  HIP_CHECK(hipGetDeviceProperties(&device_properties, device));
  if (!device_properties.cooperativeLaunch) {
    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
    return;
  }

  // Not const: the addresses of both values are passed to the kernel as launch arguments.
  int loops = GENERATE(1, 2, 3, 4);
  int width = GENERATE(512, 1024, 2048, 4096);

  // One warp per block; the occupancy query decides how many blocks to launch so the whole GPU
  // is filled.
  int warp_size = device_properties.warpSize;
  int num_sms = device_properties.multiProcessorCount;
  int max_blocks_per_sm;
  HIP_CHECK(
      hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, coop_kernel, warp_size, 0));
  int num_blocks = max_blocks_per_sm * num_sms;

  hipStream_t stream;
  HIP_CHECK(hipStreamCreate(&stream));

  // Size in bytes shared by every host and device buffer in this test.
  const size_t buffer_bytes = sizeof(unsigned int) * width;

  // Host input buffer initialised to 0, 1, ..., width - 1.
  unsigned int* input_buffer = reinterpret_cast<unsigned int*>(malloc(buffer_bytes));
  for (int idx = 0; idx < width; ++idx) {
    input_buffer[idx] = idx;
  }

  // First device buffer starts as a copy of the input.
  unsigned int* host_mem_1 = reinterpret_cast<unsigned int*>(malloc(buffer_bytes));
  unsigned int* dev_mem_1;
  HIP_CHECK(hipMalloc(&dev_mem_1, buffer_bytes));
  HIP_CHECK(hipMemcpyAsync(dev_mem_1, input_buffer, buffer_bytes, hipMemcpyHostToDevice, stream));

  // Second device buffer starts zeroed.
  unsigned int* host_mem_2 = reinterpret_cast<unsigned int*>(malloc(buffer_bytes));
  unsigned int* dev_mem_2;
  HIP_CHECK(hipMalloc(&dev_mem_2, buffer_bytes));
  HIP_CHECK(hipMemsetAsync(dev_mem_2, 0, buffer_bytes, stream));

  INFO("Launching a cooperative kernel with " << num_blocks << " blocks, each with " << warp_size
                                              << " threads");

  // Pointers to the individual kernel arguments, in launch order.
  void* coop_params[4] = {&dev_mem_1, &dev_mem_2, &loops, &width};
  HIP_CHECK(hipLaunchCooperativeKernel(coop_kernel, num_blocks, warp_size, coop_params, 0, stream));

  // Read both buffers back and wait until the stream has drained.
  HIP_CHECK(hipMemcpyAsync(host_mem_1, dev_mem_1, buffer_bytes, hipMemcpyDeviceToHost, stream));
  HIP_CHECK(hipMemcpyAsync(host_mem_2, dev_mem_2, buffer_bytes, hipMemcpyDeviceToHost, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  verify_coop_buffers(input_buffer, host_mem_1, host_mem_2, loops, width);

  // Tear down.
  HIP_CHECK(hipStreamDestroy(stream));
  HIP_CHECK(hipFree(dev_mem_1));
  HIP_CHECK(hipFree(dev_mem_2));
  free(input_buffer);
  free(host_mem_1);
  free(host_mem_2);
}
|
||||
|
||||
/**
 * For every device, loop count and warp count: launches the barrier test kernel cooperatively
 * with one warp per block and validates the values it recorded with verify_barrier_buffer.
 */
TEST_CASE("Unit_hipCGGridGroupType_Barrier") {
  // Repeat the test for every visible device.
  const auto device = GENERATE(range(0, HipTest::getDeviceCount()));
  HIP_CHECK(hipSetDevice(device));

  hipDeviceProp_t device_properties;
  HIP_CHECK(hipGetDeviceProperties(&device_properties, device));
  if (!device_properties.cooperativeLaunch) {
    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
    return;
  }

  // `loops` is not const: its address is passed to the kernel as a launch argument.
  uint32_t loops = GENERATE(1, 2, 3, 4);
  uint32_t warps = GENERATE(4, 8, 16, 32);
  uint32_t block_size = 1;  // warps per block

  int warp_size = device_properties.warpSize;
  int num_sms = device_properties.multiProcessorCount;
  int num_threads_in_block = block_size * warp_size;

  // gfx11 devices use a dedicated kernel variant.
  auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;

  // Ask the runtime how many blocks of this size fit on one multiprocessor.
  int max_blocks_per_sm;
  HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, test_kernel_used,
                                                         num_threads_in_block, 0));

  // Refuse to launch more blocks than the occupancy query allows across all multiprocessors.
  int requested_blocks = warps / block_size;
  const int max_resident_blocks = max_blocks_per_sm * num_sms;
  if (requested_blocks > max_resident_blocks) {
    INFO("Too many blocks requested!");
    REQUIRE(false);
  }

  // Each block will output a single value per loop.
  uint32_t total_buffer_len = requested_blocks * loops;
  const size_t buffer_bytes = sizeof(unsigned int) * total_buffer_len;

  // Zeroed host buffer: it seeds the device output buffer and later receives the results.
  unsigned int* host_buffer =
      reinterpret_cast<unsigned int*>(calloc(total_buffer_len, sizeof(unsigned int)));

  unsigned int* kernel_buffer;
  HIP_CHECK(hipMalloc(&kernel_buffer, buffer_bytes));
  HIP_CHECK(hipMemcpy(kernel_buffer, host_buffer, buffer_bytes, hipMemcpyHostToDevice));

  // Single device-side counter, zeroed before the launch.
  unsigned int* kernel_atomic;
  HIP_CHECK(hipMalloc(&kernel_atomic, sizeof(unsigned int)));
  HIP_CHECK(hipMemset(kernel_atomic, 0, sizeof(unsigned int)));

  INFO("Launching a cooperative kernel with " << warps << " warps in " << requested_blocks
                                              << " thread blocks");

  // Pointers to the individual kernel arguments, in launch order.
  void* params[3] = {&kernel_atomic, &kernel_buffer, &loops};
  HIP_CHECK(hipLaunchCooperativeKernel(test_kernel_used, requested_blocks, num_threads_in_block,
                                       params, 0, 0));

  // Read the recorded values back to the host and validate them.
  HIP_CHECK(hipMemcpy(host_buffer, kernel_buffer, buffer_bytes, hipMemcpyDeviceToHost));
  verify_barrier_buffer(loops, requested_blocks, host_buffer);

  // Tear down.
  HIP_CHECK(hipFree(kernel_buffer));
  HIP_CHECK(hipFree(kernel_atomic));
  free(host_buffer);
}
|
||||
@@ -1,240 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
|
||||
// Comparison assertions built on HIPASSERT. Both operands are parenthesized so that arguments
// containing lower-precedence operators (for example `a & b`) are compared as a whole.
#define ASSERT_EQUAL(lhs, rhs) HIPASSERT((lhs) == (rhs))
#define ASSERT_LE(lhs, rhs) HIPASSERT((lhs) <= (rhs))
#define ASSERT_GE(lhs, rhs) HIPASSERT((lhs) >= (rhs))
|
||||
|
||||
using namespace cooperative_groups;
|
||||
// Maximum number of GPUs exercised by this test; the fixed-size per-device arrays below are
// sized by it and the detected device count is clamped to it.
constexpr int MaxGPUs = 8;
|
||||
|
||||
/**
 * Kernel exercising the multi_grid_group member API.
 *
 * Every thread records num_grids(), grid_rank(), size(), thread_rank() and is_valid() at its
 * grid-local thread index, then takes part in a two-stage reduction that checks sync():
 *   1. after a grid-level sync, thread 0 of block 0 sums the per-thread ones of its grid and
 *      stores the sum in syncResultD[grid_rank + 1];
 *   2. after a multi-grid sync, the same thread of grid 0 sums all per-grid results into
 *      syncResultD[0].
 */
static __global__ void kernel_cg_multi_grid_group_type(int* numGridsTestD, int* gridRankTestD,
                                                       int* sizeTestD, int* thdRankTestD,
                                                       int* isValidTestD, int* syncTestD,
                                                       int* syncResultD) {
  multi_grid_group mg = this_multi_grid();
  const int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
  // True only for thread 0 of block 0, which performs the reductions for its grid.
  const bool leads_grid = (blockIdx.x == 0) && (threadIdx.x == 0);

  // Record the group properties observed by this thread.
  numGridsTestD[tid] = mg.num_grids();
  gridRankTestD[tid] = mg.grid_rank();
  sizeTestD[tid] = mg.size();
  thdRankTestD[tid] = mg.thread_rank();
  isValidTestD[tid] = mg.is_valid();

  // Sync test: each thread contributes a 1 at its own slot.
  syncTestD[tid] = 1;
  // Grid-level sync before the leader reads the other threads' slots.
  this_grid().sync();

  if (leads_grid) {
    // Grid-level reduction into slot 0.
    const uint threads_in_grid = gridDim.x * blockDim.x;
    uint i = 1;
    while (i < threads_in_grid) {
      syncTestD[0] += syncTestD[i];
      ++i;
    }
    syncResultD[mg.grid_rank() + 1] = syncTestD[0];
  }

  // Multi-grid sync before grid 0 reads the per-grid results.
  mg.sync();

  if (leads_grid && mg.grid_rank() == 0) {
    // Final reduction across all grids (GPUs).
    syncResultD[0] = 0;
    uint i = 1;
    while (i <= mg.num_grids()) {
      syncResultD[0] += syncResultD[i];
      ++i;
    }
  }
}
|
||||
|
||||
static void test_cg_multi_grid_group_type(int blockSize, int nGpu)
|
||||
{
|
||||
// Create a stream each device
|
||||
hipStream_t stream[MaxGPUs];
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
HIPCHECK(hipDeviceSynchronize()); // Make sure work is done on this device
|
||||
HIPCHECK(hipStreamCreate(&stream[i]));
|
||||
}
|
||||
|
||||
// Allocate host and device memory
|
||||
int nBytes = sizeof(int) * 2 * blockSize;
|
||||
int *numGridsTestD[MaxGPUs], *numGridsTestH[MaxGPUs];
|
||||
int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs];
|
||||
int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs];
|
||||
int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs];
|
||||
int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs];
|
||||
int *syncTestD[MaxGPUs], *syncResultD;
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
|
||||
HIPCHECK(hipMalloc(&numGridsTestD[i], nBytes));
|
||||
HIPCHECK(hipMalloc(&gridRankTestD[i], nBytes));
|
||||
HIPCHECK(hipMalloc(&sizeTestD[i], nBytes));
|
||||
HIPCHECK(hipMalloc(&thdRankTestD[i], nBytes));
|
||||
HIPCHECK(hipMalloc(&isValidTestD[i], nBytes));
|
||||
HIPCHECK(hipMalloc(&syncTestD[i], nBytes));
|
||||
|
||||
HIPCHECK(hipHostMalloc(&numGridsTestH[i], nBytes));
|
||||
HIPCHECK(hipHostMalloc(&gridRankTestH[i], nBytes));
|
||||
HIPCHECK(hipHostMalloc(&sizeTestH[i], nBytes));
|
||||
HIPCHECK(hipHostMalloc(&thdRankTestH[i], nBytes));
|
||||
HIPCHECK(hipHostMalloc(&isValidTestH[i], nBytes));
|
||||
|
||||
if (i == 0) {
|
||||
HIPCHECK(hipHostMalloc(&syncResultD, sizeof(int) * (nGpu + 1), hipHostMallocCoherent));
|
||||
}
|
||||
}
|
||||
|
||||
// Launch Kernel
|
||||
constexpr int NumKernelArgs = 7;
|
||||
hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu];
|
||||
void* args[MaxGPUs * NumKernelArgs];
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
|
||||
args[i * NumKernelArgs] = &numGridsTestD[i];
|
||||
args[i * NumKernelArgs + 1] = &gridRankTestD[i];
|
||||
args[i * NumKernelArgs + 2] = &sizeTestD[i];
|
||||
args[i * NumKernelArgs + 3] = &thdRankTestD[i];
|
||||
args[i * NumKernelArgs + 4] = &isValidTestD[i];
|
||||
args[i * NumKernelArgs + 5] = &syncTestD[i];
|
||||
args[i * NumKernelArgs + 6] = &syncResultD;
|
||||
|
||||
launchParamsList[i].func = reinterpret_cast<void*>(kernel_cg_multi_grid_group_type);
|
||||
launchParamsList[i].gridDim = 2;
|
||||
launchParamsList[i].blockDim = blockSize;
|
||||
launchParamsList[i].sharedMem = 0;
|
||||
launchParamsList[i].stream = stream[i];
|
||||
launchParamsList[i].args = &args[i * NumKernelArgs];
|
||||
}
|
||||
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(launchParamsList, nGpu, 0));
|
||||
|
||||
// Copy result from device to host
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
HIPCHECK(hipMemcpy(numGridsTestH[i], numGridsTestD[i], nBytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost));
|
||||
}
|
||||
|
||||
// Validate results
|
||||
int gridsSeen[MaxGPUs];
|
||||
for (int i = 0; i < nGpu; ++i) {
|
||||
for (int j = 0; j < 2 * blockSize; ++j) {
|
||||
ASSERT_EQUAL(numGridsTestH[i][j], nGpu);
|
||||
ASSERT_GE(gridRankTestH[i][j], 0);
|
||||
ASSERT_LE(gridRankTestH[i][j], nGpu-1);
|
||||
ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]);
|
||||
ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize);
|
||||
int gridRank = gridRankTestH[i][j];
|
||||
ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j);
|
||||
ASSERT_EQUAL(isValidTestH[i][j], 1);
|
||||
}
|
||||
ASSERT_EQUAL(syncResultD[i+1], 2 * blockSize);
|
||||
|
||||
// Validate uniqueness property of grid rank
|
||||
gridsSeen[i] = gridRankTestH[i][0];
|
||||
for (int k = 0; k < i; ++k) {
|
||||
if (gridsSeen[k] == gridsSeen[i]) {
|
||||
assert(false && "Grid rank in multi-gpu setup should be unique");
|
||||
}
|
||||
}
|
||||
}
|
||||
ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize);
|
||||
|
||||
// Free host and device memory
|
||||
delete [] launchParamsList;
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
|
||||
HIPCHECK(hipFree(numGridsTestD[i]));
|
||||
HIPCHECK(hipFree(gridRankTestD[i]));
|
||||
HIPCHECK(hipFree(sizeTestD[i]));
|
||||
HIPCHECK(hipFree(thdRankTestD[i]));
|
||||
HIPCHECK(hipFree(isValidTestD[i]));
|
||||
HIPCHECK(hipFree(syncTestD[i]));
|
||||
|
||||
if (i == 0) {
|
||||
HIPCHECK(hipHostFree(syncResultD));
|
||||
}
|
||||
HIPCHECK(hipHostFree(numGridsTestH[i]));
|
||||
HIPCHECK(hipHostFree(gridRankTestH[i]));
|
||||
HIPCHECK(hipHostFree(sizeTestH[i]));
|
||||
HIPCHECK(hipHostFree(thdRankTestH[i]));
|
||||
HIPCHECK(hipHostFree(isValidTestH[i]));
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Runs the multi-grid group checks across up to MaxGPUs devices: first with every power-of-two
 * block size up to the smallest per-device limit, then with ten pseudo-random block sizes.
 * The test is skipped if any device lacks multi-device cooperative launch support.
 */
TEST_CASE("Unit_hipCGMultiGridGroupType") {
  int nGpu = 0;
  HIPCHECK(hipGetDeviceCount(&nGpu));
  nGpu = min(nGpu, MaxGPUs);

  // The block size must be valid on every device, so take the minimum limit over all of them.
  int maxThreadsPerBlock = INT_MAX;
  for (int dev = 0; dev < nGpu; ++dev) {
    hipDeviceProp_t deviceProperties;
    HIPCHECK(hipGetDeviceProperties(&deviceProperties, dev));
    if (!deviceProperties.cooperativeMultiDeviceLaunch) {
      HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
      return;
    }
    maxThreadsPerBlock = min(maxThreadsPerBlock, deviceProperties.maxThreadsPerBlock);
  }

  // Power-of-two block sizes: 2, 4, 8, ... up to the common maximum.
  for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize *= 2) {
    test_cg_multi_grid_group_type(blockSize, nGpu);
  }

  // Pseudo-random block sizes; the fixed seed makes the sequence identical on every run.
  srand(0);
  for (int trial = 0; trial < 10; ++trial) {
    // Clamp to at least 2 so a zero-thread block is never requested.
    const int randomBlockSize = max(2, rand() % maxThreadsPerBlock);
    test_cg_multi_grid_group_type(randomBlockSize, nGpu);
  }
}
|
||||
@@ -1,234 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -D_CG_ABI_EXPERIMENTAL -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <climits>
|
||||
|
||||
// Comparison assertions built on HIPASSERT. Both operands are parenthesized so that arguments
// containing lower-precedence operators (for example `a & b`) are compared as a whole.
#define ASSERT_EQUAL(lhs, rhs) HIPASSERT((lhs) == (rhs))
#define ASSERT_LE(lhs, rhs) HIPASSERT((lhs) <= (rhs))
#define ASSERT_GE(lhs, rhs) HIPASSERT((lhs) >= (rhs))
|
||||
|
||||
using namespace cooperative_groups;
|
||||
// Maximum number of GPUs exercised by this test; the fixed-size per-device arrays below are
// sized by it and the detected device count is clamped to it.
constexpr int MaxGPUs = 8;
|
||||
|
||||
/**
 * Kernel exercising a multi-grid group through the thread_group base type.
 *
 * Every thread records tg.size(), the multi-grid grid rank, tg.thread_rank() and a validity
 * flag at its grid-local thread index, then takes part in a two-stage reduction that checks
 * sync():
 *   1. after a grid-level sync, thread 0 of block 0 sums the per-thread ones of its grid and
 *      stores the sum in syncResultD[grid_rank + 1];
 *   2. after tg.sync(), the same thread of grid 0 sums all per-grid results into
 *      syncResultD[0].
 */
static __global__ void kernel_cg_multi_grid_group_type_via_base_type(int* sizeTestD,
                                                                     int* gridRankTestD,
                                                                     int* thdRankTestD,
                                                                     int* isValidTestD,
                                                                     int* syncTestD,
                                                                     int* syncResultD) {
  thread_group tg = this_multi_grid();  // This can work if _CG_ABI_EXPERIMENTAL defined on Cuda

  const int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
  // True only for thread 0 of block 0, which performs the reductions for its grid.
  const bool leads_grid = (blockIdx.x == 0) && (threadIdx.x == 0);

  // Group size as seen through the base type.
  sizeTestD[tid] = tg.size();

  // Grid rank (only available on the multi-grid type) and thread rank through the base type.
  gridRankTestD[tid] = this_multi_grid().grid_rank();
  thdRankTestD[tid] = tg.thread_rank();

  // Validity flag.
#ifdef __HIP_PLATFORM_AMD__
  isValidTestD[tid] = tg.is_valid();
#else
  // Cuda has no thread_group.is_valid()
  isValidTestD[tid] = true;
#endif

  // Sync test: each thread contributes a 1 at its own slot.
  syncTestD[tid] = 1;
  // Grid-level sync before the leader reads the other threads' slots.
  this_grid().sync();

  if (leads_grid) {
    // Grid-level reduction into slot 0.
    const uint threads_in_grid = gridDim.x * blockDim.x;
    uint i = 1;
    while (i < threads_in_grid) {
      syncTestD[0] += syncTestD[i];
      ++i;
    }
    syncResultD[this_multi_grid().grid_rank() + 1] = syncTestD[0];
  }

  // Multi-grid level sync through the base type.
  tg.sync();

  if (leads_grid && this_multi_grid().grid_rank() == 0) {
    // Final reduction across all grids (GPUs).
    syncResultD[0] = 0;
    uint i = 1;
    while (i <= this_multi_grid().num_grids()) {
      syncResultD[0] += syncResultD[i];
      ++i;
    }
  }
}
|
||||
|
||||
static void test_cg_multi_grid_group_type_via_base_type(int blockSize, int nGpu)
|
||||
{
|
||||
// Create a stream each device
|
||||
hipStream_t stream[MaxGPUs];
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
HIPCHECK(hipDeviceSynchronize()); // Make sure work is done on this device
|
||||
HIPCHECK(hipStreamCreate(&stream[i]));
|
||||
}
|
||||
|
||||
// Allocate host and device memory
|
||||
int nBytes = sizeof(int) * 2 * blockSize;
|
||||
int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs];
|
||||
int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs];
|
||||
int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs];
|
||||
int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs];
|
||||
int *syncTestD[MaxGPUs], *syncResultD;
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
|
||||
HIPCHECK(hipMalloc(&sizeTestD[i], nBytes));
|
||||
HIPCHECK(hipMalloc(&gridRankTestD[i], nBytes));
|
||||
HIPCHECK(hipMalloc(&thdRankTestD[i], nBytes));
|
||||
HIPCHECK(hipMalloc(&isValidTestD[i], nBytes));
|
||||
HIPCHECK(hipMalloc(&syncTestD[i], nBytes));
|
||||
|
||||
HIPCHECK(hipHostMalloc(&sizeTestH[i], nBytes));
|
||||
HIPCHECK(hipHostMalloc(&gridRankTestH[i], nBytes));
|
||||
HIPCHECK(hipHostMalloc(&thdRankTestH[i], nBytes));
|
||||
HIPCHECK(hipHostMalloc(&isValidTestH[i], nBytes));
|
||||
|
||||
if (i == 0) {
|
||||
HIPCHECK(hipHostMalloc(&syncResultD, sizeof(int) * (nGpu + 1), hipHostMallocCoherent));
|
||||
}
|
||||
}
|
||||
|
||||
// Launch Kernel
|
||||
constexpr int NumKernelArgs = 6;
|
||||
hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu];
|
||||
void* args[MaxGPUs * NumKernelArgs];
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
|
||||
args[i * NumKernelArgs ] = &sizeTestD[i];
|
||||
args[i * NumKernelArgs + 1] = &gridRankTestD[i];
|
||||
args[i * NumKernelArgs + 2] = &thdRankTestD[i];
|
||||
args[i * NumKernelArgs + 3] = &isValidTestD[i];
|
||||
args[i * NumKernelArgs + 4] = &syncTestD[i];
|
||||
args[i * NumKernelArgs + 5] = &syncResultD;
|
||||
|
||||
launchParamsList[i].func = reinterpret_cast<void*>(kernel_cg_multi_grid_group_type_via_base_type);
|
||||
launchParamsList[i].gridDim = 2;
|
||||
launchParamsList[i].blockDim = blockSize;
|
||||
launchParamsList[i].sharedMem = 0;
|
||||
launchParamsList[i].stream = stream[i];
|
||||
launchParamsList[i].args = &args[i * NumKernelArgs];
|
||||
}
|
||||
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(launchParamsList, nGpu, 0));
|
||||
|
||||
// Copy result from device to host
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
HIPCHECK(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost));
|
||||
}
|
||||
|
||||
// Validate results
|
||||
int gridsSeen[MaxGPUs];
|
||||
for (int i = 0; i < nGpu; ++i) {
|
||||
for (int j = 0; j < 2 * blockSize; ++j) {
|
||||
ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize);
|
||||
ASSERT_GE(gridRankTestH[i][j], 0);
|
||||
ASSERT_LE(gridRankTestH[i][j], nGpu-1);
|
||||
ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]);
|
||||
int gridRank = gridRankTestH[i][j];
|
||||
ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j);
|
||||
ASSERT_EQUAL(isValidTestH[i][j], 1);
|
||||
}
|
||||
ASSERT_EQUAL(syncResultD[i+1], 2 * blockSize);
|
||||
|
||||
// Validate uniqueness property of grid rank
|
||||
gridsSeen[i] = gridRankTestH[i][0];
|
||||
for (int k = 0; k < i; ++k) {
|
||||
if (gridsSeen[k] == gridsSeen[i]) {
|
||||
assert (false && "Grid rank in multi-gpu setup should be unique");
|
||||
}
|
||||
}
|
||||
}
|
||||
ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize);
|
||||
|
||||
// Free host and device memory
|
||||
delete [] launchParamsList;
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
|
||||
HIPCHECK(hipFree(sizeTestD[i]));
|
||||
HIPCHECK(hipFree(gridRankTestD[i]));
|
||||
HIPCHECK(hipFree(thdRankTestD[i]));
|
||||
HIPCHECK(hipFree(isValidTestD[i]));
|
||||
HIPCHECK(hipFree(syncTestD[i]));
|
||||
|
||||
if (i == 0)
|
||||
HIPCHECK(hipHostFree(syncResultD));
|
||||
|
||||
HIPCHECK(hipHostFree(sizeTestH[i]));
|
||||
HIPCHECK(hipHostFree(gridRankTestH[i]));
|
||||
HIPCHECK(hipHostFree(thdRankTestH[i]));
|
||||
HIPCHECK(hipHostFree(isValidTestH[i]));
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Runs the base-type multi-grid group checks across up to MaxGPUs devices: first with every
 * power-of-two block size up to the smallest per-device limit, then with ten pseudo-random
 * block sizes. The test is skipped if any device lacks multi-device cooperative launch support.
 */
TEST_CASE("Unit_hipCGMultiGridGroupType_BaseType") {
  int nGpu = 0;
  HIPCHECK(hipGetDeviceCount(&nGpu));
  nGpu = min(nGpu, MaxGPUs);

  // The block size must be valid on every device, so take the minimum limit over all of them.
  int maxThreadsPerBlock = INT_MAX;
  for (int dev = 0; dev < nGpu; ++dev) {
    hipDeviceProp_t deviceProperties;
    HIPCHECK(hipGetDeviceProperties(&deviceProperties, dev));
    if (!deviceProperties.cooperativeMultiDeviceLaunch) {
      HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
      return;
    }
    maxThreadsPerBlock = min(maxThreadsPerBlock, deviceProperties.maxThreadsPerBlock);
  }

  // Power-of-two block sizes: 2, 4, 8, ... up to the common maximum.
  for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize *= 2) {
    test_cg_multi_grid_group_type_via_base_type(blockSize, nGpu);
  }

  // Pseudo-random block sizes; the fixed seed makes the sequence identical on every run.
  srand(0);
  for (int trial = 0; trial < 10; ++trial) {
    // Clamp to at least 2 so a zero-thread block is never requested.
    const int randomBlockSize = max(2, rand() % maxThreadsPerBlock);
    test_cg_multi_grid_group_type_via_base_type(randomBlockSize, nGpu);
  }
}
|
||||
@@ -1,230 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <climits>
|
||||
|
||||
// Comparison assertions built on HIPASSERT. Both operands are parenthesized so that arguments
// containing lower-precedence operators (for example `a & b`) are compared as a whole.
#define ASSERT_EQUAL(lhs, rhs) HIPASSERT((lhs) == (rhs))
#define ASSERT_LE(lhs, rhs) HIPASSERT((lhs) <= (rhs))
#define ASSERT_GE(lhs, rhs) HIPASSERT((lhs) >= (rhs))
|
||||
|
||||
using namespace cooperative_groups;
|
||||
// Maximum number of GPUs exercised by this test; the fixed-size per-device arrays below are
// sized by it.
constexpr int MaxGPUs = 8;
|
||||
|
||||
/**
 * Kernel exercising a multi-grid group through the free-function ("public") API.
 *
 * Every thread records group_size(mg), the grid rank, thread_rank(mg) and mg.is_valid() at its
 * grid-local thread index, then takes part in a two-stage reduction that checks sync():
 *   1. after sync(this_grid()), thread 0 of block 0 sums the per-thread ones of its grid and
 *      stores the sum in syncResultD[grid_rank + 1];
 *   2. after sync(mg), the same thread of grid 0 sums all per-grid results into
 *      syncResultD[0].
 */
static __global__ void kernel_cg_multi_grid_group_type_via_public_api(int* sizeTestD,
                                                                      int* gridRankTestD,
                                                                      int* thdRankTestD,
                                                                      int* isValidTestD,
                                                                      int* syncTestD,
                                                                      int* syncResultD) {
  multi_grid_group mg = this_multi_grid();
  const int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
  // True only for thread 0 of block 0, which performs the reductions for its grid.
  const bool leads_grid = (blockIdx.x == 0) && (threadIdx.x == 0);

  // group_size() free function.
  sizeTestD[tid] = group_size(mg);

  // Grid rank, and the thread_rank() free function.
  gridRankTestD[tid] = this_multi_grid().grid_rank();
  thdRankTestD[tid] = thread_rank(mg);

  // Validity flag.
  isValidTestD[tid] = mg.is_valid();

  // Sync test: each thread contributes a 1 at its own slot.
  syncTestD[tid] = 1;
  // Grid-level sync before the leader reads the other threads' slots.
  sync(this_grid());

  if (leads_grid) {
    // Grid-level reduction into slot 0.
    const uint threads_in_grid = gridDim.x * blockDim.x;
    uint i = 1;
    while (i < threads_in_grid) {
      syncTestD[0] += syncTestD[i];
      ++i;
    }
    syncResultD[this_multi_grid().grid_rank() + 1] = syncTestD[0];
  }

  // Multi-grid level sync via the public API.
  sync(mg);

  if (leads_grid && this_multi_grid().grid_rank() == 0) {
    // Final reduction across all grids (GPUs).
    syncResultD[0] = 0;
    uint i = 1;
    while (i <= this_multi_grid().num_grids()) {
      syncResultD[0] += syncResultD[i];
      ++i;
    }
  }
}
|
||||
|
||||
static void test_cg_multi_grid_group_type_via_public_api(int blockSize, int nGpu)
|
||||
{
|
||||
// Create a stream each device
|
||||
hipStream_t stream[MaxGPUs];
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
HIPCHECK(hipDeviceSynchronize()); // Make sure work is done on this device
|
||||
HIPCHECK(hipStreamCreate(&stream[i]));
|
||||
}
|
||||
|
||||
// Allocate host and device memory
|
||||
int nBytes = sizeof(int) * 2 * blockSize;
|
||||
int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs];
|
||||
int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs];
|
||||
int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs];
|
||||
int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs];
|
||||
int *syncTestD[MaxGPUs], *syncResultD;
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
|
||||
HIPCHECK(hipMalloc(&sizeTestD[i], nBytes));
|
||||
HIPCHECK(hipMalloc(&gridRankTestD[i], nBytes));
|
||||
HIPCHECK(hipMalloc(&thdRankTestD[i], nBytes));
|
||||
HIPCHECK(hipMalloc(&isValidTestD[i], nBytes));
|
||||
HIPCHECK(hipMalloc(&syncTestD[i], nBytes));
|
||||
|
||||
HIPCHECK(hipHostMalloc(&sizeTestH[i], nBytes));
|
||||
HIPCHECK(hipHostMalloc(&gridRankTestH[i], nBytes));
|
||||
HIPCHECK(hipHostMalloc(&thdRankTestH[i], nBytes));
|
||||
HIPCHECK(hipHostMalloc(&isValidTestH[i], nBytes));
|
||||
|
||||
if (i == 0) {
|
||||
HIPCHECK(hipHostMalloc(&syncResultD, sizeof(int) * (nGpu + 1), hipHostMallocCoherent));
|
||||
}
|
||||
}
|
||||
|
||||
// Launch Kernel
|
||||
constexpr int NumKernelArgs = 6;
|
||||
hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu];
|
||||
void* args[MaxGPUs * NumKernelArgs];
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
|
||||
args[i * NumKernelArgs ] = &sizeTestD[i];
|
||||
args[i * NumKernelArgs + 1] = &gridRankTestD[i];
|
||||
args[i * NumKernelArgs + 2] = &thdRankTestD[i];
|
||||
args[i * NumKernelArgs + 3] = &isValidTestD[i];
|
||||
args[i * NumKernelArgs + 4] = &syncTestD[i];
|
||||
args[i * NumKernelArgs + 5] = &syncResultD;
|
||||
|
||||
launchParamsList[i].func = reinterpret_cast<void*>(kernel_cg_multi_grid_group_type_via_public_api);
|
||||
launchParamsList[i].gridDim = 2;
|
||||
launchParamsList[i].blockDim = blockSize;
|
||||
launchParamsList[i].sharedMem = 0;
|
||||
launchParamsList[i].stream = stream[i];
|
||||
launchParamsList[i].args = &args[i * NumKernelArgs];
|
||||
}
|
||||
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(launchParamsList, nGpu, 0));
|
||||
|
||||
// Copy result from device to host
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
|
||||
HIPCHECK(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost));
|
||||
}
|
||||
|
||||
// Validate results
|
||||
int gridsSeen[MaxGPUs];
|
||||
for (int i = 0; i < nGpu; ++i) {
|
||||
for (int j = 0; j < 2 * blockSize; ++j) {
|
||||
ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize);
|
||||
ASSERT_GE(gridRankTestH[i][j], 0);
|
||||
ASSERT_LE(gridRankTestH[i][j], nGpu-1);
|
||||
ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]);
|
||||
int gridRank = gridRankTestH[i][j];
|
||||
ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j);
|
||||
ASSERT_EQUAL(isValidTestH[i][j], 1);
|
||||
}
|
||||
ASSERT_EQUAL(syncResultD[i+1], 2 * blockSize);
|
||||
|
||||
// Validate uniqueness property of grid rank
|
||||
gridsSeen[i] = gridRankTestH[i][0];
|
||||
for (int k = 0; k < i; ++k) {
|
||||
if (gridsSeen[k] == gridsSeen[i]) {
|
||||
assert (false && "Grid rank in multi-gpu setup should be unique");
|
||||
}
|
||||
}
|
||||
}
|
||||
ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize);
|
||||
|
||||
// Free host and device memory
|
||||
delete [] launchParamsList;
|
||||
for (int i = 0; i < nGpu; i++) {
|
||||
HIPCHECK(hipSetDevice(i));
|
||||
|
||||
HIPCHECK(hipFree(sizeTestD[i]));
|
||||
HIPCHECK(hipFree(gridRankTestD[i]));
|
||||
HIPCHECK(hipFree(thdRankTestD[i]));
|
||||
HIPCHECK(hipFree(isValidTestD[i]));
|
||||
HIPCHECK(hipFree(syncTestD[i]));
|
||||
|
||||
if (i == 0)
|
||||
HIPCHECK(hipHostFree(syncResultD));
|
||||
|
||||
HIPCHECK(hipHostFree(sizeTestH[i]));
|
||||
HIPCHECK(hipHostFree(gridRankTestH[i]));
|
||||
HIPCHECK(hipHostFree(thdRankTestH[i]));
|
||||
HIPCHECK(hipHostFree(isValidTestH[i]));
|
||||
}
|
||||
}
|
||||
|
||||
// Runs the public-api multi-grid-group check on up to MaxGPUs devices for a
// range of block sizes. The test is skipped as soon as one device reports no
// support for cooperative multi-device launches.
TEST_CASE("Unit_hipCGMultiGridGroupType_PublicApi") {
  int deviceCount = 0;
  HIPCHECK(hipGetDeviceCount(&deviceCount));
  deviceCount = min(deviceCount, MaxGPUs);

  // Largest block size usable on every participating device
  int blockLimit = INT_MAX;
  for (int dev = 0; dev < deviceCount; ++dev) {
    hipDeviceProp_t props;
    HIPCHECK(hipGetDeviceProperties(&props, dev));
    if (!props.cooperativeMultiDeviceLaunch) {
      HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
      return;
    }
    blockLimit = min(blockLimit, props.maxThreadsPerBlock);
  }

  // Block sizes that are powers of two: 2, 4, 8, ... up to the limit
  int blockSize = 2;
  while (blockSize <= blockLimit) {
    test_cg_multi_grid_group_type_via_public_api(blockSize, deviceCount);
    blockSize = blockSize * 2;
  }

  // Ten pseudo-random block sizes; the fixed seed keeps the sequence the same
  // on every execution
  srand(0);
  for (int remaining = 10; remaining > 0; --remaining) {
    // Clamp to at least 2 because the test fails for 0 threads per block
    const int candidate = rand() % blockLimit;
    test_cg_multi_grid_group_type_via_public_api(max(2, candidate), deviceCount);
  }
}
|
||||
@@ -0,0 +1,638 @@
|
||||
/*
|
||||
Copyright (c) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
|
||||
#include "hip_cg_common.hh"
|
||||
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
// Kernel exercising the member-function API of cg::multi_grid_group:
// num_grids(), grid_rank(), size(), thread_rank(), is_valid() and sync().
//
// Every thread writes what it observes at its grid-local index of the *_dev
// buffers. For the sync check each thread stores 1 in sync_dev; after a
// grid-level sync the first thread of the grid sums them into
// sync_result[grid_rank + 1]; after the multi-grid sync the first thread of
// grid 0 sums the per-grid totals into sync_result[0].
static __global__ void kernel_cg_multi_grid_group_type(int* grid_rank_dev, int* size_dev,
                                                       int* thd_rank_dev, int* is_valid_dev,
                                                       int* sync_dev, int* sync_result,
                                                       int* num_grids_dev) {
  cg::multi_grid_group multi_grid = cg::this_multi_grid();
  const int slot = threadIdx.x + (blockDim.x * blockIdx.x);
  // True only for thread 0 of block 0 of the current grid
  const bool leads_grid = (threadIdx.x == 0) && (blockIdx.x == 0);

  // num_grids
  num_grids_dev[slot] = multi_grid.num_grids();

  // grid_rank
  grid_rank_dev[slot] = multi_grid.grid_rank();

  // size
  size_dev[slot] = multi_grid.size();

  // thread_rank
  thd_rank_dev[slot] = multi_grid.thread_rank();

  // is_valid
  is_valid_dev[slot] = multi_grid.is_valid();

  // sync: each thread marks its own location with 1 ...
  sync_dev[slot] = 1;
  // ... then the whole grid meets at a grid-level barrier
  cg::this_grid().sync();

  // The grid leader folds all marks of this grid (gpu) into element 0
  if (leads_grid) {
    const uint threads_in_grid = gridDim.x * blockDim.x;
    for (uint other = 1; other < threads_in_grid; ++other) {
      sync_dev[0] += sync_dev[other];
    }
    sync_result[multi_grid.grid_rank() + 1] = sync_dev[0];
  }

  // Multi-grid level barrier
  multi_grid.sync();

  // The leader of grid (gpu) 0 performs the final reduction over all grids
  if (leads_grid && multi_grid.grid_rank() == 0) {
    sync_result[0] = 0;
    for (uint grid = 1; grid <= multi_grid.num_grids(); ++grid) {
      sync_result[0] += sync_result[grid];
    }
  }
}
|
||||
|
||||
// Kernel exercising a multi-grid group through its base type cg::thread_group:
// size(), thread_rank(), is_valid() (AMD platform only) and sync().
//
// Every thread writes what it observes at its grid-local index of the *_dev
// buffers. For the sync check each thread stores 1 in sync_dev; after a
// grid-level sync the first thread of the grid sums them into
// sync_result[grid_rank + 1]; after the group-wide sync the first thread of
// grid 0 sums the per-grid totals into sync_result[0].
static __global__ void kernel_cg_multi_grid_group_type_via_base_type(
    int* grid_rank_dev, int* size_dev, int* thd_rank_dev, int* is_valid_dev, int* sync_dev,
    int* sync_result) {
  // This can work if _CG_ABI_EXPERIMENTAL defined on Cuda
  cg::thread_group group = cg::this_multi_grid();

  const int slot = threadIdx.x + (blockDim.x * blockIdx.x);
  // True only for thread 0 of block 0 of the current grid
  const bool leads_grid = (threadIdx.x == 0) && (blockIdx.x == 0);

  // size
  size_dev[slot] = group.size();

  // grid rank and thread_rank
  grid_rank_dev[slot] = cg::this_multi_grid().grid_rank();
  thd_rank_dev[slot] = group.thread_rank();

  // is_valid
#ifdef __HIP_PLATFORM_AMD__
  is_valid_dev[slot] = group.is_valid();
#else
  // Cuda has no thread_group.is_valid(), so report the expected value directly
  is_valid_dev[slot] = true;
#endif

  // sync: each thread marks its own location with 1 ...
  sync_dev[slot] = 1;
  // ... then the whole grid meets at a grid-level barrier
  cg::this_grid().sync();

  // The grid leader folds all marks of this grid (gpu) into element 0
  if (leads_grid) {
    const uint threads_in_grid = gridDim.x * blockDim.x;
    for (uint other = 1; other < threads_in_grid; ++other) {
      sync_dev[0] += sync_dev[other];
    }
    sync_result[cg::this_multi_grid().grid_rank() + 1] = sync_dev[0];
  }

  // Multi-grid level barrier, issued through the base type
  group.sync();

  // The leader of grid (gpu) 0 performs the final reduction over all grids
  if (leads_grid && cg::this_multi_grid().grid_rank() == 0) {
    sync_result[0] = 0;
    for (uint grid = 1; grid <= cg::this_multi_grid().num_grids(); ++grid) {
      sync_result[0] += sync_result[grid];
    }
  }
}
|
||||
|
||||
// Kernel exercising the cooperative-groups free-function API (cg::group_size,
// cg::thread_rank, cg::sync) together with is_valid() on a multi_grid_group.
//
// Every thread writes what it observes at its grid-local index of the *_dev
// buffers. For the sync check each thread stores 1 in sync_dev; after a
// grid-level sync the first thread of the grid sums them into
// sync_result[grid_rank + 1]; after the multi-grid sync the first thread of
// grid 0 sums the per-grid totals into sync_result[0].
static __global__ void kernel_cg_multi_grid_group_type_via_public_api(
    int* grid_rank_dev, int* size_dev, int* thd_rank_dev, int* is_valid_dev, int* sync_dev,
    int* sync_result) {
  cg::multi_grid_group multi_grid = cg::this_multi_grid();
  const int slot = threadIdx.x + (blockDim.x * blockIdx.x);
  // True only for thread 0 of block 0 of the current grid
  const bool leads_grid = (threadIdx.x == 0) && (blockIdx.x == 0);

  // group_size api
  size_dev[slot] = cg::group_size(multi_grid);

  // grid rank and thread_rank api
  grid_rank_dev[slot] = cg::this_multi_grid().grid_rank();
  thd_rank_dev[slot] = cg::thread_rank(multi_grid);

  // is_valid api
  is_valid_dev[slot] = multi_grid.is_valid();

  // sync api: each thread marks its own location with 1 ...
  sync_dev[slot] = 1;
  // ... then the whole grid meets at a grid-level barrier
  cg::sync(cg::this_grid());

  // The grid leader folds all marks of this grid (gpu) into element 0
  if (leads_grid) {
    const uint threads_in_grid = gridDim.x * blockDim.x;
    for (uint other = 1; other < threads_in_grid; ++other) {
      sync_dev[0] += sync_dev[other];
    }
    sync_result[cg::this_multi_grid().grid_rank() + 1] = sync_dev[0];
  }

  // Multi-grid level barrier through the public api
  cg::sync(multi_grid);

  // The leader of grid (gpu) 0 performs the final reduction over all grids
  if (leads_grid && cg::this_multi_grid().grid_rank() == 0) {
    sync_result[0] = 0;
    for (uint grid = 1; grid <= cg::this_multi_grid().num_grids(); ++grid) {
      sync_result[0] += sync_result[grid];
    }
  }
}
|
||||
|
||||
// Barrier stress kernel timed with clock64().
//
// For each of `loops` iterations:
//   1. the last thread of the grid spins for a while, then thread 0 of every
//      block records the value returned by atomicInc(atomic_val) in
//      array[blockIdx.x + iteration * gridDim.x], followed by a grid barrier;
//   2. the last thread of the whole multi-grid spins for a while, then the
//      last thread of each grid updates global_array (+2 or *2, see below),
//      followed by a multi-grid barrier.
static __global__ void test_kernel(unsigned int* atomic_val, unsigned int* global_array,
                                   unsigned int* array, uint32_t loops) {
  cg::grid_group this_dev = cg::this_grid();
  cg::multi_grid_group all_devs = cg::this_multi_grid();
  unsigned local_rank = this_dev.thread_rank();
  unsigned overall_rank = all_devs.thread_rank();

  // Output slot of this block; advances by one grid's worth of blocks per iteration
  int slot = blockIdx.x;
  for (int iter = 0; iter < loops; ++iter, slot += gridDim.x) {
    // Hold back the last thread of the grid. Should the grid barrier below
    // not work, the other threads could reach atomicInc many times before
    // this thread gets there even once, and the last array entry would end up
    // with a very large value determined by how often the other wavefronts
    // ran through this loop. With a working barrier the entry is likely some
    // number close to the total number of blocks: this is the last wavefront
    // to reach atomicInc, but everybody has executed the atomic exactly once.
    if (local_rank == (this_dev.size() - 1)) {
      long long waited = 0;
      long long previous = clock64();
      for (;;) {
        const long long now = clock64();
        // After a counter roll-over the slipped cycles are unknown, so they
        // are simply not counted.
        if (now > previous) {
          waited += (now - previous);
        }
        previous = now;
        if (!(waited < 1000000)) {
          break;
        }
      }
    }
    if (threadIdx.x == 0) {
      array[slot] = atomicInc(atomic_val, UINT_MAX);
    }
    this_dev.sync();

    // Hold back the last thread of the entire multi-grid. Should the
    // multi-grid barrier below not work, the global_array entries drift out
    // of sync because adds and multiplies interleave differently between the
    // GPUs.
    if (overall_rank == (all_devs.size() - 1)) {
      long long waited = 0;
      long long previous = clock64();
      for (;;) {
        const long long now = clock64();
        // Roll-over handling as above
        if (now > previous) {
          waited += (now - previous);
        }
        previous = now;
        if (!(waited < 1000000)) {
          break;
        }
      }
    }

    // Iterations that are a multiple of the grid count add into the grid's
    // own entry; all other iterations double a partner grid's entry.
    if (local_rank == (this_dev.size() - 1)) {
      unsigned my_grid = all_devs.grid_rank();
      if (iter % all_devs.num_grids() != 0) {
        unsigned partner = (my_grid + iter) % all_devs.num_grids();
        global_array[partner] *= 2;
      } else {
        global_array[my_grid] += 2;
      }
    }
    all_devs.sync();
  }
}
|
||||
|
||||
// Barrier stress kernel timed with wall_clock64(); the body is compiled only
// when HT_AMD is set, otherwise the kernel is empty.
//
// For each of `loops` iterations:
//   1. the last thread of the grid spins for a while, then thread 0 of every
//      block records the value returned by atomicInc(atomic_val) in
//      array[blockIdx.x + iteration * gridDim.x], followed by a grid barrier;
//   2. the last thread of the whole multi-grid spins for a while, then the
//      last thread of each grid updates global_array (+2 or *2, see below),
//      followed by a multi-grid barrier.
__global__ void test_kernel_gfx11(unsigned int* atomic_val, unsigned int* global_array,
                                  unsigned int* array, uint32_t loops) {
#if HT_AMD
  cg::grid_group this_dev = cg::this_grid();
  cg::multi_grid_group all_devs = cg::this_multi_grid();
  unsigned local_rank = this_dev.thread_rank();
  unsigned overall_rank = all_devs.thread_rank();

  // Output slot of this block; advances by one grid's worth of blocks per iteration
  int slot = blockIdx.x;
  for (int iter = 0; iter < loops; ++iter, slot += gridDim.x) {
    // Hold back the last thread of the grid. Should the grid barrier below
    // not work, the other threads could reach atomicInc many times before
    // this thread gets there even once, and the last array entry would end up
    // with a very large value determined by how often the other wavefronts
    // ran through this loop. With a working barrier the entry is likely some
    // number close to the total number of blocks: this is the last wavefront
    // to reach atomicInc, but everybody has executed the atomic exactly once.
    if (local_rank == (this_dev.size() - 1)) {
      long long waited = 0;
      long long previous = wall_clock64();
      for (;;) {
        const long long now = wall_clock64();
        // After a counter roll-over the slipped cycles are unknown, so they
        // are simply not counted.
        if (now > previous) {
          waited += (now - previous);
        }
        previous = now;
        if (!(waited < 1000000)) {
          break;
        }
      }
    }
    if (threadIdx.x == 0) {
      array[slot] = atomicInc(atomic_val, UINT_MAX);
    }
    this_dev.sync();

    // Hold back the last thread of the entire multi-grid. Should the
    // multi-grid barrier below not work, the global_array entries drift out
    // of sync because adds and multiplies interleave differently between the
    // GPUs.
    if (overall_rank == (all_devs.size() - 1)) {
      long long waited = 0;
      long long previous = wall_clock64();
      for (;;) {
        const long long now = wall_clock64();
        // Roll-over handling as above
        if (now > previous) {
          waited += (now - previous);
        }
        previous = now;
        if (!(waited < 1000000)) {
          break;
        }
      }
    }

    // Iterations that are a multiple of the grid count add into the grid's
    // own entry; all other iterations double a partner grid's entry.
    if (local_rank == (this_dev.size() - 1)) {
      unsigned my_grid = all_devs.grid_rank();
      if (iter % all_devs.num_grids() != 0) {
        unsigned partner = (my_grid + iter) % all_devs.num_grids();
        global_array[partner] *= 2;
      } else {
        global_array[my_grid] += 2;
      }
    }
    all_devs.sync();
  }
#endif
}
|
||||
|
||||
static void verify_barrier_buffer(unsigned int loops, unsigned int warps, unsigned int* host_buffer,
|
||||
unsigned int num_devs) {
|
||||
unsigned int max_in_this_loop = 0;
|
||||
for (unsigned int i = 0; i < loops; i++) {
|
||||
max_in_this_loop += (warps * num_devs);
|
||||
for (unsigned int j = 0; j < warps; j++) {
|
||||
REQUIRE(host_buffer[i * warps + j] <= max_in_this_loop);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Checks one entry of the array shared between the devices.
//
// The expected value is built by replaying the kernel's update rule on a
// counter starting at 0: iterations whose index is a multiple of `num_devs`
// add 2, every other iteration doubles the value.
//
// loops:     number of kernel loop iterations that were executed.
// array_val: value read back from the shared array.
// num_devs:  number of participating grids; must be non-zero. The kernels
//            select the operation with `i % num_grids()`, so callers running
//            on more than two devices should pass the device count. The
//            default of 2 keeps the behaviour of the two-argument form.
//
// A mismatch is reported through REQUIRE.
static void verify_multi_gpu_buffer(unsigned int loops, unsigned int array_val,
                                    unsigned int num_devs = 2) {
  // Guard the modulo below against a zero divisor
  REQUIRE(num_devs != 0u);

  unsigned int desired_val = 0;
  // Unsigned index: `loops` is unsigned, avoids a signed/unsigned comparison
  for (unsigned int i = 0; i < loops; i++) {
    if (i % num_devs == 0) {
      desired_val += 2;
    } else {
      desired_val *= 2;
    }
  }

  REQUIRE(array_val == desired_val);
}
|
||||
|
||||
template <typename F>
|
||||
static void test_cg_multi_grid_group_type(F kernel_func, int num_devices, int block_size,
|
||||
bool specific_api_test) {
|
||||
// Create a stream each device
|
||||
hipStream_t stream[MaxGPUs];
|
||||
for (int i = 0; i < num_devices; i++) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
HIP_CHECK(hipDeviceSynchronize()); // Make sure work is done on this device
|
||||
HIP_CHECK(hipStreamCreate(&stream[i]));
|
||||
}
|
||||
|
||||
// Allocate host and device memory
|
||||
int num_bytes = sizeof(int) * 2 * block_size;
|
||||
int *num_grids_dev[MaxGPUs], *num_grids_host[MaxGPUs];
|
||||
int *grid_rank_dev[MaxGPUs], *grid_rank_host[MaxGPUs];
|
||||
int *size_dev[MaxGPUs], *size_host[MaxGPUs];
|
||||
int *thd_rank_dev[MaxGPUs], *thd_rank_host[MaxGPUs];
|
||||
int *is_valid_dev[MaxGPUs], *is_valid_host[MaxGPUs];
|
||||
int *sync_dev[MaxGPUs], *sync_result;
|
||||
for (int i = 0; i < num_devices; i++) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
|
||||
if (specific_api_test) {
|
||||
HIP_CHECK(hipMalloc(&num_grids_dev[i], num_bytes));
|
||||
HIP_CHECK(hipHostMalloc(&num_grids_host[i], num_bytes));
|
||||
}
|
||||
|
||||
HIP_CHECK(hipMalloc(&grid_rank_dev[i], num_bytes));
|
||||
HIP_CHECK(hipMalloc(&size_dev[i], num_bytes));
|
||||
HIP_CHECK(hipMalloc(&thd_rank_dev[i], num_bytes));
|
||||
HIP_CHECK(hipMalloc(&is_valid_dev[i], num_bytes));
|
||||
HIP_CHECK(hipMalloc(&sync_dev[i], num_bytes));
|
||||
|
||||
HIP_CHECK(hipHostMalloc(&grid_rank_host[i], num_bytes));
|
||||
HIP_CHECK(hipHostMalloc(&size_host[i], num_bytes));
|
||||
HIP_CHECK(hipHostMalloc(&thd_rank_host[i], num_bytes));
|
||||
HIP_CHECK(hipHostMalloc(&is_valid_host[i], num_bytes));
|
||||
|
||||
if (i == 0) {
|
||||
HIP_CHECK(
|
||||
hipHostMalloc(&sync_result, sizeof(int) * (num_devices + 1), hipHostMallocCoherent));
|
||||
}
|
||||
}
|
||||
|
||||
// Launch Kernel
|
||||
int NumKernelArgs = 6;
|
||||
if (specific_api_test) {
|
||||
NumKernelArgs = 7;
|
||||
}
|
||||
hipLaunchParams* launchParamsList = new hipLaunchParams[num_devices];
|
||||
std::vector<void*> args(MaxGPUs * NumKernelArgs);
|
||||
for (int i = 0; i < num_devices; i++) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
|
||||
args[i * NumKernelArgs] = &grid_rank_dev[i];
|
||||
args[i * NumKernelArgs + 1] = &size_dev[i];
|
||||
args[i * NumKernelArgs + 2] = &thd_rank_dev[i];
|
||||
args[i * NumKernelArgs + 3] = &is_valid_dev[i];
|
||||
args[i * NumKernelArgs + 4] = &sync_dev[i];
|
||||
args[i * NumKernelArgs + 5] = &sync_result;
|
||||
if (specific_api_test) {
|
||||
args[i * NumKernelArgs + 6] = &num_grids_dev[i];
|
||||
}
|
||||
|
||||
launchParamsList[i].func = reinterpret_cast<void*>(kernel_func);
|
||||
launchParamsList[i].gridDim = 2;
|
||||
launchParamsList[i].blockDim = block_size;
|
||||
launchParamsList[i].sharedMem = 0;
|
||||
launchParamsList[i].stream = stream[i];
|
||||
launchParamsList[i].args = &args[i * NumKernelArgs];
|
||||
}
|
||||
HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(launchParamsList, num_devices, 0));
|
||||
|
||||
// Copy result from device to host
|
||||
for (int i = 0; i < num_devices; i++) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
if (specific_api_test) {
|
||||
HIP_CHECK(hipMemcpy(num_grids_host[i], num_grids_dev[i], num_bytes, hipMemcpyDeviceToHost));
|
||||
}
|
||||
HIP_CHECK(hipMemcpy(grid_rank_host[i], grid_rank_dev[i], num_bytes, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipMemcpy(size_host[i], size_dev[i], num_bytes, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipMemcpy(thd_rank_host[i], thd_rank_dev[i], num_bytes, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipMemcpy(is_valid_host[i], is_valid_dev[i], num_bytes, hipMemcpyDeviceToHost));
|
||||
}
|
||||
|
||||
// Validate results
|
||||
int grids_seen[MaxGPUs];
|
||||
for (int i = 0; i < num_devices; ++i) {
|
||||
for (int j = 0; j < 2 * block_size; ++j) {
|
||||
if (specific_api_test) {
|
||||
ASSERT_EQUAL(num_grids_host[i][j], num_devices);
|
||||
}
|
||||
ASSERT_GE(grid_rank_host[i][j], 0);
|
||||
ASSERT_LE(grid_rank_host[i][j], num_devices - 1);
|
||||
ASSERT_EQUAL(grid_rank_host[i][j], grid_rank_host[i][0]);
|
||||
ASSERT_EQUAL(size_host[i][j], num_devices * 2 * block_size);
|
||||
int gridRank = grid_rank_host[i][j];
|
||||
ASSERT_EQUAL(thd_rank_host[i][j], (gridRank * 2 * block_size) + j);
|
||||
ASSERT_EQUAL(is_valid_host[i][j], 1);
|
||||
}
|
||||
ASSERT_EQUAL(sync_result[i + 1], 2 * block_size);
|
||||
|
||||
// Validate uniqueness property of grid rank
|
||||
grids_seen[i] = grid_rank_host[i][0];
|
||||
for (int k = 0; k < i; ++k) {
|
||||
INFO("Grid rank in multi-gpu setup should be unique");
|
||||
REQUIRE(grids_seen[k] != grids_seen[i]);
|
||||
}
|
||||
}
|
||||
ASSERT_EQUAL(sync_result[0], num_devices * 2 * block_size);
|
||||
|
||||
// Free host and device memory
|
||||
delete[] launchParamsList;
|
||||
for (int i = 0; i < num_devices; i++) {
|
||||
HIP_CHECK(hipSetDevice(i));
|
||||
|
||||
if (specific_api_test) {
|
||||
HIP_CHECK(hipFree(num_grids_dev[i]));
|
||||
HIP_CHECK(hipHostFree(num_grids_host[i]));
|
||||
}
|
||||
|
||||
HIP_CHECK(hipFree(grid_rank_dev[i]));
|
||||
HIP_CHECK(hipFree(size_dev[i]));
|
||||
HIP_CHECK(hipFree(thd_rank_dev[i]));
|
||||
HIP_CHECK(hipFree(is_valid_dev[i]));
|
||||
HIP_CHECK(hipFree(sync_dev[i]));
|
||||
|
||||
if (i == 0) {
|
||||
HIP_CHECK(hipHostFree(sync_result));
|
||||
}
|
||||
HIP_CHECK(hipHostFree(grid_rank_host[i]));
|
||||
HIP_CHECK(hipHostFree(size_host[i]));
|
||||
HIP_CHECK(hipHostFree(thd_rank_host[i]));
|
||||
HIP_CHECK(hipHostFree(is_valid_host[i]));
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Validates the multi-grid group API on up to MaxGPUs devices for a range of block sizes.
 * Each section selects one kernel flavour (member API, base-type API, free-function API).
 * The test is skipped as soon as one device reports no support for cooperative multi-device
 * launches.
 */
TEST_CASE("Unit_hipCGMultiGridGroupType_Basic") {
  int num_devices = 0;
  HIP_CHECK(hipGetDeviceCount(&num_devices));
  num_devices = min(num_devices, MaxGPUs);

  // Set `max_threads_per_blk` by taking minimum among all available devices
  int max_threads_per_blk = INT_MAX;
  hipDeviceProp_t device_properties;
  for (int i = 0; i < num_devices; i++) {
    HIP_CHECK(hipGetDeviceProperties(&device_properties, i));
    if (!device_properties.cooperativeMultiDeviceLaunch) {
      HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
      return;
    }
    max_threads_per_blk = min(max_threads_per_blk, device_properties.maxThreadsPerBlock);
  }

  // Initialized so that a run which enters none of the sections below is caught by the
  // check that follows instead of launching through an indeterminate pointer.
  void* (*kernel_func)(void) = nullptr;
  bool specific_api_test = false;

  SECTION("Default multi grid group API test") {
    kernel_func = reinterpret_cast<void* (*)()>(kernel_cg_multi_grid_group_type);
    // This kernel takes the additional num_grids buffer
    specific_api_test = true;
  }

  SECTION("Base type multi grid group API test") {
    kernel_func = reinterpret_cast<void* (*)()>(kernel_cg_multi_grid_group_type_via_base_type);
  }

  SECTION("Public API multi grid group test") {
    kernel_func = reinterpret_cast<void* (*)()>(kernel_cg_multi_grid_group_type_via_public_api);
  }

  const bool kernel_selected = (kernel_func != nullptr);
  REQUIRE(kernel_selected);

  // Test for blockSizes in powers of 2
  for (int block_size = 2; block_size <= max_threads_per_blk; block_size = block_size * 2) {
    test_cg_multi_grid_group_type(kernel_func, num_devices, block_size, specific_api_test);
  }

  // Test for random blockSizes, but the sequence is the same every execution
  srand(0);
  for (int i = 0; i < 10; i++) {
    // Clamp to at least 2 because the test fails for 0 threads per block
    test_cg_multi_grid_group_type(kernel_func, num_devices, max(2, rand() % max_threads_per_blk),
                                  specific_api_test);
  }
}
|
||||
|
||||
/**
 * Stress test for the grid and multi-grid barriers.
 *
 * For every combination of `loops` and `warps` the barrier kernel (test_kernel, or
 * test_kernel_gfx11 when IsGfx11() reports true) is launched cooperatively on all devices with
 * `warps` blocks of one warp each. Afterwards the per-block values and the array shared
 * between the devices are verified.
 *
 * The test is skipped when fewer than two devices are present, when a device does not support
 * cooperative multi-device launches, or when the requested number of blocks exceeds what the
 * occupancy query reports as co-resident on the smallest device.
 */
TEST_CASE("Unit_hipCGMultiGridGroupType_Barrier") {
  int num_devices = 0;
  uint32_t loops = GENERATE(1, 2, 3, 4);
  uint32_t warps = GENERATE(4, 8, 16, 32);
  uint32_t block_size = 1;

  HIP_CHECK(hipGetDeviceCount(&num_devices));
  if (num_devices < 2) {
    HipTest::HIP_SKIP_TEST("Device number is < 2");
    return;
  }

  std::vector<hipDeviceProp_t> device_properties(num_devices);
  for (int i = 0; i < num_devices; i++) {
    HIP_CHECK(hipGetDeviceProperties(&device_properties[i], i));
    if (!device_properties[i].cooperativeMultiDeviceLaunch) {
      HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
      return;
    }
  }

  // Smallest warp size and multiprocessor count among all devices
  int warp_size = INT_MAX;
  int num_sm = INT_MAX;
  for (int i = 0; i < num_devices; i++) {
    if (device_properties[i].warpSize < warp_size) {
      warp_size = device_properties[i].warpSize;
    }
    if (device_properties[i].multiProcessorCount < num_sm) {
      num_sm = device_properties[i].multiProcessorCount;
    }
  }

  int num_threads_in_block = block_size * warp_size;

  // Calculate the device occupancy to know how many blocks can be run.
  int max_blocks_per_sm = INT_MAX;
  for (int i = 0; i < num_devices; i++) {
    HIP_CHECK(hipSetDevice(i));
    auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
    int blocks_per_sm = 0;
    HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_sm, test_kernel_used,
                                                           num_threads_in_block, 0));
    if (blocks_per_sm < max_blocks_per_sm) {
      max_blocks_per_sm = blocks_per_sm;
    }
  }

  int requested_blocks = warps / block_size;

  // Test whether the requested size will fit on the GPU. A cooperative launch needs all
  // blocks to be resident at once, so skip rather than fail when they cannot be.
  if (static_cast<long long>(requested_blocks) >
      static_cast<long long>(max_blocks_per_sm) * num_sm) {
    HipTest::HIP_SKIP_TEST("Requested number of blocks does not fit on the device");
    return;
  }

  // Each block will output a single value per loop.
  uint32_t total_buffer_len = requested_blocks * loops;

  // Allocate the buffer that will hold the kernel's output, and which will
  // also be used to globally synchronize during GWS initialization.
  // The host side is a zero-initialized vector per device, so it is released automatically
  // even when an assertion below fails.
  std::vector<std::vector<unsigned int>> host_buffer(
      num_devices, std::vector<unsigned int>(total_buffer_len, 0u));
  std::vector<unsigned int*> kernel_buffer(num_devices);
  std::vector<unsigned int*> kernel_atomic(num_devices);
  std::vector<hipStream_t> streams(num_devices);
  for (int i = 0; i < num_devices; i++) {
    HIP_CHECK(hipSetDevice(i));
    HIP_CHECK(hipMalloc(&kernel_buffer[i], sizeof(unsigned int) * total_buffer_len));
    HIP_CHECK(hipMemcpy(kernel_buffer[i], host_buffer[i].data(),
                        sizeof(unsigned int) * total_buffer_len, hipMemcpyHostToDevice));
    HIP_CHECK(hipMalloc(&kernel_atomic[i], sizeof(unsigned int)));
    HIP_CHECK(hipMemset(kernel_atomic[i], 0, sizeof(unsigned int)));
    HIP_CHECK(hipStreamCreate(&streams[i]));
  }

  // Array with one entry per device that all devices update; put it on the host
  unsigned int* global_array;
  HIP_CHECK(hipHostMalloc(&global_array, sizeof(unsigned int) * num_devices));
  HIP_CHECK(hipMemset(global_array, 0, num_devices * sizeof(unsigned int)));

  // Launch the kernels
  INFO("Launching a cooperative kernel with " << warps << " warps in " << requested_blocks
                                              << " thread blocks");

  std::vector<std::vector<void*>> dev_params(num_devices, std::vector<void*>(4, nullptr));
  std::vector<hipLaunchParams> md_params(num_devices);
  for (int i = 0; i < num_devices; i++) {
    HIP_CHECK(hipSetDevice(i));
    auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
    dev_params[i][0] = reinterpret_cast<void*>(&kernel_atomic[i]);
    dev_params[i][1] = reinterpret_cast<void*>(&global_array);
    dev_params[i][2] = reinterpret_cast<void*>(&kernel_buffer[i]);
    dev_params[i][3] = reinterpret_cast<void*>(&loops);
    md_params[i].func = reinterpret_cast<void*>(test_kernel_used);
    md_params[i].gridDim = requested_blocks;
    md_params[i].blockDim = num_threads_in_block;
    md_params[i].sharedMem = 0;
    md_params[i].stream = streams[i];
    md_params[i].args = dev_params[i].data();
  }

  HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params.data(), num_devices, 0));

  // hipDeviceSynchronize() only covers the currently selected device, so wait for every
  // device in turn before the host reads any result.
  for (int dev = 0; dev < num_devices; dev++) {
    HIP_CHECK(hipSetDevice(dev));
    HIP_CHECK(hipDeviceSynchronize());
  }

  // Read back the buffer to host
  for (int dev = 0; dev < num_devices; dev++) {
    HIP_CHECK(hipSetDevice(dev));
    HIP_CHECK(hipMemcpy(host_buffer[dev].data(), kernel_buffer[dev],
                        sizeof(unsigned int) * total_buffer_len, hipMemcpyDeviceToHost));
  }

  for (int dev = 0; dev < num_devices; dev++) {
    verify_barrier_buffer(loops, requested_blocks, host_buffer[dev].data(), num_devices);
  }

  // NOTE(review): verify_multi_gpu_buffer is called without a device count and models an
  // add/multiply period of 2, while the kernels alternate with a period of num_grids();
  // with more than two devices the two may disagree - confirm.
  for (int dev = 0; dev < num_devices; dev++) {
    verify_multi_gpu_buffer(loops, global_array[dev]);
  }

  HIP_CHECK(hipHostFree(global_array));
  for (int k = 0; k < num_devices; ++k) {
    HIP_CHECK(hipSetDevice(k));
    HIP_CHECK(hipFree(kernel_buffer[k]));
    HIP_CHECK(hipFree(kernel_atomic[k]));
    HIP_CHECK(hipStreamDestroy(streams[k]));
  }
}
|
||||
@@ -0,0 +1,198 @@
|
||||
/*
|
||||
Copyright (c) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
|
||||
#include "hip_cg_common.hh"
|
||||
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
// Selects which tile-level shuffle routine kernel_cg_group_partition_static runs
// (shfl_down reduction, shfl_xor reduction, or shfl_up prefix sum).
enum class TiledGroupShflTests { shflDown, shflXor, shflUp };
|
||||
|
||||
template <unsigned int tileSz>
|
||||
__device__ int reduction_kernel_shfl_down(cg::thread_block_tile<tileSz> const& g,
|
||||
volatile int val) {
|
||||
int sz = g.size();
|
||||
|
||||
for (int i = sz / 2; i > 0; i >>= 1) {
|
||||
val += g.shfl_down(val, i);
|
||||
}
|
||||
|
||||
// Choose the 0'th indexed thread that holds the reduction value to return
|
||||
if (g.thread_rank() == 0) {
|
||||
return val;
|
||||
}
|
||||
// Rest of the threads return no useful values
|
||||
else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
template <unsigned int tileSz>
|
||||
__device__ int reduction_kernel_shfl_xor(cg::thread_block_tile<tileSz> const& g, int val) {
|
||||
int sz = g.size();
|
||||
|
||||
for (int i = sz / 2; i > 0; i >>= 1) {
|
||||
val += g.shfl_xor(val, i);
|
||||
}
|
||||
|
||||
// Choose the 0'th indexed thread that holds the reduction value to return
|
||||
if (g.thread_rank() == 0) {
|
||||
return val;
|
||||
}
|
||||
// Rest of the threads return no useful values
|
||||
else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
template <unsigned int tileSz>
|
||||
__device__ int prefix_sum_kernel(cg::thread_block_tile<tileSz> const& g, volatile int val) {
|
||||
int sz = g.size();
|
||||
#pragma unroll
|
||||
for (int i = 1; i < sz; i <<= 1) {
|
||||
int temp = g.shfl_up(val, i);
|
||||
|
||||
if (g.thread_rank() >= i) {
|
||||
val += temp;
|
||||
}
|
||||
}
|
||||
return val;
|
||||
}
|
||||
|
||||
template <unsigned int tile_size>
|
||||
static __global__ void kernel_cg_group_partition_static(int* result,
|
||||
TiledGroupShflTests shfl_test) {
|
||||
cg::thread_block thread_block_CG_ty = cg::this_thread_block();
|
||||
int input, output_sum;
|
||||
|
||||
// Choose a leader thread to print the results
|
||||
if (thread_block_CG_ty.thread_rank() == 0) {
|
||||
printf(" Creating %d groups, of tile size %d threads:\n\n",
|
||||
(int)thread_block_CG_ty.size() / tile_size, tile_size);
|
||||
}
|
||||
|
||||
thread_block_CG_ty.sync();
|
||||
|
||||
cg::thread_block_tile<tile_size> tiled_part = cg::tiled_partition<tile_size>(thread_block_CG_ty);
|
||||
|
||||
input = tiled_part.thread_rank();
|
||||
|
||||
switch (shfl_test) {
|
||||
case (TiledGroupShflTests::shflDown):
|
||||
output_sum = reduction_kernel_shfl_down(tiled_part, input);
|
||||
break;
|
||||
case (TiledGroupShflTests::shflXor):
|
||||
output_sum = reduction_kernel_shfl_xor(tiled_part, input);
|
||||
break;
|
||||
case (TiledGroupShflTests::shflUp):
|
||||
output_sum = prefix_sum_kernel(tiled_part, input);
|
||||
result[thread_block_CG_ty.thread_rank()] = output_sum;
|
||||
}
|
||||
|
||||
if (tiled_part.thread_rank() == 0 && shfl_test != TiledGroupShflTests::shflUp) {
|
||||
printf(" Sum of all ranks 0..%d in this tiled_part group is %d\n", tiled_part.size() - 1,
|
||||
output_sum);
|
||||
result[thread_block_CG_ty.thread_rank() / (tile_size)] = output_sum;
|
||||
}
|
||||
}
|
||||
|
||||
static void expected_result_calc(int* expected_result, int tile_size, int size,
|
||||
TiledGroupShflTests shfl_test) {
|
||||
switch (shfl_test) {
|
||||
case (TiledGroupShflTests::shflDown):
|
||||
case (TiledGroupShflTests::shflXor): {
|
||||
int expected_sum = ((tile_size - 1) * tile_size / 2);
|
||||
for (int i = 0; i < size; i++) {
|
||||
expected_result[i] = expected_sum;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case (TiledGroupShflTests::shflUp): {
|
||||
for (int i = 0; i < size / tile_size; i++) {
|
||||
int acc = 0;
|
||||
for (int j = 0; j < tile_size; j++) {
|
||||
acc += j;
|
||||
expected_result[i * tile_size + j] = acc;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Launches kernel_cg_group_partition_static<tile_size> on one block of 64 threads and
 * compares the device output with the host reference from expected_result_calc.
 *
 * @param shfl_test  Shuffle routine to exercise. shflUp yields one result per thread, the
 *                   reductions yield one result per tile.
 */
template <unsigned int tile_size> static void test_group_partition(TiledGroupShflTests shfl_test) {
  int block_size = 1;
  int threads_per_blk = 64;

  int num_elem = (block_size * threads_per_blk) / tile_size;
  if (shfl_test == TiledGroupShflTests::shflUp) {
    num_elem = block_size * threads_per_blk;
  }

  int* expected_result = new int[num_elem];

  int* result_dev = nullptr;
  int* result_host = nullptr;

  HIP_CHECK(hipHostMalloc(&result_host, num_elem * sizeof(int), hipHostMallocDefault));
  memset(result_host, 0, num_elem * sizeof(int));

  HIP_CHECK(hipMalloc(&result_dev, num_elem * sizeof(int)));

  // Launch Kernel
  hipLaunchKernelGGL(kernel_cg_group_partition_static<tile_size>, block_size, threads_per_blk,
                     threads_per_blk * sizeof(int), 0, result_dev, shfl_test);
  // Surface launch failures here instead of as a confusing result mismatch later.
  HIP_CHECK(hipGetLastError());
  HIP_CHECK(hipDeviceSynchronize());

  HIP_CHECK(hipMemcpy(result_host, result_dev, sizeof(int) * num_elem, hipMemcpyDeviceToHost));

  expected_result_calc(expected_result, tile_size, num_elem, shfl_test);
  compareResults(expected_result, result_host, num_elem * sizeof(int));

  // Free all allocated memory on host and device
  HIP_CHECK(hipFree(result_dev));
  HIP_CHECK(hipHostFree(result_host));
  delete[] expected_result;
}
|
||||
|
||||
// Runs each shuffle test (down, xor, up) against tiles of 2, 4, 8, 16 and 32 threads on the
// current device; skipped when the device reports no cooperative launch support.
TEST_CASE("Unit_hipCGThreadBlockTileType_Shfl") {
  int current_device;
  HIP_CHECK(hipGetDevice(&current_device));

  hipDeviceProp_t props;
  HIP_CHECK(hipGetDeviceProperties(&props, current_device));

  if (!props.cooperativeLaunch) {
    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
    return;
  }

  const TiledGroupShflTests shfl_test = GENERATE(
      TiledGroupShflTests::shflDown, TiledGroupShflTests::shflXor, TiledGroupShflTests::shflUp);

  test_group_partition<2>(shfl_test);
  test_group_partition<4>(shfl_test);
  test_group_partition<8>(shfl_test);
  test_group_partition<16>(shfl_test);
  test_group_partition<32>(shfl_test);
}
|
||||
@@ -1,177 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
#include <cstdlib>
|
||||
|
||||
#define ASSERT_EQUAL(lhs, rhs) HIPASSERT(lhs == rhs)
|
||||
|
||||
using namespace cooperative_groups;
|
||||
|
||||
// Records, for every launched thread, what the thread_block member API reports: size,
// thread_rank, the product of two shared values after sync(), group_index, thread_index and
// group_dim. Each output array is indexed by the thread's global 1-D index.
static __global__
void kernel_cg_thread_block_type(int *sizeTestD,
                                 int *thdRankTestD,
                                 int *syncTestD,
                                 dim3 *groupIndexTestD,
                                 dim3 *thdIndexTestD,
                                 dim3 *groupDimTestD)
{
  thread_block block = this_thread_block();
  const int slot = (blockIdx.x * blockDim.x) + threadIdx.x;

  // size and thread_rank
  sizeTestD[slot] = block.size();
  thdRankTestD[slot] = block.thread_rank();

  // sync: threads 0 and 1 each publish one value; after the barrier every thread must see
  // both of them.
  __shared__ int sm[2];
  const unsigned int lane = threadIdx.x;
  if (lane < 2) {
    sm[lane] = (lane == 0) ? 10 : 20;
  }
  block.sync();
  syncTestD[slot] = sm[1] * sm[0];

  // group_index, thread_index and group_dim (number of threads in a block)
  groupIndexTestD[slot] = block.group_index();
  thdIndexTestD[slot] = block.thread_index();
  groupDimTestD[slot] = block.group_dim();
}
|
||||
|
||||
static void test_cg_thread_block_type(int blockSize)
|
||||
{
|
||||
int nBytes = sizeof(int) * 2 * blockSize;
|
||||
int nDim3Bytes = sizeof(dim3) * 2 * blockSize;
|
||||
int *sizeTestD, *sizeTestH;
|
||||
int *thdRankTestD, *thdRankTestH;
|
||||
int *syncTestD, *syncTestH;
|
||||
dim3 *groupIndexTestD, *groupIndexTestH;
|
||||
dim3 *thdIndexTestD, *thdIndexTestH, *groupDimTestD, *groupDimTestH;
|
||||
|
||||
// Allocate device memory
|
||||
HIPCHECK(hipMalloc(&sizeTestD, nBytes));
|
||||
HIPCHECK(hipMalloc(&thdRankTestD, nBytes));
|
||||
HIPCHECK(hipMalloc(&syncTestD, nBytes));
|
||||
HIPCHECK(hipMalloc(&groupIndexTestD, nDim3Bytes));
|
||||
HIPCHECK(hipMalloc(&thdIndexTestD, nDim3Bytes));
|
||||
HIPCHECK(hipMalloc(&groupDimTestD, nDim3Bytes));
|
||||
|
||||
// Allocate host memory
|
||||
HIPCHECK(hipHostMalloc(&sizeTestH, nBytes));
|
||||
HIPCHECK(hipHostMalloc(&thdRankTestH, nBytes));
|
||||
HIPCHECK(hipHostMalloc(&syncTestH, nBytes));
|
||||
HIPCHECK(hipHostMalloc(&groupIndexTestH, nDim3Bytes));
|
||||
HIPCHECK(hipHostMalloc(&thdIndexTestH, nDim3Bytes));
|
||||
HIPCHECK(hipHostMalloc(&groupDimTestH, nDim3Bytes));
|
||||
|
||||
// Launch Kernel
|
||||
hipLaunchKernelGGL(kernel_cg_thread_block_type,
|
||||
2,
|
||||
blockSize,
|
||||
0,
|
||||
0,
|
||||
sizeTestD,
|
||||
thdRankTestD,
|
||||
syncTestD,
|
||||
groupIndexTestD,
|
||||
thdIndexTestD,
|
||||
groupDimTestD);
|
||||
|
||||
// Copy result from device to host
|
||||
HIPCHECK(hipMemcpy(sizeTestH, sizeTestD, nBytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(thdRankTestH, thdRankTestD, nBytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(syncTestH, syncTestD, nBytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(groupIndexTestH, groupIndexTestD, nDim3Bytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(thdIndexTestH, thdIndexTestD, nDim3Bytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(groupDimTestH, groupDimTestD, nDim3Bytes, hipMemcpyDeviceToHost));
|
||||
|
||||
// Validate results for both blocks together
|
||||
for (int i = 0; i < 2 * blockSize; ++i) {
|
||||
ASSERT_EQUAL(sizeTestH[i], blockSize);
|
||||
ASSERT_EQUAL(thdRankTestH[i], i % blockSize);
|
||||
ASSERT_EQUAL(syncTestH[i], 200);
|
||||
ASSERT_EQUAL(groupIndexTestH[i].x, (uint) i / blockSize);
|
||||
ASSERT_EQUAL(groupIndexTestH[i].y, 0);
|
||||
ASSERT_EQUAL(groupIndexTestH[i].z, 0);
|
||||
ASSERT_EQUAL(thdIndexTestH[i].x, (uint) i % blockSize);
|
||||
ASSERT_EQUAL(thdIndexTestH[i].y, 0);
|
||||
ASSERT_EQUAL(thdIndexTestH[i].z, 0);
|
||||
ASSERT_EQUAL(groupDimTestH[i].x, blockSize);
|
||||
ASSERT_EQUAL(groupDimTestH[i].y, 1);
|
||||
ASSERT_EQUAL(groupDimTestH[i].z, 1);
|
||||
}
|
||||
|
||||
// Free device memory
|
||||
HIPCHECK(hipFree(sizeTestD));
|
||||
HIPCHECK(hipFree(thdRankTestD));
|
||||
HIPCHECK(hipFree(syncTestD));
|
||||
HIPCHECK(hipFree(groupIndexTestD));
|
||||
HIPCHECK(hipFree(thdIndexTestD));
|
||||
HIPCHECK(hipFree(groupDimTestD));
|
||||
|
||||
//Free host memory
|
||||
HIPCHECK(hipHostFree(sizeTestH));
|
||||
HIPCHECK(hipHostFree(thdRankTestH));
|
||||
HIPCHECK(hipHostFree(syncTestH));
|
||||
HIPCHECK(hipHostFree(groupIndexTestH));
|
||||
HIPCHECK(hipHostFree(thdIndexTestH));
|
||||
HIPCHECK(hipHostFree(groupDimTestH));
|
||||
}
|
||||
|
||||
// Exercises the thread_block member API on the current device for power-of-two block sizes
// and for ten pseudo-random block sizes; skipped without cooperative launch support.
TEST_CASE("Unit_hipCGThreadBlockType") {
  int deviceId;
  HIPCHECK(hipGetDevice(&deviceId));

  hipDeviceProp_t deviceProperties;
  HIPCHECK(hipGetDeviceProperties(&deviceProperties, deviceId));

  if (!deviceProperties.cooperativeLaunch) {
    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
    return;
  }

  // Power-of-two block sizes up to the device limit
  const int blockSizeLimit = deviceProperties.maxThreadsPerBlock;
  int blockSize = 2;
  while (blockSize <= blockSizeLimit) {
    test_cg_thread_block_type(blockSize);
    blockSize *= 2;
  }

  // Fixed seed: the "random" block sizes are the same on every execution
  srand(0);
  for (int run = 0; run < 10; ++run) {
    // Clamp to at least 2 threads: the kernel's sync check reads a shared value that only
    // thread 1 writes.
    const int candidate = rand() % blockSizeLimit;
    test_cg_thread_block_type(max(2, candidate));
  }
}
|
||||
@@ -1,136 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include "hip/hip_cooperative_groups.h"
|
||||
#include <cstdlib>
|
||||
|
||||
// Use the test framework's HIPASSERT rather than assert(): assert() compiles to nothing when
// NDEBUG is defined, which would silently turn every check in this file into a no-op.
#define ASSERT_EQUAL(lhs, rhs) HIPASSERT((lhs) == (rhs))
|
||||
|
||||
using namespace cooperative_groups;
|
||||
|
||||
// Records, for every launched thread, what a thread_block reports when used through the
// thread_group base type: size, thread_rank, and the product of two shared values after
// sync(). Each output array is indexed by the thread's global 1-D index.
static __global__
void kernel_cg_thread_block_type_via_base_type(int *sizeTestD,
                                               int *thdRankTestD,
                                               int *syncTestD)
{
  thread_group group = this_thread_block();
  const int slot = (blockIdx.x * blockDim.x) + threadIdx.x;

  // size and thread_rank
  sizeTestD[slot] = group.size();
  thdRankTestD[slot] = group.thread_rank();

  // sync: threads 0 and 1 each publish one value; after the barrier every thread must see
  // both of them.
  __shared__ int sm[2];
  const unsigned int lane = threadIdx.x;
  if (lane < 2) {
    sm[lane] = (lane == 0) ? 10 : 20;
  }
  group.sync();
  syncTestD[slot] = sm[1] * sm[0];
}
|
||||
|
||||
static void test_cg_thread_block_type_via_base_type(int blockSize)
|
||||
{
|
||||
int nBytes = sizeof(int) * 2 * blockSize;
|
||||
int *sizeTestD, *sizeTestH;
|
||||
int *thdRankTestD, *thdRankTestH;
|
||||
int *syncTestD, *syncTestH;
|
||||
|
||||
// Allocate device memory
|
||||
HIPCHECK(hipMalloc(&sizeTestD, nBytes));
|
||||
HIPCHECK(hipMalloc(&thdRankTestD, nBytes));
|
||||
HIPCHECK(hipMalloc(&syncTestD, nBytes));
|
||||
|
||||
// Allocate host memory
|
||||
HIPCHECK(hipHostMalloc(&sizeTestH, nBytes));
|
||||
HIPCHECK(hipHostMalloc(&thdRankTestH, nBytes));
|
||||
HIPCHECK(hipHostMalloc(&syncTestH, nBytes));
|
||||
|
||||
// Launch Kernel
|
||||
hipLaunchKernelGGL(kernel_cg_thread_block_type_via_base_type,
|
||||
2,
|
||||
blockSize,
|
||||
0,
|
||||
0,
|
||||
sizeTestD,
|
||||
thdRankTestD,
|
||||
syncTestD);
|
||||
|
||||
// Copy result from device to host
|
||||
HIPCHECK(hipMemcpy(sizeTestH, sizeTestD, nBytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(thdRankTestH, thdRankTestD, nBytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(syncTestH, syncTestD, nBytes, hipMemcpyDeviceToHost));
|
||||
|
||||
// Validate results for both blocks together
|
||||
for (int i = 0; i < 2 * blockSize; ++i) {
|
||||
ASSERT_EQUAL(sizeTestH[i], blockSize);
|
||||
ASSERT_EQUAL(thdRankTestH[i], i % blockSize);
|
||||
ASSERT_EQUAL(syncTestH[i], 200);
|
||||
}
|
||||
|
||||
// Free device memory
|
||||
HIPCHECK(hipFree(sizeTestD));
|
||||
HIPCHECK(hipFree(thdRankTestD));
|
||||
HIPCHECK(hipFree(syncTestD));
|
||||
|
||||
//Free host memory
|
||||
HIPCHECK(hipHostFree(sizeTestH));
|
||||
HIPCHECK(hipHostFree(thdRankTestH));
|
||||
HIPCHECK(hipHostFree(syncTestH));
|
||||
}
|
||||
|
||||
// Exercises thread_block through its thread_group base type on the current device for
// power-of-two block sizes and for ten pseudo-random block sizes; skipped without
// cooperative launch support.
TEST_CASE("Unit_hipCGThreadBlockType_BaseType") {
  int deviceId;
  HIPCHECK(hipGetDevice(&deviceId));

  hipDeviceProp_t deviceProperties;
  HIPCHECK(hipGetDeviceProperties(&deviceProperties, deviceId));

  if (!deviceProperties.cooperativeLaunch) {
    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
    return;
  }

  // Power-of-two block sizes up to the device limit
  const int blockSizeLimit = deviceProperties.maxThreadsPerBlock;
  int blockSize = 2;
  while (blockSize <= blockSizeLimit) {
    test_cg_thread_block_type_via_base_type(blockSize);
    blockSize *= 2;
  }

  // Fixed seed: the "random" block sizes are the same on every execution
  srand(0);
  for (int run = 0; run < 10; ++run) {
    // Clamp to at least 2 threads: the kernel's sync check reads a shared value that only
    // thread 1 writes.
    const int candidate = rand() % blockSizeLimit;
    test_cg_thread_block_type_via_base_type(max(2, candidate));
  }
}
|
||||
@@ -1,136 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
/* HIT_START
|
||||
* BUILD: %t %s ../../test_common.cpp
|
||||
* TEST: %t
|
||||
* HIT_END
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include "hip/hip_cooperative_groups.h"
|
||||
#include <cstdlib>
|
||||
|
||||
// Use the test framework's HIPASSERT rather than assert(): assert() compiles to nothing when
// NDEBUG is defined, which would silently turn every check in this file into a no-op.
#define ASSERT_EQUAL(lhs, rhs) HIPASSERT((lhs) == (rhs))
|
||||
|
||||
using namespace cooperative_groups;
|
||||
|
||||
// Records, for every launched thread, what the free-function cooperative groups API reports
// for a thread_block: group_size(), thread_rank(), and the product of two shared values
// after sync(). Each output array is indexed by the thread's global 1-D index.
static __global__
void kernel_cg_thread_block_type_via_public_api(int *sizeTestD,
                                                int *thdRankTestD,
                                                int *syncTestD)
{
  thread_block block = this_thread_block();
  const int slot = (blockIdx.x * blockDim.x) + threadIdx.x;

  // group_size and thread_rank free functions
  sizeTestD[slot] = group_size(block);
  thdRankTestD[slot] = thread_rank(block);

  // sync free function: threads 0 and 1 each publish one value; after the barrier every
  // thread must see both of them.
  __shared__ int sm[2];
  const unsigned int lane = threadIdx.x;
  if (lane < 2) {
    sm[lane] = (lane == 0) ? 10 : 20;
  }
  sync(block);
  syncTestD[slot] = sm[1] * sm[0];
}
|
||||
|
||||
static void test_cg_thread_block_type_via_public_api(int blockSize)
|
||||
{
|
||||
int nBytes = sizeof(int) * 2 * blockSize;
|
||||
int *sizeTestD, *sizeTestH;
|
||||
int *thdRankTestD, *thdRankTestH;
|
||||
int *syncTestD, *syncTestH;
|
||||
|
||||
// Allocate device memory
|
||||
HIPCHECK(hipMalloc(&sizeTestD, nBytes));
|
||||
HIPCHECK(hipMalloc(&thdRankTestD, nBytes));
|
||||
HIPCHECK(hipMalloc(&syncTestD, nBytes));
|
||||
|
||||
// Allocate host memory
|
||||
HIPCHECK(hipHostMalloc(&sizeTestH, nBytes));
|
||||
HIPCHECK(hipHostMalloc(&thdRankTestH, nBytes));
|
||||
HIPCHECK(hipHostMalloc(&syncTestH, nBytes));
|
||||
|
||||
// Launch Kernel
|
||||
hipLaunchKernelGGL(kernel_cg_thread_block_type_via_public_api,
|
||||
2,
|
||||
blockSize,
|
||||
0,
|
||||
0,
|
||||
sizeTestD,
|
||||
thdRankTestD,
|
||||
syncTestD);
|
||||
|
||||
// Copy result from device to host
|
||||
HIPCHECK(hipMemcpy(sizeTestH, sizeTestD, nBytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(thdRankTestH, thdRankTestD, nBytes, hipMemcpyDeviceToHost));
|
||||
HIPCHECK(hipMemcpy(syncTestH, syncTestD, nBytes, hipMemcpyDeviceToHost));
|
||||
|
||||
// Validate results for both blocks together
|
||||
for (int i = 0; i < 2 * blockSize; ++i) {
|
||||
ASSERT_EQUAL(sizeTestH[i], blockSize);
|
||||
ASSERT_EQUAL(thdRankTestH[i], i % blockSize);
|
||||
ASSERT_EQUAL(syncTestH[i], 200);
|
||||
}
|
||||
|
||||
// Free device memory
|
||||
HIPCHECK(hipFree(sizeTestD));
|
||||
HIPCHECK(hipFree(thdRankTestD));
|
||||
HIPCHECK(hipFree(syncTestD));
|
||||
|
||||
//Free host memory
|
||||
HIPCHECK(hipHostFree(sizeTestH));
|
||||
HIPCHECK(hipHostFree(thdRankTestH));
|
||||
HIPCHECK(hipHostFree(syncTestH));
|
||||
}
|
||||
|
||||
// Exercises the free-function cooperative groups API on a thread_block for power-of-two
// block sizes and for ten pseudo-random block sizes; skipped without cooperative launch
// support.
TEST_CASE("Unit_hipCGThreadBlockType_PublicApi") {
  int deviceId;
  HIPCHECK(hipGetDevice(&deviceId));

  hipDeviceProp_t deviceProperties;
  HIPCHECK(hipGetDeviceProperties(&deviceProperties, deviceId));

  if (!deviceProperties.cooperativeLaunch) {
    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
    return;
  }

  // Power-of-two block sizes up to the device limit
  const int blockSizeLimit = deviceProperties.maxThreadsPerBlock;
  int blockSize = 2;
  while (blockSize <= blockSizeLimit) {
    test_cg_thread_block_type_via_public_api(blockSize);
    blockSize *= 2;
  }

  // Fixed seed: the "random" block sizes are the same on every execution
  srand(0);
  for (int run = 0; run < 10; ++run) {
    // Clamp to at least 2 threads: the kernel's sync check reads a shared value that only
    // thread 1 writes.
    const int candidate = rand() % blockSizeLimit;
    test_cg_thread_block_type_via_public_api(max(2, candidate));
  }
}
|
||||
@@ -0,0 +1,225 @@
|
||||
/*
|
||||
Copyright (c) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
|
||||
#include "hip_cg_common.hh"
|
||||
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
// Selects which kernel variant test_cg_thread_block_type launches: the thread_block member
// API, the thread_group base-type API, or the free-function API.
enum class ThreadBlockTypeTests { basicApi, baseType, publicApi };
|
||||
|
||||
// Records, for every launched thread, what the cg::thread_block member API reports: size,
// thread_rank, the product of two shared values after sync(), group_index, thread_index and
// group_dim. Each output array is indexed by the thread's global 1-D index.
static __global__ void kernel_cg_thread_block_type(int* size_dev, int* thd_rank_dev, int* sync_dev,
                                                   dim3* group_index_dev, dim3* thd_index_dev,
                                                   dim3* group_dim_dev) {
  cg::thread_block block = cg::this_thread_block();
  const int slot = (blockIdx.x * blockDim.x) + threadIdx.x;

  // size and thread_rank
  size_dev[slot] = block.size();
  thd_rank_dev[slot] = block.thread_rank();

  // sync: threads 0 and 1 each publish one value; after the barrier every thread must see
  // both of them.
  __shared__ int sm[2];
  const unsigned int lane = threadIdx.x;
  if (lane < 2) {
    sm[lane] = (lane == 0) ? 10 : 20;
  }
  block.sync();
  sync_dev[slot] = sm[1] * sm[0];

  // group_index, thread_index and group_dim (number of threads in a block)
  group_index_dev[slot] = block.group_index();
  thd_index_dev[slot] = block.thread_index();
  group_dim_dev[slot] = block.group_dim();
}
|
||||
|
||||
// Records, for every launched thread, what a thread block reports when used through the
// cg::thread_group base type: size, thread_rank, and the product of two shared values after
// sync(). Each output array is indexed by the thread's global 1-D index.
static __global__ void kernel_cg_thread_block_type_via_base_type(int* size_dev, int* thd_rank_dev,
                                                                 int* sync_dev) {
  cg::thread_group group = cg::this_thread_block();
  const int slot = (blockIdx.x * blockDim.x) + threadIdx.x;

  // size and thread_rank
  size_dev[slot] = group.size();
  thd_rank_dev[slot] = group.thread_rank();

  // sync: threads 0 and 1 each publish one value; after the barrier every thread must see
  // both of them.
  __shared__ int sm[2];
  const unsigned int lane = threadIdx.x;
  if (lane < 2) {
    sm[lane] = (lane == 0) ? 10 : 20;
  }
  group.sync();
  sync_dev[slot] = sm[1] * sm[0];
}
|
||||
|
||||
// Records, for every launched thread, what the free-function cooperative groups API reports
// for a cg::thread_block: cg::group_size(), cg::thread_rank(), and the product of two shared
// values after cg::sync(). Each output array is indexed by the thread's global 1-D index.
static __global__ void kernel_cg_thread_block_type_via_public_api(int* size_dev, int* thd_rank_dev,
                                                                  int* sync_dev) {
  cg::thread_block block = cg::this_thread_block();
  const int slot = (blockIdx.x * blockDim.x) + threadIdx.x;

  // group_size and thread_rank free functions
  size_dev[slot] = cg::group_size(block);
  thd_rank_dev[slot] = cg::thread_rank(block);

  // sync free function: threads 0 and 1 each publish one value; after the barrier every
  // thread must see both of them.
  __shared__ int sm[2];
  const unsigned int lane = threadIdx.x;
  if (lane < 2) {
    sm[lane] = (lane == 0) ? 10 : 20;
  }
  cg::sync(block);
  sync_dev[slot] = sm[1] * sm[0];
}
|
||||
|
||||
/**
 * Launches the kernel variant selected by test_type on 2 blocks of block_size threads and
 * validates what the kernel recorded for each of the 2 * block_size threads.
 *
 * @param test_type   Kernel variant to run. Only basicApi allocates, copies back and
 *                    validates the dim3 outputs (group_index, thread_index, group_dim).
 * @param block_size  Number of threads per block.
 */
static void test_cg_thread_block_type(ThreadBlockTypeTests test_type, int block_size) {
  int num_bytes = sizeof(int) * 2 * block_size;
  int num_dim3_bytes = sizeof(dim3) * 2 * block_size;
  int *size_dev, *size_host;
  int *thd_rank_dev, *thd_rank_host;
  int *sync_dev, *sync_host;
  // Only the basicApi path allocates these; keep them null otherwise so an accidental use is
  // a null access rather than a read of an indeterminate pointer.
  dim3 *group_index_dev = nullptr, *group_index_host = nullptr;
  dim3 *thd_index_dev = nullptr, *thd_index_host = nullptr;
  dim3 *group_dim_dev = nullptr, *group_dim_host = nullptr;

  // Allocate device memory
  HIP_CHECK(hipMalloc(&size_dev, num_bytes));
  HIP_CHECK(hipMalloc(&thd_rank_dev, num_bytes));
  HIP_CHECK(hipMalloc(&sync_dev, num_bytes));

  // Allocate host memory
  HIP_CHECK(hipHostMalloc(&size_host, num_bytes));
  HIP_CHECK(hipHostMalloc(&thd_rank_host, num_bytes));
  HIP_CHECK(hipHostMalloc(&sync_host, num_bytes));

  switch (test_type) {
    case (ThreadBlockTypeTests::basicApi):
      HIP_CHECK(hipMalloc(&group_index_dev, num_dim3_bytes));
      HIP_CHECK(hipMalloc(&thd_index_dev, num_dim3_bytes));
      HIP_CHECK(hipMalloc(&group_dim_dev, num_dim3_bytes));
      HIP_CHECK(hipHostMalloc(&group_index_host, num_dim3_bytes));
      HIP_CHECK(hipHostMalloc(&thd_index_host, num_dim3_bytes));
      HIP_CHECK(hipHostMalloc(&group_dim_host, num_dim3_bytes));

      hipLaunchKernelGGL(kernel_cg_thread_block_type, 2, block_size, 0, 0, size_dev, thd_rank_dev,
                         sync_dev, group_index_dev, thd_index_dev, group_dim_dev);
      break;
    case (ThreadBlockTypeTests::baseType):
      hipLaunchKernelGGL(kernel_cg_thread_block_type_via_base_type, 2, block_size, 0, 0, size_dev,
                         thd_rank_dev, sync_dev);
      break;
    case (ThreadBlockTypeTests::publicApi):
      hipLaunchKernelGGL(kernel_cg_thread_block_type_via_public_api, 2, block_size, 0, 0, size_dev,
                         thd_rank_dev, sync_dev);
      break;
  }
  // Surface launch failures here instead of as a confusing result mismatch later.
  HIP_CHECK(hipGetLastError());

  // Copy result from device to host
  HIP_CHECK(hipMemcpy(size_host, size_dev, num_bytes, hipMemcpyDeviceToHost));
  HIP_CHECK(hipMemcpy(thd_rank_host, thd_rank_dev, num_bytes, hipMemcpyDeviceToHost));
  HIP_CHECK(hipMemcpy(sync_host, sync_dev, num_bytes, hipMemcpyDeviceToHost));
  if (test_type == ThreadBlockTypeTests::basicApi) {
    HIP_CHECK(hipMemcpy(group_index_host, group_index_dev, num_dim3_bytes, hipMemcpyDeviceToHost));
    HIP_CHECK(hipMemcpy(thd_index_host, thd_index_dev, num_dim3_bytes, hipMemcpyDeviceToHost));
    HIP_CHECK(hipMemcpy(group_dim_host, group_dim_dev, num_dim3_bytes, hipMemcpyDeviceToHost));
  }

  // Validate results for both blocks together
  for (int i = 0; i < 2 * block_size; ++i) {
    ASSERT_EQUAL(size_host[i], block_size);
    ASSERT_EQUAL(thd_rank_host[i], i % block_size);
    // 200 == 10 * 20, the two values the kernels publish before their sync.
    ASSERT_EQUAL(sync_host[i], 200);
    if (test_type == ThreadBlockTypeTests::basicApi) {
      ASSERT_EQUAL(group_index_host[i].x, (uint)i / block_size);
      ASSERT_EQUAL(group_index_host[i].y, 0);
      ASSERT_EQUAL(group_index_host[i].z, 0);
      ASSERT_EQUAL(thd_index_host[i].x, (uint)i % block_size);
      ASSERT_EQUAL(thd_index_host[i].y, 0);
      ASSERT_EQUAL(thd_index_host[i].z, 0);
      ASSERT_EQUAL(group_dim_host[i].x, block_size);
      ASSERT_EQUAL(group_dim_host[i].y, 1);
      ASSERT_EQUAL(group_dim_host[i].z, 1);
    }
  }

  // Free device memory
  HIP_CHECK(hipFree(size_dev));
  HIP_CHECK(hipFree(thd_rank_dev));
  HIP_CHECK(hipFree(sync_dev));

  // Free host memory
  HIP_CHECK(hipHostFree(size_host));
  HIP_CHECK(hipHostFree(thd_rank_host));
  HIP_CHECK(hipHostFree(sync_host));

  if (test_type == ThreadBlockTypeTests::basicApi) {
    HIP_CHECK(hipFree(group_index_dev));
    HIP_CHECK(hipFree(thd_index_dev));
    HIP_CHECK(hipFree(group_dim_dev));
    HIP_CHECK(hipHostFree(group_index_host));
    HIP_CHECK(hipHostFree(thd_index_host));
    HIP_CHECK(hipHostFree(group_dim_host));
  }
}
|
||||
|
||||
|
||||
/**
 * Runs test_cg_thread_block_type on the current device for every power-of-two
 * block size up to maxThreadsPerBlock, followed by ten pseudo-random block
 * sizes. Each Catch2 SECTION selects which thread_block API flavour is
 * exercised. The test is skipped when the device reports no cooperative
 * launch support.
 */
TEST_CASE("Unit_hipCGThreadBlockType") {
  // Query the properties of the currently selected (default) device.
  int device;
  HIP_CHECK(hipGetDevice(&device));
  hipDeviceProp_t device_properties;
  HIP_CHECK(hipGetDeviceProperties(&device_properties, device));

  if (!device_properties.cooperativeLaunch) {
    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
    return;
  }

  // Each SECTION re-runs the test case with a different API flavour.
  ThreadBlockTypeTests test_type = ThreadBlockTypeTests::basicApi;
  SECTION("Default thread block API test") { test_type = ThreadBlockTypeTests::basicApi; }
  SECTION("Base type thread block API test") { test_type = ThreadBlockTypeTests::baseType; }
  SECTION("Public API thread block test") { test_type = ThreadBlockTypeTests::publicApi; }

  const int max_threads_per_blk = device_properties.maxThreadsPerBlock;

  // Power-of-two block sizes: 2, 4, 8, ... up to the device limit.
  for (int block_size = 2; block_size <= max_threads_per_blk; block_size *= 2) {
    test_cg_thread_block_type(test_type, block_size);
  }

  // Pseudo-random block sizes; the fixed seed makes the sequence repeatable.
  srand(0);
  for (int iteration = 0; iteration < 10; ++iteration) {
    // A block size of 1 is known to fail, so clamp to at least 2.
    test_cg_thread_block_type(test_type, max(2, rand() % max_threads_per_blk));
  }
}
|
||||
@@ -1,385 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2020 - 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
// Test Description:
|
||||
/* This test implements sum reduction kernel, first with each threads own rank
|
||||
as input and comparing the sum with expected sum output derived from n(n-1)/2
|
||||
formula. The second part, partitions this parent group into child subgroups
|
||||
a.k.a. tiles using the tiled_partition() collective operation. This can be called
|
||||
with a static tile size, passed in templated non-type variable-tiled_partition<tileSz>,
|
||||
or in runtime as tiled_partition(thread_group parent, tileSz). This test covers both these
|
||||
cases.
|
||||
This test tests functionality of cg group partitioning, (static and dynamic) and its respective
|
||||
API's size(), thread_rank(), and sync().
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
#include <stdio.h>
|
||||
#include <vector>
|
||||
|
||||
using namespace cooperative_groups;
|
||||
|
||||
/* Parallel reduce kernel.
|
||||
*
|
||||
* Step complexity: O(log n)
|
||||
* Work complexity: O(n)
|
||||
*
|
||||
* Note: This kernel works only with power of 2 input arrays.
|
||||
*/
|
||||
__device__ int reduction_kernel(thread_group g, int* x, int val) {
|
||||
int lane = g.thread_rank();
|
||||
|
||||
for (int i = g.size() / 2; i > 0; i /= 2) {
|
||||
// use lds to store the temporary result
|
||||
x[lane] = val;
|
||||
// Ensure all the stores are completed.
|
||||
g.sync();
|
||||
|
||||
if (lane < i) {
|
||||
val += x[lane + i];
|
||||
}
|
||||
// It must work on one tiled thread group at a time,
|
||||
// and it must make sure all memory operations are
|
||||
// completed before moving to the next stride.
|
||||
// sync() here just does that.
|
||||
g.sync();
|
||||
}
|
||||
|
||||
// Choose the 0'th indexed thread that holds the reduction value to return
|
||||
if (g.thread_rank() == 0) {
|
||||
return val;
|
||||
}
|
||||
// Rest of the threads return no useful values
|
||||
else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
template <unsigned int tileSz>
|
||||
__global__ void kernel_cg_group_partition_static(int* result, bool isGlobalMem, int* globalMem) {
|
||||
thread_block threadBlockCGTy = this_thread_block();
|
||||
int threadBlockGroupSize = threadBlockCGTy.size();
|
||||
|
||||
int* workspace = NULL;
|
||||
|
||||
if (isGlobalMem) {
|
||||
workspace = globalMem;
|
||||
} else {
|
||||
// Declare a shared memory
|
||||
extern __shared__ int sharedMem[];
|
||||
workspace = sharedMem;
|
||||
}
|
||||
|
||||
int input, outputSum, expectedOutput;
|
||||
|
||||
// we pass its own thread rank as inputs
|
||||
input = threadBlockCGTy.thread_rank();
|
||||
|
||||
expectedOutput = (threadBlockGroupSize - 1) * threadBlockGroupSize / 2;
|
||||
|
||||
outputSum = reduction_kernel(threadBlockCGTy, workspace, input);
|
||||
|
||||
// Choose a leader thread to print the results
|
||||
if (threadBlockCGTy.thread_rank() == 0) {
|
||||
printf(" Sum of all ranks 0..%d in threadBlockCooperativeGroup is %d (expected %d)\n\n",
|
||||
(int)threadBlockCGTy.size() - 1, outputSum, expectedOutput);
|
||||
printf(" Creating %d groups, of tile size %d threads:\n\n",
|
||||
(int)threadBlockCGTy.size() / tileSz, tileSz);
|
||||
}
|
||||
|
||||
threadBlockCGTy.sync();
|
||||
|
||||
thread_block_tile<tileSz> tiledPartition = tiled_partition<tileSz>(threadBlockCGTy);
|
||||
|
||||
// This offset allows each group to have its own unique area in the workspace array
|
||||
int workspaceOffset = threadBlockCGTy.thread_rank() - tiledPartition.thread_rank();
|
||||
|
||||
outputSum = reduction_kernel(tiledPartition, workspace + workspaceOffset, input);
|
||||
|
||||
if (tiledPartition.thread_rank() == 0) {
|
||||
printf(
|
||||
" Sum of all ranks 0..%d in this tiledPartition group is %d. Corresponding parent thread "
|
||||
"rank via meta_group_rank : %d and the total number of groups created when partitioned : %d\n",
|
||||
tiledPartition.size() - 1, outputSum, tiledPartition.meta_group_rank(), tiledPartition.meta_group_size());
|
||||
result[input / (tileSz)] = outputSum;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Reduces the thread ranks of the whole thread block, then splits the block
 * into tiles of a run-time size with tiled_partition(parent, tileSz) and
 * reduces the ranks again per tile.
 *
 * tileSz      - number of threads per tile
 * result      - rank 0 of each tile stores its tile sum at result[rank / tileSz]
 * isGlobalMem - selects globalMem (true) or dynamic shared memory (false) as
 *               the reduction workspace
 * globalMem   - workspace used when isGlobalMem is true
 */
__global__ void kernel_cg_group_partition_dynamic(unsigned int tileSz, int* result,
                                                  bool isGlobalMem, int* globalMem) {
  thread_block block = this_thread_block();

  // Dynamic shared memory, used as workspace when global memory is not requested.
  extern __shared__ int sharedMem[];
  int* workspace = isGlobalMem ? globalMem : sharedMem;

  // Every thread contributes its own rank within the block.
  const int input = block.thread_rank();

  int outputSum = reduction_kernel(block, workspace, input);

  // The block leader reports the block-wide result.
  if (block.thread_rank() == 0) {
    printf(" Sum of all ranks 0..%d in threadBlockCooperativeGroup is %d\n\n",
           (int)block.size() - 1, outputSum);
    printf(" Creating %d groups, of tile size %d threads:\n\n", (int)block.size() / tileSz,
           tileSz);
  }

  block.sync();

  thread_group tile = tiled_partition(block, tileSz);

  // Offset of this tile's first thread, giving each tile its own workspace slice.
  const int workspaceOffset = block.thread_rank() - tile.thread_rank();

  outputSum = reduction_kernel(tile, workspace + workspaceOffset, input);

  // The tile leader reports and stores the per-tile sum.
  if (tile.thread_rank() == 0) {
    printf(
        " Sum of all ranks 0..%d in this tiledPartition group is %d. Corresponding parent thread "
        " %d\n", tile.size() - 1, outputSum, input);
    result[input / (tileSz)] = outputSum;
  }
}
|
||||
|
||||
// Search if the sum exists in the expected results array
|
||||
void verifyResults(int* hPtr, int* dPtr, int size) {
|
||||
int i = 0, j = 0;
|
||||
for (i = 0; i < size; i++) {
|
||||
for (j = 0; j < size; j++) {
|
||||
if (hPtr[i] == dPtr[j]) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (j == size) {
|
||||
REQUIRE(" Result verification failed!");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <unsigned int tileSz> static void test_group_partition(bool useGlobalMem) {
|
||||
hipError_t err;
|
||||
int blockSize = 1;
|
||||
int threadsPerBlock = 64;
|
||||
|
||||
int numTiles = (blockSize * threadsPerBlock) / tileSz;
|
||||
|
||||
// Build an array of expected reduction sum output on the host
|
||||
// based on the sum of their respective thread ranks for verification.
|
||||
// eg: parent group has 64threads.
|
||||
// child thread ranks: 0-15, 16-31, 32-47, 48-63
|
||||
// expected sum: 120, 376, 632, 888
|
||||
int* expectedSum = new int[numTiles];
|
||||
int temp = 0, sum = 0;
|
||||
|
||||
for (int i = 1; i <= numTiles; i++) {
|
||||
sum = temp;
|
||||
temp = (((tileSz * i) - 1) * (tileSz * i)) / 2;
|
||||
expectedSum[i-1] = temp - sum;
|
||||
}
|
||||
|
||||
int* dResult = NULL;
|
||||
HIPCHECK(hipMalloc((void**)&dResult, numTiles * sizeof(int)));
|
||||
|
||||
int* globalMem = NULL;
|
||||
if (useGlobalMem) {
|
||||
HIPCHECK(hipMalloc((void**)&globalMem, threadsPerBlock * sizeof(int)));
|
||||
}
|
||||
|
||||
int* hResult = NULL;
|
||||
HIPCHECK(hipHostMalloc(&hResult, numTiles * sizeof(int), hipHostMallocDefault));
|
||||
memset(hResult, 0, numTiles * sizeof(int));
|
||||
|
||||
if (useGlobalMem) {
|
||||
// Launch Kernel
|
||||
hipLaunchKernelGGL(kernel_cg_group_partition_static<tileSz>, blockSize, threadsPerBlock, 0, 0,
|
||||
dResult, useGlobalMem, globalMem);
|
||||
err = hipDeviceSynchronize();
|
||||
if (err != hipSuccess) {
|
||||
fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
|
||||
}
|
||||
} else {
|
||||
// Launch Kernel
|
||||
hipLaunchKernelGGL(kernel_cg_group_partition_static<tileSz>, blockSize, threadsPerBlock,
|
||||
threadsPerBlock * sizeof(int), 0, dResult, useGlobalMem, globalMem);
|
||||
err = hipDeviceSynchronize();
|
||||
if (err != hipSuccess) {
|
||||
fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
|
||||
}
|
||||
}
|
||||
|
||||
HIPCHECK(hipMemcpy(hResult, dResult, numTiles * sizeof(int), hipMemcpyDeviceToHost));
|
||||
|
||||
verifyResults(expectedSum, hResult, numTiles);
|
||||
|
||||
// Free all allocated memory on host and device
|
||||
HIPCHECK(hipFree(dResult));
|
||||
HIPCHECK(hipFree(hResult));
|
||||
if (useGlobalMem) {
|
||||
HIPCHECK(hipFree(globalMem));
|
||||
}
|
||||
delete[] expectedSum;
|
||||
|
||||
printf("\n...PASSED.\n\n");
|
||||
}
|
||||
|
||||
static void test_group_partition(unsigned int tileSz, bool useGlobalMem) {
|
||||
hipError_t err;
|
||||
int blockSize = 1;
|
||||
int threadsPerBlock = 64;
|
||||
|
||||
int numTiles = (blockSize * threadsPerBlock) / tileSz;
|
||||
// Build an array of expected reduction sum output on the host
|
||||
// based on the sum of their respective thread ranks to use for verification
|
||||
int* expectedSum = new int[numTiles];
|
||||
int temp = 0, sum = 0;
|
||||
for (int i = 1; i <= numTiles; i++) {
|
||||
sum = temp;
|
||||
temp = (((tileSz * i) - 1) * (tileSz * i)) / 2;
|
||||
expectedSum[i-1] = temp - sum;
|
||||
}
|
||||
|
||||
int* dResult = NULL;
|
||||
HIPCHECK(hipMalloc(&dResult, sizeof(int) * numTiles));
|
||||
|
||||
int* globalMem = NULL;
|
||||
if (useGlobalMem) {
|
||||
HIPCHECK(hipMalloc((void**)&globalMem, threadsPerBlock * sizeof(int)));
|
||||
}
|
||||
|
||||
int* hResult = NULL;
|
||||
HIPCHECK(hipHostMalloc(&hResult, numTiles * sizeof(int), hipHostMallocDefault));
|
||||
memset(hResult, 0, numTiles * sizeof(int));
|
||||
|
||||
// Launch Kernel
|
||||
if (useGlobalMem) {
|
||||
hipLaunchKernelGGL(kernel_cg_group_partition_dynamic, blockSize, threadsPerBlock, 0, 0, tileSz,
|
||||
dResult, useGlobalMem, globalMem);
|
||||
|
||||
err = hipDeviceSynchronize();
|
||||
if (err != hipSuccess) {
|
||||
fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
|
||||
}
|
||||
} else {
|
||||
hipLaunchKernelGGL(kernel_cg_group_partition_dynamic, blockSize, threadsPerBlock,
|
||||
threadsPerBlock * sizeof(int), 0, tileSz, dResult, useGlobalMem, globalMem);
|
||||
|
||||
err = hipDeviceSynchronize();
|
||||
if (err != hipSuccess) {
|
||||
fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
|
||||
}
|
||||
}
|
||||
|
||||
HIPCHECK(hipMemcpy(hResult, dResult, numTiles * sizeof(int), hipMemcpyDeviceToHost));
|
||||
|
||||
verifyResults(expectedSum, hResult, numTiles);
|
||||
|
||||
// Free all allocated memory on host and device
|
||||
HIPCHECK(hipFree(dResult));
|
||||
HIPCHECK(hipFree(hResult));
|
||||
if (useGlobalMem) {
|
||||
HIPCHECK(hipFree(globalMem));
|
||||
}
|
||||
delete[] expectedSum;
|
||||
|
||||
printf("\n...PASSED.\n\n");
|
||||
}
|
||||
|
||||
/**
 * Exercises static (templated) and dynamic (run-time) tiled_partition with
 * tile sizes 2, 4, 8, 16 and 32, first with a global-memory workspace and
 * then with a shared-memory workspace. Skipped when the device reports no
 * cooperative launch support.
 */
TEST_CASE("Unit_tiled_partition") {
  // Use default device for validating the test
  int deviceId;
  HIP_CHECK_ERROR(hipGetDevice(&deviceId), hipSuccess);
  hipDeviceProp_t deviceProperties;
  HIP_CHECK_ERROR(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess);

  if (!deviceProperties.cooperativeLaunch) {
    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
    // Without this return the test body would still run on an unsupported device.
    return;
  }

  // Runs the five static tile sizes for one workspace kind.
  auto runStaticTests = [](bool useGlobalMem, const char* memoryKind) {
    std::cout << "Testing static tiled_partition for different tile sizes" << std::endl;
    std::cout << "\nUsing " << memoryKind << " memory for computation\n";
    std::cout << "TEST 1:" << '\n' << std::endl;
    test_group_partition<2>(useGlobalMem);
    std::cout << "TEST 2:" << '\n' << std::endl;
    test_group_partition<4>(useGlobalMem);
    std::cout << "TEST 3:" << '\n' << std::endl;
    test_group_partition<8>(useGlobalMem);
    std::cout << "TEST 4:" << '\n' << std::endl;
    test_group_partition<16>(useGlobalMem);
    std::cout << "TEST 5:" << '\n' << std::endl;
    test_group_partition<32>(useGlobalMem);
  };

  // Runs every dynamic tile size for one workspace kind.
  const std::vector<unsigned int> tileSizes = {2, 4, 8, 16, 32};
  auto runDynamicTests = [&tileSizes](bool useGlobalMem, const char* memoryKind) {
    int testNo = 1;
    std::cout << "\nUsing " << memoryKind << " memory for computation\n";
    for (auto tileSz : tileSizes) {
      std::cout << "TEST " << testNo << ":" << '\n' << std::endl;
      test_group_partition(tileSz, useGlobalMem);
      testNo++;
    }
  };

  /* Test static tile_partition */
  runStaticTests(true, "global");
  runStaticTests(false, "shared");

  std::cout << "Now testing dynamic tiled_partition for different tile sizes" << '\n' << std::endl;

  /* Test dynamic group partition */
  runDynamicTests(true, "global");
  runDynamicTests(false, "shared");

  printf("\n...PASSED.\n\n");
}
|
||||
@@ -0,0 +1,279 @@
|
||||
/*
|
||||
Copyright (c) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
// Test Description:
|
||||
/* This test implements sum reduction kernel, first with each threads own rank
|
||||
as input and comparing the sum with expected sum output derived from n(n-1)/2
|
||||
formula. The second part, partitions this parent group into child subgroups
|
||||
a.k.a. tiles using the tiled_partition() collective operation. This can be called
|
||||
with a static tile size, passed in templated non-type variable-tiled_partition<tileSz>,
|
||||
or in runtime as tiled_partition(thread_group parent, tileSz). This test covers both these
|
||||
cases.
|
||||
This test tests functionality of cg group partitioning, (static and dynamic) and its respective
|
||||
API's size(), thread_rank(), and sync().
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "hip_cg_common.hh"
|
||||
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
/* Parallel reduce kernel.
|
||||
*
|
||||
* Step complexity: O(log n)
|
||||
* Work complexity: O(n)
|
||||
*
|
||||
* Note: This kernel works only with power of 2 input arrays.
|
||||
*/
|
||||
__device__ int reduction_kernel(cg::thread_group g, int* x, int val) {
|
||||
int lane = g.thread_rank();
|
||||
|
||||
for (int i = g.size() / 2; i > 0; i /= 2) {
|
||||
// use lds to store the temporary result
|
||||
x[lane] = val;
|
||||
// Ensure all the stores are completed.
|
||||
g.sync();
|
||||
|
||||
if (lane < i) {
|
||||
val += x[lane + i];
|
||||
}
|
||||
// It must work on one tiled thread group at a time,
|
||||
// and it must make sure all memory operations are
|
||||
// completed before moving to the next stride.
|
||||
// sync() here just does that.
|
||||
g.sync();
|
||||
}
|
||||
|
||||
// Choose the 0'th indexed thread that holds the reduction value to return
|
||||
if (g.thread_rank() == 0) {
|
||||
return val;
|
||||
}
|
||||
// Rest of the threads return no useful values
|
||||
else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
template <unsigned int tile_size>
|
||||
__global__ void kernel_cg_group_partition_static(int* result, bool is_global_mem, int* global_mem) {
|
||||
cg::thread_block thread_block_CG_ty = cg::this_thread_block();
|
||||
|
||||
int* workspace = NULL;
|
||||
|
||||
if (is_global_mem) {
|
||||
workspace = global_mem;
|
||||
} else {
|
||||
// Declare a shared memory
|
||||
extern __shared__ int shared_mem[];
|
||||
workspace = shared_mem;
|
||||
}
|
||||
|
||||
int input, output_sum, expected_output;
|
||||
|
||||
// input to reduction, for each thread, is its' rank in the group
|
||||
input = thread_block_CG_ty.thread_rank();
|
||||
|
||||
expected_output = (thread_block_CG_ty.size() - 1) * thread_block_CG_ty.size() / 2;
|
||||
|
||||
output_sum = reduction_kernel(thread_block_CG_ty, workspace, input);
|
||||
|
||||
if (thread_block_CG_ty.thread_rank() == 0) {
|
||||
printf(" Sum of all ranks 0..%d in threadBlockCooperativeGroup is %d (expected %d)\n\n",
|
||||
(int)thread_block_CG_ty.size() - 1, output_sum, expected_output);
|
||||
printf(" Creating %d groups, of tile size %d threads:\n\n",
|
||||
(int)thread_block_CG_ty.size() / tile_size, tile_size);
|
||||
}
|
||||
|
||||
thread_block_CG_ty.sync();
|
||||
|
||||
cg::thread_block_tile<tile_size> tiled_part = cg::tiled_partition<tile_size>(thread_block_CG_ty);
|
||||
|
||||
// This offset allows each group to have its own unique area in the workspace array
|
||||
int workspace_offset = thread_block_CG_ty.thread_rank() - tiled_part.thread_rank();
|
||||
|
||||
output_sum = reduction_kernel(tiled_part, workspace + workspace_offset, input);
|
||||
|
||||
if (tiled_part.thread_rank() == 0) {
|
||||
printf(
|
||||
" Sum of all ranks 0..%d in this tiledPartition group is %d. Corresponding parent thread "
|
||||
"rank: via meta_group_rank : %d and the total number of groups created when partitioned : "
|
||||
"%d\n",
|
||||
tiled_part.size() - 1, output_sum, tiled_part.meta_group_rank(),
|
||||
tiled_part.meta_group_size());
|
||||
result[input / (tile_size)] = output_sum;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
/**
 * Reduces the thread ranks of the whole thread block, then splits the block
 * into tiles of a run-time size with cg::tiled_partition(parent, tile_size)
 * and reduces the ranks again per tile.
 *
 * tile_size     - number of threads per tile
 * result        - rank 0 of each tile stores its tile sum at
 *                 result[rank / tile_size]
 * is_global_mem - selects global_mem (true) or dynamic shared memory (false)
 *                 as the reduction workspace
 * global_mem    - workspace used when is_global_mem is true
 */
__global__ void kernel_cg_group_partition_dynamic(unsigned int tile_size, int* result,
                                                  bool is_global_mem, int* global_mem) {
  cg::thread_block block = cg::this_thread_block();

  // Dynamic shared memory, used as workspace when global memory is not requested.
  extern __shared__ int shared_mem[];
  int* workspace = is_global_mem ? global_mem : shared_mem;

  // Every thread contributes its own rank within the block.
  const int input = block.thread_rank();

  int output_sum = reduction_kernel(block, workspace, input);

  // The block leader reports the block-wide result.
  if (block.thread_rank() == 0) {
    printf("\n\n\n Sum of all ranks 0..%d in threadBlockCooperativeGroup is %d\n\n",
           (int)block.size() - 1, output_sum);
    printf(" Creating %d groups, of tile size %d threads:\n\n", (int)block.size() / tile_size,
           tile_size);
  }

  block.sync();

  cg::thread_group tile = cg::tiled_partition(block, tile_size);

  // Offset of this tile's first thread, giving each tile its own workspace slice.
  const int workspace_offset = block.thread_rank() - tile.thread_rank();

  output_sum = reduction_kernel(tile, workspace + workspace_offset, input);

  // The tile leader reports and stores the per-tile sum.
  if (tile.thread_rank() == 0) {
    printf(
        " Sum of all ranks 0..%d in this tiledPartition group is %d. Corresponding parent thread "
        "rank: %d\n",
        static_cast<int>(tile.size()) - 1, output_sum, input);
    result[input / (tile_size)] = output_sum;
  }
}
|
||||
|
||||
template <typename F>
|
||||
static void common_group_partition(F kernel_func, unsigned int tile_size, void** params,
|
||||
size_t num_params, bool use_global_mem) {
|
||||
int block_size = 1;
|
||||
int threads_per_blk = 64;
|
||||
|
||||
int num_tiles = (block_size * threads_per_blk) / tile_size;
|
||||
|
||||
// Build an array of expected reduction sum output on the host
|
||||
// based on the sum of their respective thread ranks for verification.
|
||||
// eg: parent group has 64threads.
|
||||
// child thread ranks: 0-15, 16-31, 32-47, 48-63
|
||||
// expected sum: 120, 376, 632, 888
|
||||
int* expected_sum = new int[num_tiles];
|
||||
int temp = 0, sum = 0;
|
||||
|
||||
for (int i = 1; i <= num_tiles; i++) {
|
||||
sum = temp;
|
||||
temp = (((tile_size * i) - 1) * (tile_size * i)) / 2;
|
||||
expected_sum[i - 1] = temp - sum;
|
||||
}
|
||||
|
||||
int* result_dev = NULL;
|
||||
HIP_CHECK(hipMalloc((void**)&result_dev, num_tiles * sizeof(int)));
|
||||
|
||||
int* global_mem = NULL;
|
||||
if (use_global_mem) {
|
||||
HIP_CHECK(hipMalloc((void**)&global_mem, threads_per_blk * sizeof(int)));
|
||||
}
|
||||
|
||||
int* result_host = NULL;
|
||||
HIP_CHECK(hipHostMalloc(&result_host, num_tiles * sizeof(int), hipHostMallocDefault));
|
||||
memset(result_host, 0, num_tiles * sizeof(int));
|
||||
|
||||
params[num_params + 0] = &result_dev;
|
||||
params[num_params + 1] = &use_global_mem;
|
||||
params[num_params + 2] = &global_mem;
|
||||
|
||||
if (use_global_mem) {
|
||||
// Launch Kernel
|
||||
HIP_CHECK(hipLaunchCooperativeKernel(kernel_func, block_size, threads_per_blk, params, 0, 0));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
} else {
|
||||
// Launch Kernel
|
||||
HIP_CHECK(hipLaunchCooperativeKernel(kernel_func, block_size, threads_per_blk, params,
|
||||
threads_per_blk * sizeof(int), 0));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
}
|
||||
|
||||
HIP_CHECK(hipMemcpy(result_host, result_dev, num_tiles * sizeof(int), hipMemcpyDeviceToHost));
|
||||
|
||||
verifyResults(expected_sum, result_host, num_tiles);
|
||||
|
||||
// Free all allocated memory on host and device
|
||||
HIP_CHECK(hipFree(result_dev));
|
||||
HIP_CHECK(hipHostFree(result_host));
|
||||
if (use_global_mem) {
|
||||
HIP_CHECK(hipFree(global_mem));
|
||||
}
|
||||
delete[] expected_sum;
|
||||
}
|
||||
|
||||
/**
 * Runs the tile-partition test with the statically sized kernel
 * kernel_cg_group_partition_static<tile_size>.
 *
 * use_global_mem - forwarded to common_group_partition to select the
 *                  reduction workspace.
 */
template <unsigned int tile_size> static void test_group_partition(bool use_global_mem) {
  // The static kernel has no arguments of its own; all three slots are filled
  // by common_group_partition.
  void* kernel_args[3];
  const size_t caller_args = 0;
  common_group_partition(kernel_cg_group_partition_static<tile_size>, tile_size, kernel_args,
                         caller_args, use_global_mem);
}
|
||||
|
||||
/**
 * Runs the tile-partition test with the dynamically sized kernel
 * kernel_cg_group_partition_dynamic.
 *
 * tile_size      - tile size passed to the kernel as its first argument
 * use_global_mem - forwarded to common_group_partition to select the
 *                  reduction workspace.
 */
static void test_group_partition(unsigned int tile_size, bool use_global_mem) {
  // Slot 0 carries the tile size; the remaining three slots are filled by
  // common_group_partition.
  void* kernel_args[4];
  kernel_args[0] = &tile_size;
  const size_t caller_args = 1;
  common_group_partition(kernel_cg_group_partition_dynamic, tile_size, kernel_args, caller_args,
                         use_global_mem);
}
|
||||
|
||||
/**
 * Exercises static and dynamic tile partitioning for tile sizes 2, 4, 8, 16
 * and 32, once with a global-memory workspace and once with a shared-memory
 * workspace. Skipped when the device reports no cooperative launch support.
 */
TEST_CASE("Unit_hipCGThreadBlockTileType") {
  // Query the properties of the currently selected (default) device.
  int device;
  HIP_CHECK(hipGetDevice(&device));
  hipDeviceProp_t device_properties;
  HIP_CHECK(hipGetDeviceProperties(&device_properties, device));

  if (!device_properties.cooperativeLaunch) {
    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
    return;
  }

  // Each section below runs for both workspace kinds.
  const bool use_global_mem = GENERATE(true, false);

  SECTION("Static tile partition") {
    // Tile size is a template argument, so each size needs its own call.
    test_group_partition<2>(use_global_mem);
    test_group_partition<4>(use_global_mem);
    test_group_partition<8>(use_global_mem);
    test_group_partition<16>(use_global_mem);
    test_group_partition<32>(use_global_mem);
  }

  SECTION("Dynamic tile partition") {
    // Tile size is a run-time value supplied by the generator.
    const unsigned int tile_size = GENERATE(2, 4, 8, 16, 32);
    test_group_partition(tile_size, use_global_mem);
  }
}
|
||||
+606
@@ -0,0 +1,606 @@
|
||||
/*
|
||||
Copyright (c) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
// Test Description:
|
||||
/*The general idea of the application is to test how multi-GPU Cooperative
|
||||
Groups kernel launches to a stream interact with other things that may be
|
||||
simultaneously running in the same streams.
|
||||
|
||||
The HIP specification says that a multi-GPU cooperative launch will wait
|
||||
until all of the streams it's using finish their work. Only then will the
|
||||
cooperative kernel be launched to all of the devices. Then no other work
|
||||
can take place in any of the streams until all of the multi-GPU
|
||||
cooperative work is done.
|
||||
|
||||
However, there are flags that allow you to disable each of these
|
||||
serialization points: hipCooperativeLaunchMultiDeviceNoPreSync and
|
||||
hipCooperativeLaunchMultiDeviceNoPostSync.
|
||||
|
||||
As such, this benchmark tests the following five situations launching
|
||||
to two GPUs (and thus two streams):
|
||||
|
||||
1. Normal multi-GPU cooperative kernel:
|
||||
This should result in the following pattern:
|
||||
Stream 0: Cooperative
|
||||
Stream 1: Cooperative
|
||||
2. Regular kernel launches and multi-GPU cooperative kernel launches
|
||||
with the default flags, resulting in the following pattern:
|
||||
Stream 0: Regular --> Cooperative
|
||||
Stream 1: --> Cooperative --> Regular
|
||||
|
||||
3. Regular kernel launches and multi-GPU cooperative kernel launches
|
||||
that turn off "pre-sync". This should allow a cooperative kernel
|
||||
to launch even if work is already in a stream pointing to
|
||||
another GPU.
|
||||
This should result in the following pattern:
|
||||
Stream 0: Regular --> Cooperative
|
||||
Stream 1: Cooperative --> Regular
|
||||
|
||||
4. Regular kernel launches and multi-GPU cooperative kernel launches
|
||||
that turn off "post-sync". This should allow a new kernel to enter
|
||||
a GPU even if another GPU still has a cooperative kernel on it.
|
||||
This should result in the following pattern:
|
||||
Stream 0: Regular --> Cooperative
|
||||
Stream 1: --> Cooperative--> Regular
|
||||
|
||||
5. Regular kernel launches and multi-GPU cooperative kernel launches
|
||||
that turn off both pre- and post-sync. This should allow any of
|
||||
the kernels to launch to their GPU regardless of the status of
|
||||
other kernels in other multi-GPU stream groups.
|
||||
This should result in the following pattern:
|
||||
Stream 0: Regular --> Cooperative
|
||||
Stream 1: Cooperative --> Regular
|
||||
|
||||
We time how long it takes to run each of these benchmarks and print it as
|
||||
the output of the benchmark. The kernels themselves are just useless time-
|
||||
wasting code so that the kernel takes a meaningful amount of time on the
|
||||
GPU before it exits. We only launch a single wavefront for each kernel, so
|
||||
any serialization should not be because of GPU occupancy concerns.
|
||||
|
||||
If tests 2, 3, and 4 take roughly 3x as long as #1, that implies that
|
||||
cooperative kernels are serialized as expected.
|
||||
|
||||
If test #5 takes roughly twice as long as #1, that implies that the
|
||||
overlap-allowing flags work as expected.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
// Number of elements in the input buffer handed to each device by the reduction test.
static constexpr size_t kBufferLen = 1024 * 1024;
|
||||
|
||||
/**
 * Multi-GPU reduction kernel used by the multi-device cooperative launch test.
 *
 * Each thread accumulates a strided slice of `buf`, the block combines those
 * partial sums through dynamic shared memory, the first thread of the grid
 * folds the per-block totals together, and finally the threads of grid rank 0
 * add up the per-grid totals.
 *
 * @param buf       input values summed by this grid
 * @param buf_size  number of elements of `buf` to sum
 * @param tmp_buf   scratch array indexed by block id (one slot per block)
 * @param result    slot 0 receives the total over all grids; slot
 *                  1 + grid_rank receives this grid's own total
 *
 * The kernel indexes the dynamic shared array by threadIdx.x, so the launch
 * has to supply at least blockDim.x * sizeof(long) bytes of shared memory.
 */
__global__ void test_gws(uint* buf, uint buf_size, long* tmp_buf, long* result) {
  extern __shared__ long tmp[];
  const uint num_blocks = gridDim.x;
  const uint block = blockIdx.x;
  const uint lane = threadIdx.x;
  const uint stride = gridDim.x * blockDim.x;

  // Per-thread partial sum over a strided slice of the input.
  long sum = 0;
  for (uint idx = block * blockDim.x + lane; idx < buf_size; idx += stride) {
    sum += buf[idx];
  }
  tmp[lane] = sum;
  __syncthreads();

  // The first thread of each block reduces the block's shared-memory slots.
  if (lane == 0) {
    sum = 0;
    for (uint t = 0; t < blockDim.x; ++t) {
      sum += tmp[t];
    }
    tmp_buf[block] = sum;
  }

  // Every block must have published its total before the grid-level reduction.
  cg::this_grid().sync();

  if ((block * blockDim.x + lane) == 0) {
    // `sum` already holds block 0's total here, so continue from block 1.
    for (uint b = 1; b < num_blocks; ++b) {
      sum += tmp_buf[b];
    }
    result[1 + cg::this_multi_grid().grid_rank()] = sum;
  }

  // Every grid must have published its total before the final reduction.
  cg::this_multi_grid().sync();

  if (cg::this_multi_grid().grid_rank() == 0) {
    sum = 0;
    for (uint g = 1; g <= cg::this_multi_grid().num_grids(); ++g) {
      sum += result[g];
    }
    *result = sum;
  }
}
|
||||
|
||||
/**
 * Busy-wait kernel for the multi-device cooperative launch timing tests.
 *
 * Threads of the grid whose multi-grid rank equals `fast_gpu` do no work.
 * All other threads run `loops` iterations; each iteration spins until at
 * least 1000000 clock64() ticks have been accumulated and then adds the
 * current clock64() value to the thread's own slot in `array`.
 *
 * @param loops     number of spin iterations per thread
 * @param array     per-thread output slots, indexed by global thread id
 * @param fast_gpu  multi-grid rank that should return immediately
 */
__global__ void test_coop_kernel(unsigned int loops, long long* array, int fast_gpu) {
  cg::multi_grid_group mgrid = cg::this_multi_grid();
  const unsigned int slot = blockIdx.x * blockDim.x + threadIdx.x;

  if (mgrid.grid_rank() != fast_gpu) {
    for (int iter = 0; iter < loops; iter++) {
      long long elapsed = 0;
      long long previous = clock64();
      while (true) {
        const long long now = clock64();
        // A counter that went backwards has rolled over; the amount that was
        // skipped is unknown, so that interval is simply not counted.
        if (now > previous) {
          elapsed += (now - previous);
        }
        previous = now;
        if (!(elapsed < 1000000)) {
          break;
        }
      }
      array[slot] += clock64();
    }
  }
}
|
||||
|
||||
/**
 * Variant of the cooperative busy-wait kernel that reads wall_clock64()
 * instead of clock64(). The body is only compiled when HT_AMD is set;
 * otherwise the kernel is empty.
 *
 * Threads of the grid whose multi-grid rank equals `fast_gpu` do no work.
 * All other threads run `loops` iterations; each iteration spins until at
 * least 1000000 ticks have been accumulated and then adds the current
 * wall_clock64() value to the thread's own slot in `array`.
 */
__global__ void test_coop_kernel_gfx11(unsigned int loops, long long* array, int fast_gpu) {
#if HT_AMD
  cg::multi_grid_group mgrid = cg::this_multi_grid();
  const unsigned int slot = blockIdx.x * blockDim.x + threadIdx.x;

  if (mgrid.grid_rank() != fast_gpu) {
    for (int iter = 0; iter < loops; iter++) {
      long long elapsed = 0;
      long long previous = wall_clock64();
      while (true) {
        const long long now = wall_clock64();
        // A counter that went backwards has rolled over; the amount that was
        // skipped is unknown, so that interval is simply not counted.
        if (now > previous) {
          elapsed += (now - previous);
        }
        previous = now;
        if (!(elapsed < 1000000)) {
          break;
        }
      }
      array[slot] += wall_clock64();
    }
  }
#endif
}
|
||||
|
||||
/**
 * Regular (non-cooperative) busy-wait kernel for the timing tests.
 *
 * Every thread runs `loops` iterations; each iteration spins until at least
 * 1000000 clock64() ticks have been accumulated and then adds the current
 * clock64() value to the thread's own slot in `array`.
 *
 * @param loops  number of spin iterations per thread
 * @param array  per-thread output slots, indexed by global thread id
 */
__global__ void test_kernel(uint32_t loops, unsigned long long* array) {
  const unsigned int slot = blockIdx.x * blockDim.x + threadIdx.x;

  for (int iter = 0; iter < loops; iter++) {
    long long elapsed = 0;
    long long previous = clock64();
    while (true) {
      const long long now = clock64();
      // A counter that went backwards has rolled over; the amount that was
      // skipped is unknown, so that interval is simply not counted.
      if (now > previous) {
        elapsed += (now - previous);
      }
      previous = now;
      if (!(elapsed < 1000000)) {
        break;
      }
    }
    array[slot] += clock64();
  }
}
|
||||
|
||||
/**
 * Variant of the regular busy-wait kernel that reads wall_clock64() instead
 * of clock64(). The body is only compiled when HT_AMD is set; otherwise the
 * kernel is empty.
 *
 * Every thread runs `loops` iterations; each iteration spins until at least
 * 1000000 ticks have been accumulated and then adds the current
 * wall_clock64() value to the thread's own slot in `array`.
 */
__global__ void test_kernel_gfx11(uint32_t loops, unsigned long long* array) {
#if HT_AMD
  const unsigned int slot = blockIdx.x * blockDim.x + threadIdx.x;

  for (int iter = 0; iter < loops; iter++) {
    long long elapsed = 0;
    long long previous = wall_clock64();
    while (true) {
      const long long now = wall_clock64();
      // A counter that went backwards has rolled over; the amount that was
      // skipped is unknown, so that interval is simply not counted.
      if (now > previous) {
        elapsed += (now - previous);
      }
      previous = now;
      if (!(elapsed < 1000000)) {
        break;
      }
    }
    array[slot] += wall_clock64();
  }
#endif
}
|
||||
|
||||
/**
 * Requires that a multi-kernel run time lies within a band around the
 * single-kernel baseline.
 *
 * @param single_kernel_time  baseline duration
 * @param multi_kernel_time   duration being checked
 * @param low_bound           smallest accepted multiple of the baseline
 * @param high_bound          largest accepted multiple of the baseline
 */
static void verify_time(double single_kernel_time, double multi_kernel_time, float low_bound,
                        float high_bound) {
  const double shortest_allowed = low_bound * single_kernel_time;
  const double longest_allowed = high_bound * single_kernel_time;

  REQUIRE(multi_kernel_time >= shortest_allowed);
  REQUIRE(multi_kernel_time <= longest_allowed);
}
|
||||
|
||||
void test_multigrid_streams(int device_num) {
|
||||
uint32_t loops = 2000;
|
||||
int32_t fast_gpu = -1;
|
||||
|
||||
// We will launch enough waves to fill up all of the GPU
|
||||
int warp_sizes[2];
|
||||
int num_sms[2];
|
||||
hipDeviceProp_t device_properties[2];
|
||||
int warp_size = INT_MAX;
|
||||
int num_sm = INT_MAX;
|
||||
for (int dev = 0; dev < (device_num - 1); ++dev) {
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIP_CHECK(hipGetDeviceProperties(&device_properties[i], (dev + i)));
|
||||
warp_sizes[i] = device_properties[i].warpSize;
|
||||
if (warp_sizes[i] < warp_size) {
|
||||
warp_size = warp_sizes[i];
|
||||
}
|
||||
num_sms[i] = device_properties[i].multiProcessorCount;
|
||||
if (num_sms[i] < num_sm) {
|
||||
num_sm = num_sms[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate the device occupancy to know how many blocks can be run.
|
||||
int max_blocks_per_sm_arr[2];
|
||||
int max_blocks_per_sm = INT_MAX;
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIP_CHECK(hipSetDevice(dev + i));
|
||||
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm_arr[i],
|
||||
test_kernel_used, warp_size, 0));
|
||||
if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
|
||||
max_blocks_per_sm = max_blocks_per_sm_arr[i];
|
||||
}
|
||||
}
|
||||
int desired_blocks = 1;
|
||||
|
||||
if (desired_blocks > max_blocks_per_sm * num_sm) {
|
||||
INFO("The requested number of blocks will not fit on the GPU");
|
||||
REQUIRE(desired_blocks < max_blocks_per_sm * num_sm);
|
||||
return;
|
||||
}
|
||||
|
||||
// Create the streams we will use in this test
|
||||
hipStream_t streams[2];
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIP_CHECK(hipSetDevice(dev + i));
|
||||
HIP_CHECK(hipStreamCreate(&streams[i]));
|
||||
}
|
||||
|
||||
// Set up data to pass into the kernel
|
||||
// Alocate the host input buffer, and two device-focused buffers that we
|
||||
// will use for our test.
|
||||
unsigned long long* dev_array[2];
|
||||
for (int i = 0; i < 2; i++) {
|
||||
int good_size = desired_blocks * warp_size * sizeof(long long);
|
||||
HIP_CHECK(hipSetDevice(dev + i));
|
||||
HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dev_array[i]), good_size));
|
||||
HIP_CHECK(hipMemsetAsync(dev_array[i], 0, good_size, streams[i]));
|
||||
}
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIP_CHECK(hipSetDevice(dev + i));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
}
|
||||
|
||||
/* Launch the kernels ****************************************************/
|
||||
void* dev_params[2][3];
|
||||
hipLaunchParams md_params[2];
|
||||
std::chrono::time_point<std::chrono::system_clock> start_time[2];
|
||||
std::chrono::time_point<std::chrono::system_clock> end_time[2];
|
||||
|
||||
// Test 0: Launching a multi-GPU cooperative kernel
|
||||
// Both GPUs launch a long cooperative kernel
|
||||
INFO("GPU " << dev << ": Long Coop Kernel");
|
||||
INFO("GPU " << (dev + 1) << ": Long Coop Kernel");
|
||||
|
||||
auto test_coop_kernel_used = IsGfx11() ? test_coop_kernel_gfx11 : test_coop_kernel;
|
||||
for (int i = 0; i < 2; i++) {
|
||||
dev_params[i][0] = reinterpret_cast<void*>(&loops);
|
||||
dev_params[i][1] = reinterpret_cast<void*>(&dev_array[i]);
|
||||
dev_params[i][2] = reinterpret_cast<void*>(&fast_gpu);
|
||||
md_params[i].func = reinterpret_cast<void*>(test_coop_kernel_used);
|
||||
md_params[i].gridDim = desired_blocks;
|
||||
md_params[i].blockDim = warp_size;
|
||||
md_params[i].sharedMem = 0;
|
||||
md_params[i].stream = streams[i];
|
||||
md_params[i].args = dev_params[i];
|
||||
}
|
||||
|
||||
start_time[0] = std::chrono::system_clock::now();
|
||||
HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIP_CHECK(hipSetDevice(dev + i));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
}
|
||||
end_time[0] = std::chrono::system_clock::now();
|
||||
|
||||
std::chrono::duration<double> single_kernel_time = (end_time[0] - start_time[0]);
|
||||
INFO("A single kernel on both GPUs took: " << single_kernel_time.count() << " seconds");
|
||||
|
||||
SECTION("GPU1 - Standard/ Long Coop, GPU2 - Coop/Standard") {
|
||||
INFO("GPU " << dev << ": Standard/Long Coop");
|
||||
INFO("GPU " << (dev + 1) << ": Coop/Standard");
|
||||
fast_gpu = 1;
|
||||
start_time[1] = std::chrono::system_clock::now();
|
||||
HIP_CHECK(hipSetDevice(dev));
|
||||
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0],
|
||||
loops, dev_array[0]);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
|
||||
HIP_CHECK(hipSetDevice(dev + 1));
|
||||
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1],
|
||||
loops, dev_array[1]);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIP_CHECK(hipSetDevice(dev + i));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
}
|
||||
end_time[1] = std::chrono::system_clock::now();
|
||||
std::chrono::duration<double> serialized_gpu0_time = (end_time[1] - start_time[1]);
|
||||
INFO("Serialized set of three kernels with GPU0 being long took: "
|
||||
<< serialized_gpu0_time.count() << " seconds");
|
||||
|
||||
verify_time(single_kernel_time.count(), serialized_gpu0_time.count(), 2.7f, 3.3f);
|
||||
}
|
||||
|
||||
SECTION("GPU1 - Standard/Coop, GPU2 - Long Coop/Standard") {
|
||||
INFO("GPU " << dev << ": Standard/Coop");
|
||||
INFO("GPU " << (dev + 1) << ": Long Coop/Standard");
|
||||
fast_gpu = 0;
|
||||
start_time[1] = std::chrono::system_clock::now();
|
||||
HIP_CHECK(hipSetDevice(dev));
|
||||
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0],
|
||||
loops, dev_array[0]);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
|
||||
HIP_CHECK(hipSetDevice(dev + 1));
|
||||
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1],
|
||||
loops, dev_array[1]);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIP_CHECK(hipSetDevice(dev + i));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
}
|
||||
end_time[1] = std::chrono::system_clock::now();
|
||||
std::chrono::duration<double> serialized_gpu1_time = (end_time[1] - start_time[1]);
|
||||
INFO("Serialized set of three kernels with GPU1 being long took: "
|
||||
<< serialized_gpu1_time.count() << " seconds");
|
||||
|
||||
verify_time(single_kernel_time.count(), serialized_gpu1_time.count(), 2.7f, 3.3f);
|
||||
}
|
||||
|
||||
SECTION(
|
||||
"GPU1 - Standard/Coop, GPU2 - Long Coop/Standard - regular and coop kernel overlap at "
|
||||
"beginning") {
|
||||
INFO("GPU " << dev << ": Standard/Coop with multi device no pre sync");
|
||||
INFO("GPU " << (dev + 1) << ": Long Coop/Standard with multi device no pre sync");
|
||||
fast_gpu = 0;
|
||||
start_time[1] = std::chrono::system_clock::now();
|
||||
HIP_CHECK(hipSetDevice(dev));
|
||||
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0],
|
||||
loops, dev_array[0]);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2,
|
||||
hipCooperativeLaunchMultiDeviceNoPreSync));
|
||||
HIP_CHECK(hipSetDevice(dev + 1));
|
||||
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1],
|
||||
loops, dev_array[1]);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIP_CHECK(hipSetDevice(dev + i));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
}
|
||||
end_time[1] = std::chrono::system_clock::now();
|
||||
std::chrono::duration<double> pre_overlapped_time = (end_time[1] - start_time[1]);
|
||||
INFO("Multiple kernels with pre-overlap allowed took: " << pre_overlapped_time.count()
|
||||
<< " seconds");
|
||||
|
||||
verify_time(single_kernel_time.count(), pre_overlapped_time.count(), 1.7f, 2.3f);
|
||||
}
|
||||
|
||||
SECTION(
|
||||
"GPU1 - Standard/Long Coop, GPU2 - Coop/Standard - regular and coop kernel overlap at "
|
||||
"end") {
|
||||
INFO("GPU " << dev << ": Standard/Long Coop with multi device no post sync");
|
||||
INFO("GPU " << (dev + 1) << ": Coop/Standard with multi device no post sync");
|
||||
fast_gpu = 1;
|
||||
start_time[1] = std::chrono::system_clock::now();
|
||||
HIP_CHECK(hipSetDevice(dev));
|
||||
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0],
|
||||
loops, dev_array[0]);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2,
|
||||
hipCooperativeLaunchMultiDeviceNoPostSync));
|
||||
HIP_CHECK(hipSetDevice(dev + 1));
|
||||
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1],
|
||||
loops, dev_array[1]);
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIP_CHECK(hipSetDevice(dev + i));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
}
|
||||
end_time[1] = std::chrono::system_clock::now();
|
||||
std::chrono::duration<double> post_overlapped_time = (end_time[1] - start_time[1]);
|
||||
INFO("Multiple kernels with post-overlap allowed took: " << post_overlapped_time.count()
|
||||
<< " seconds");
|
||||
|
||||
verify_time(single_kernel_time.count(), post_overlapped_time.count(), 1.7f, 2.3f);
|
||||
}
|
||||
|
||||
SECTION(
|
||||
"GPU1 - Standard/Long Coop, GPU2 - Long Coop/Standard - regular and coop kernel overlap") {
|
||||
INFO("GPU " << dev << ": Standard/Long Coop with multi device no pre or post sync");
|
||||
INFO("GPU " << (dev + 1) << ": Long Coop/Standard with multi device no pre or post sync");
|
||||
start_time[1] = std::chrono::system_clock::now();
|
||||
HIP_CHECK(hipSetDevice(dev));
|
||||
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0],
|
||||
loops, dev_array[0]);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(
|
||||
md_params, 2,
|
||||
hipCooperativeLaunchMultiDeviceNoPreSync | hipCooperativeLaunchMultiDeviceNoPostSync));
|
||||
HIP_CHECK(hipSetDevice(dev + 1));
|
||||
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1],
|
||||
loops, dev_array[1]);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
for (int i = 0; i < 2; i++) {
|
||||
HIP_CHECK(hipSetDevice(dev + i));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
}
|
||||
end_time[1] = std::chrono::system_clock::now();
|
||||
std::chrono::duration<double> overlapped_time = (end_time[1] - start_time[1]);
|
||||
INFO("Multiple kernels with overlap allowed took: " << overlapped_time.count() << " seconds");
|
||||
|
||||
verify_time(single_kernel_time.count(), overlapped_time.count(), 1.8f, 2.2f);
|
||||
}
|
||||
|
||||
for (int k = 0; k < 2; ++k) {
|
||||
HIP_CHECK(hipFree(dev_array[k]));
|
||||
HIP_CHECK(hipStreamDestroy(streams[k]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Launches the test_gws reduction kernel cooperatively on every device and
 * checks that the combined result equals the sum 0 + 1 + ... + (N - 1), where
 * N = kBufferLen * device count. The test is repeated for block sizes 64, 128
 * and 256 and is skipped when any device reports no support for cooperative
 * multi-device launches.
 */
TEST_CASE("Unit_hipLaunchCooperativeKernelMultiDevice_Basic") {
  constexpr uint num_kernel_args = 4;

  int device_num = 0;
  HIP_CHECK(hipGetDeviceCount(&device_num));
  REQUIRE(device_num > 0);

  // Query each device's own properties up front, so that the test can be
  // skipped before anything has been allocated.
  std::vector<hipDeviceProp_t> device_properties(device_num);
  for (int i = 0; i < device_num; i++) {
    HIP_CHECK(hipGetDeviceProperties(&device_properties[i], i));
    if (!device_properties[i].cooperativeMultiDeviceLaunch) {
      HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
      return;
    }
  }

  const size_t buffer_size = kBufferLen * sizeof(int);

  // Host input: consecutive integers, one kBufferLen-sized slice per device.
  std::vector<int> A_h(kBufferLen * device_num);
  for (size_t i = 0; i < A_h.size(); ++i) {
    A_h[i] = static_cast<int>(i);
  }

  std::vector<int*> A_d(device_num);
  std::vector<long*> B_d(device_num);
  long* C_d = nullptr;
  std::vector<hipStream_t> stream(device_num);

  for (int i = 0; i < device_num; i++) {
    HIP_CHECK(hipSetDevice(i));

    HIP_CHECK(hipMalloc(&A_d[i], buffer_size));
    HIP_CHECK(hipMemcpy(A_d[i], &A_h[i * kBufferLen], buffer_size, hipMemcpyHostToDevice));
    if (i == 0) {
      // Host-allocated result array shared by all grids: slot 0 for the
      // total, slots 1..device_num for the per-grid totals.
      HIP_CHECK(hipHostMalloc(&C_d, (device_num + 1) * sizeof(long)));
    }

    HIP_CHECK(hipStreamCreate(&stream[i]));
    HIP_CHECK(hipDeviceSynchronize());
  }

  uint workgroup = GENERATE(64, 128, 256);

  dim3 dimBlock(workgroup, 1, 1);
  dim3 dimGrid(1, 1, 1);

  // The kernel's second parameter is a `uint`; pass a variable of exactly
  // that type instead of pointing at the size_t constant.
  uint buffer_len = static_cast<uint>(kBufferLen);

  std::vector<hipLaunchParams> launch_params_list(device_num);
  std::vector<void*> args(device_num * num_kernel_args);

  for (int i = 0; i < device_num; i++) {
    HIP_CHECK(hipSetDevice(i));

    // Calculate the device occupancy to know how many blocks can be run concurrently
    int num_blocks = 0;
    HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
        &num_blocks, test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long)));

    INFO("GPU" << i << " has block size = " << dimBlock.x << " and num blocks per CU " << num_blocks
               << "\n");

    dimGrid.x = device_properties[i].multiProcessorCount * std::min(num_blocks, 32);

    // One scratch slot per block of this device's grid.
    HIP_CHECK(hipMalloc(&B_d[i], dimGrid.x * sizeof(long)));

    args[i * num_kernel_args] = static_cast<void*>(&A_d[i]);
    args[i * num_kernel_args + 1] = static_cast<void*>(&buffer_len);
    args[i * num_kernel_args + 2] = static_cast<void*>(&B_d[i]);
    args[i * num_kernel_args + 3] = static_cast<void*>(&C_d);

    launch_params_list[i].func = reinterpret_cast<void*>(test_gws);
    launch_params_list[i].gridDim = dimGrid;
    launch_params_list[i].blockDim = dimBlock;
    launch_params_list[i].sharedMem = dimBlock.x * sizeof(long);
    launch_params_list[i].stream = stream[i];
    launch_params_list[i].args = &args[i * num_kernel_args];
  }

  HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(launch_params_list.data(), device_num, 0));
  for (int i = 0; i < device_num; i++) {
    HIP_CHECK(hipStreamSynchronize(stream[i]));
  }

  // Sum of 0..N-1 is N * (N - 1) / 2.
  size_t processed_Dwords = kBufferLen * device_num;
  REQUIRE(*C_d == ((static_cast<long>(processed_Dwords) * (processed_Dwords - 1)) / 2));

  HIP_CHECK(hipSetDevice(0));
  HIP_CHECK(hipHostFree(C_d));
  for (int i = 0; i < device_num; i++) {
    HIP_CHECK(hipSetDevice(i));
    HIP_CHECK(hipFree(A_d[i]));
    HIP_CHECK(hipFree(B_d[i]));
    HIP_CHECK(hipStreamDestroy(stream[i]));
  }
}
|
||||
|
||||
/**
 * Runs the multi-device stream timing scenarios. The test is skipped when
 * fewer than two devices are present or when any device reports no support
 * for cooperative multi-device launches.
 */
TEST_CASE("Unit_hipLaunchCooperativeKernelMultiDevice_Streams") {
  int device_num = 0;
  HIP_CHECK(hipGetDeviceCount(&device_num));

  if (device_num < 2) {
    HipTest::HIP_SKIP_TEST("Skipping because devices < 2");
    return;
  }

  // Stop at the first device that cannot take part in a multi-device launch.
  for (int dev = 0; dev < device_num; ++dev) {
    hipDeviceProp_t props;
    HIP_CHECK(hipGetDeviceProperties(&props, dev));
    if (props.cooperativeMultiDeviceLaunch) {
      continue;
    }
    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
    return;
  }

  test_multigrid_streams(device_num);
}
|
||||
@@ -0,0 +1,364 @@
|
||||
/*
|
||||
Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
// Number of int elements in the input buffer of the reduction test.
static constexpr size_t kBufferLen = 1024 * 1024;
|
||||
|
||||
/**
 * Single-GPU reduction kernel for the cooperative launch test.
 *
 * Each thread accumulates a strided slice of `buf`, the block combines those
 * partial sums through dynamic shared memory, and after a grid-wide
 * synchronisation the first thread of the grid adds up the per-block totals.
 *
 * @param buf       input values to sum
 * @param buf_size  number of elements of `buf` to sum
 * @param tmp_buf   scratch array indexed by block id (one slot per block)
 * @param result    receives the total over the whole grid
 *
 * The kernel indexes the dynamic shared array by threadIdx.x, so the launch
 * has to supply at least blockDim.x * sizeof(long) bytes of shared memory.
 */
__global__ void test_gws(int* buf, size_t buf_size, long* tmp_buf, long* result) {
  extern __shared__ long tmp[];
  const uint global_id = blockIdx.x * blockDim.x + threadIdx.x;
  const uint total_threads = gridDim.x * blockDim.x;
  cg::grid_group grid = cg::this_grid();

  // Per-thread partial sum over a strided slice of the input.
  long sum = 0;
  uint idx = global_id;
  while (idx < buf_size) {
    sum += buf[idx];
    idx += total_threads;
  }
  tmp[threadIdx.x] = sum;

  __syncthreads();

  // The first thread of each block reduces the block's shared-memory slots.
  if (threadIdx.x == 0) {
    sum = 0;
    uint t = 0;
    while (t < blockDim.x) {
      sum += tmp[t];
      t++;
    }
    tmp_buf[blockIdx.x] = sum;
  }

  // Every block must have published its total before the final reduction.
  grid.sync();

  if (global_id == 0) {
    // `sum` already holds block 0's total here, so continue from block 1.
    uint b = 1;
    while (b < gridDim.x) {
      sum += tmp_buf[b];
      ++b;
    }
    *result = sum;
  }
}
|
||||
|
||||
/**
 * Busy-wait kernel for the cooperative stream timing tests.
 *
 * Every thread runs `loops` iterations; each iteration spins until at least
 * `totalTicks` clock64() ticks have been accumulated, synchronises the thread
 * block and then adds the current clock64() value to the thread's own slot in
 * `array`.
 *
 * @param loops       number of spin iterations per thread
 * @param array       per-thread output slots, indexed by global thread id
 * @param totalTicks  tick count each iteration has to wait for
 */
__global__ void test_kernel(uint32_t loops, unsigned long long* array, long long totalTicks) {
  cg::thread_block block = cg::this_thread_block();
  const unsigned int slot = blockIdx.x * blockDim.x + threadIdx.x;

  for (int iter = 0; iter < loops; iter++) {
    long long elapsed = 0;
    long long previous = clock64();
    while (true) {
      const long long now = clock64();
      // A counter that went backwards has rolled over; the amount that was
      // skipped is unknown, so that interval is simply not counted.
      if (now > previous) {
        elapsed += (now - previous);
      }
      previous = now;
      if (!(elapsed < totalTicks)) {
        break;
      }
    }
    block.sync();
    array[slot] += clock64();
  }
}
|
||||
|
||||
/**
 * Variant of the busy-wait kernel that reads wall_clock64() instead of
 * clock64(). The body is only compiled when HT_AMD is set; otherwise the
 * kernel is empty.
 *
 * Every thread runs `loops` iterations; each iteration spins until at least
 * `totalTicks` ticks have been accumulated, synchronises the thread block and
 * then adds the current wall_clock64() value to the thread's own slot in
 * `array`.
 */
__global__ void test_kernel_gfx11(uint32_t loops, unsigned long long* array, long long totalTicks) {
#if HT_AMD
  cg::thread_block block = cg::this_thread_block();
  const unsigned int slot = blockIdx.x * blockDim.x + threadIdx.x;

  for (int iter = 0; iter < loops; iter++) {
    long long elapsed = 0;
    long long previous = wall_clock64();
    while (true) {
      const long long now = wall_clock64();
      // A counter that went backwards has rolled over; the amount that was
      // skipped is unknown, so that interval is simply not counted.
      if (now > previous) {
        elapsed += (now - previous);
      }
      previous = now;
      if (!(elapsed < totalTicks)) {
        break;
      }
    }
    block.sync();
    array[slot] += wall_clock64();
  }
#endif
}
|
||||
|
||||
/**
 * Timing checks for the single-block ("least capacity") scenario.
 *
 * When HT_AMD is set, the two cooperative kernels must take between 1.8x and
 * 2.2x the single-kernel time (serialised execution); otherwise between 0.8x
 * and 1.2x (no serialisation). In both cases the three-kernel run may take at
 * most 1.1x the two-kernel time.
 *
 * @tparam T  duration-like type providing count()
 */
template <typename T>
static void verifyLeastCapacity(T& single_kernel_time, T& double_kernel_time,
                                T& triple_kernel_time) {
#if HT_AMD
  // hipLaunchCooperativeKernel() follows serialization policy on AMD devices,
  // so two cooperative kernels take roughly twice as long as one.
  constexpr double lower_factor = 1.8;
  constexpr double upper_factor = 2.2;
#else
  // hipLaunchCooperativeKernel() doesn't follow serialization policy on NV
  // devices, so two cooperative kernels take roughly as long as one.
  constexpr double lower_factor = 0.8;
  constexpr double upper_factor = 1.2;
#endif
  const auto single = single_kernel_time.count();
  const auto pair = double_kernel_time.count();
  const auto triple = triple_kernel_time.count();

  REQUIRE(pair >= lower_factor * single);
  REQUIRE(pair <= upper_factor * single);

  // The third (regular) kernel should add almost nothing on top of the two
  // cooperative kernels.
  REQUIRE(triple <= 1.1 * pair);
}
|
||||
|
||||
/**
 * Timing checks for the half-capacity scenario: the two cooperative kernels
 * must take between 1.8x and 2.2x the single-kernel time, and the
 * three-kernel run at most 1.1x the two-kernel time.
 *
 * @tparam T  duration-like type providing count()
 */
template <typename T>
static void verifyHalfCapacity(T& single_kernel_time, T& double_kernel_time,
                               T& triple_kernel_time) {
  const auto single = single_kernel_time.count();
  const auto pair = double_kernel_time.count();
  const auto triple = triple_kernel_time.count();

  // Two cooperative kernels: roughly twice the time of one.
  REQUIRE(pair >= 1.8 * single);
  REQUIRE(pair <= 2.2 * single);

  // Adding the regular kernel: roughly the same time as the two cooperative
  // kernels alone.
  REQUIRE(triple <= 1.1 * pair);
}
|
||||
|
||||
/**
 * Timing checks for the full-capacity scenario: the two cooperative kernels
 * must take between 1.8x and 2.2x the single-kernel time, and the
 * three-kernel run at most 1.7x the two-kernel time.
 *
 * @tparam T  duration-like type providing count()
 */
template <typename T>
static void verifyFullCapacity(T& single_kernel_time, T& double_kernel_time,
                               T& triple_kernel_time) {
  const auto single = single_kernel_time.count();
  const auto pair = double_kernel_time.count();
  const auto triple = triple_kernel_time.count();

  // Two cooperative kernels: roughly twice the time of one.
  REQUIRE(pair >= 1.8 * single);
  REQUIRE(pair <= 2.2 * single);

  // The bound for the third kernel is looser here: if the first two kernels
  // run very fast, the third won't share much time with the second kernel.
  REQUIRE(triple <= 1.7 * pair);
}
|
||||
|
||||
/**
 * Dispatches to the timing check that belongs to a scenario index:
 * 0 = least capacity, 1 = half capacity, 2 = full capacity. Any other index
 * performs no check.
 *
 * @tparam T  duration-like type providing count()
 */
template <typename T>
static void verify(int tests, T& single_kernel_time, T& double_kernel_time, T& triple_kernel_time) {
  if (tests == 0) {
    verifyLeastCapacity(single_kernel_time, double_kernel_time, triple_kernel_time);
  } else if (tests == 1) {
    verifyHalfCapacity(single_kernel_time, double_kernel_time, triple_kernel_time);
  } else if (tests == 2) {
    verifyFullCapacity(single_kernel_time, double_kernel_time, triple_kernel_time);
  }
}
|
||||
|
||||
static void test_cooperative_streams(int dev, int p_tests) {
|
||||
hipStream_t streams[3];
|
||||
unsigned long long* dev_array[3];
|
||||
int loops = 1000;
|
||||
|
||||
HIP_CHECK(hipSetDevice(dev));
|
||||
hipDeviceProp_t device_properties;
|
||||
HIP_CHECK(hipGetDeviceProperties(&device_properties, dev));
|
||||
|
||||
// Test whether target device supports cooperative groups
|
||||
if (device_properties.cooperativeLaunch == 0) {
|
||||
std::cout << "Cooperative group support not available in device " << dev << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
// We will launch enough waves to fill up all of the GPU
|
||||
int warp_size = device_properties.warpSize;
|
||||
int num_sms = device_properties.multiProcessorCount;
|
||||
long long totalTicks = device_properties.clockRate;
|
||||
int max_blocks_per_sm = 0;
|
||||
// Calculate the device occupancy to know how many blocks can be run.
|
||||
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
|
||||
HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, test_kernel_used,
|
||||
warp_size, 0));
|
||||
int max_active_blocks = max_blocks_per_sm * num_sms;
|
||||
int coop_blocks = 0;
|
||||
int reg_blocks = 0;
|
||||
|
||||
switch (p_tests) {
|
||||
case 0:
|
||||
// 1 block
|
||||
coop_blocks = 1;
|
||||
reg_blocks = 1;
|
||||
break;
|
||||
case 1:
|
||||
// Half capacity
|
||||
// To make sure the second kernel launched by hipLaunchCooperativeKernel
|
||||
// is invoked after the first kernel finished
|
||||
coop_blocks = max_active_blocks / 2 + 1;
|
||||
// To make sure the third kernel launched by hipLaunchKernelGGL is invoked
|
||||
// concurrently with the second kernel
|
||||
reg_blocks = max_active_blocks - coop_blocks;
|
||||
break;
|
||||
case 2:
|
||||
// Full capacity
|
||||
coop_blocks = max_active_blocks;
|
||||
reg_blocks = max_active_blocks;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
for (int i = 0; i < 3; i++) {
|
||||
HIP_CHECK(hipStreamCreate(&streams[i]));
|
||||
}
|
||||
|
||||
// Set up data to pass into the kernel
|
||||
|
||||
for (int i = 0; i < 3; i++) {
|
||||
HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dev_array[i]), warp_size * sizeof(long long)));
|
||||
HIP_CHECK(hipMemsetAsync(dev_array[i], 0, warp_size * sizeof(long long), streams[i]));
|
||||
}
|
||||
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
|
||||
// Launch the kernels
|
||||
void* coop_params[3][3];
|
||||
for (int i = 0; i < 3; i++) {
|
||||
coop_params[i][0] = reinterpret_cast<void*>(&loops);
|
||||
coop_params[i][1] = reinterpret_cast<void*>(&dev_array[i]);
|
||||
coop_params[i][2] = reinterpret_cast<void*>(&totalTicks);
|
||||
}
|
||||
|
||||
// We need exclude the the initial launching as it will need time to load code obj.
|
||||
HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used), max_active_blocks,
|
||||
warp_size, coop_params[0], 0, streams[0]));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
|
||||
// Launching a single cooperative kernel
|
||||
auto single_start = std::chrono::system_clock::now();
|
||||
HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used), max_active_blocks,
|
||||
warp_size, coop_params[0], 0, streams[0]));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
auto single_end = std::chrono::system_clock::now();
|
||||
|
||||
std::chrono::duration<double> single_kernel_time = (single_end - single_start);
|
||||
|
||||
// Launching 2 cooperative kernels to different streams
|
||||
auto double_start = std::chrono::system_clock::now();
|
||||
HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used), coop_blocks,
|
||||
warp_size, coop_params[0], 0, streams[0]));
|
||||
HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used), coop_blocks,
|
||||
warp_size, coop_params[1], 0, streams[1]));
|
||||
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
auto double_end = std::chrono::system_clock::now();
|
||||
|
||||
// Launching 2 cooperative kernels and 1 normal kernel
|
||||
std::chrono::duration<double> double_kernel_time = (double_end - double_start);
|
||||
|
||||
auto triple_start = std::chrono::system_clock::now();
|
||||
HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used), coop_blocks,
|
||||
warp_size, coop_params[0], 0, streams[0]));
|
||||
HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used), coop_blocks,
|
||||
warp_size, coop_params[1], 0, streams[1]));
|
||||
hipLaunchKernelGGL(test_kernel_used, dim3(reg_blocks), dim3(warp_size), 0, streams[2], loops,
|
||||
dev_array[2], totalTicks);
|
||||
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
auto triple_end = std::chrono::system_clock::now();
|
||||
std::chrono::duration<double> triple_kernel_time = (triple_end - triple_start);
|
||||
|
||||
for (int k = 0; k < 3; ++k) {
|
||||
HIP_CHECK(hipFree(dev_array[k]));
|
||||
HIP_CHECK(hipStreamDestroy(streams[k]));
|
||||
}
|
||||
|
||||
|
||||
INFO("A single kernel took : " << single_kernel_time.count() << " seconds");
|
||||
INFO("Two cooperative kernels took: " << double_kernel_time.count() << " seconds");
|
||||
INFO("Two coop kernels and a third regular kernel took: " << triple_kernel_time.count()
|
||||
<< " seconds");
|
||||
|
||||
verify(p_tests, single_kernel_time, double_kernel_time, triple_kernel_time);
|
||||
}
|
||||
|
||||
/**
 * Launches test_gws as a cooperative kernel on the current device and verifies the value left in
 * C_d against the closed-form sum 0 + 1 + ... + (kBufferLen - 1) of the input buffer.
 *
 * The case is repeated for workgroup sizes 32, 64, 128 and 256, and is skipped when the device
 * does not report cooperative launch support.
 */
TEST_CASE("Unit_hipLaunchCooperativeKernel_Basic") {
  // Use default device for validating the test
  int device;
  int *A_h, *A_d;
  long* B_d;
  long* C_d;
  hipDeviceProp_t device_properties;
  HIP_CHECK(hipGetDevice(&device));
  HIP_CHECK(hipGetDeviceProperties(&device_properties, device));

  if (!device_properties.cooperativeLaunch) {
    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
    return;
  }

  size_t buffer_size = kBufferLen * sizeof(int);

  // Host input: A_h[i] == i, so the expected sum has a closed form.
  A_h = static_cast<int*>(malloc(buffer_size));
  // Fail the test cleanly instead of dereferencing a null pointer if the allocation failed.
  REQUIRE(A_h != nullptr);
  for (uint32_t i = 0; i < kBufferLen; ++i) {
    A_h[i] = static_cast<int>(i);
  }

  HIP_CHECK(hipMalloc(&A_d, buffer_size));
  HIP_CHECK(hipMemcpy(A_d, A_h, buffer_size, hipMemcpyHostToDevice));
  // Result slot lives in host memory so it can be read directly after synchronization.
  HIP_CHECK(hipHostMalloc(&C_d, sizeof(long)));

  hipStream_t stream;
  // Same checking macro as every other HIP call in this test.
  HIP_CHECK(hipStreamCreate(&stream));

  dim3 dimBlock = dim3(1);
  dim3 dimGrid = dim3(1);
  int numBlocks = 0;

  uint32_t workgroup = GENERATE(32, 64, 128, 256);

  dimBlock.x = workgroup;

  // Calculate the device occupancy to know how many blocks can be run concurrently
  HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
      &numBlocks, test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long)));

  // At most 32 blocks per multiprocessor are requested, whatever the occupancy query reports.
  dimGrid.x = device_properties.multiProcessorCount * std::min(numBlocks, 32);
  // One long per launched block.
  HIP_CHECK(hipMalloc(&B_d, dimGrid.x * sizeof(long)));

  // Kernel arguments are passed as an array of pointers to each argument value.
  void* params[4];
  params[0] = static_cast<void*>(&A_d);
  // The launch API takes void*; going through const void* works whether or not kBufferLen is const.
  params[1] = const_cast<void*>(static_cast<const void*>(&kBufferLen));
  params[2] = static_cast<void*>(&B_d);
  params[3] = static_cast<void*>(&C_d);

  INFO("Testing with grid size = " << dimGrid.x << " and block size = " << dimBlock.x << "\n");
  HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_gws), dimGrid, dimBlock, params,
                                       dimBlock.x * sizeof(long), stream));

  HIP_CHECK(hipStreamSynchronize(stream));

  // Sum of 0..kBufferLen-1, computed in 64 bits before the division.
  const auto expected_sum = static_cast<unsigned long long>(kBufferLen) * (kBufferLen - 1) / 2;
  REQUIRE(static_cast<unsigned long long>(*C_d) == expected_sum);

  HIP_CHECK(hipStreamDestroy(stream));
  HIP_CHECK(hipHostFree(C_d));
  HIP_CHECK(hipFree(B_d));
  HIP_CHECK(hipFree(A_d));
  free(A_h);
}
|
||||
|
||||
// Runs test_cooperative_streams for every device index in [0, device count) combined with each
// p_tests value 0, 1 and 2.
TEST_CASE("Unit_hipLaunchCooperativeKernel_Streams") {
  const auto device_index = GENERATE(range(0, HipTest::getDeviceCount()));
  int test_selector = GENERATE(0, 1, 2);

  test_cooperative_streams(device_index, test_selector);
}
|
||||
@@ -0,0 +1,68 @@
|
||||
/*
|
||||
Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
|
||||
// Comparison assertions forwarding to the HIP test assert macros.
// Both arguments are parenthesized so that an argument containing an operator with lower
// precedence than the comparison (e.g. `a & b`) is compared as written.
#define ASSERT_EQUAL(lhs, rhs) HIP_ASSERT((lhs) == (rhs))
#define ASSERT_LE(lhs, rhs) HIPASSERT((lhs) <= (rhs))
#define ASSERT_GE(lhs, rhs) HIPASSERT((lhs) >= (rhs))

// NOTE(review): presumably the upper bound on GPUs used by the multi-device tests - confirm.
constexpr int MaxGPUs = 8;
|
||||
|
||||
/**
 * Writes the first `size` elements of `ptr` to std::cout, each followed by a single space, and
 * finishes the line with a newline. Nothing but the newline is written when `size` <= 0.
 */
template <typename T>
void printResults(T* ptr, int size) {
  int idx = 0;
  while (idx < size) {
    std::cout << ptr[idx] << " ";
    ++idx;
  }
  std::cout << '\n';
}
|
||||
|
||||
template <typename T>
|
||||
void compareResults(T* cpu, T* gpu, int size) {
|
||||
for (unsigned int i = 0; i < size / sizeof(T); i++) {
|
||||
if (cpu[i] != gpu[i]) {
|
||||
INFO("Results do not match at index " << i);
|
||||
REQUIRE(cpu[i] == gpu[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Search if the sum exists in the expected results array
|
||||
template <typename T>
|
||||
void verifyResults(T* hPtr, T* dPtr, int size) {
|
||||
int i = 0, j = 0;
|
||||
for (i = 0; i < size; i++) {
|
||||
for (j = 0; j < size; j++) {
|
||||
if (hPtr[i] == dPtr[j]) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (j == size) {
|
||||
INFO("Result verification failed!");
|
||||
REQUIRE(j != size);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,30 @@
|
||||
# Sources of the OpenGL interop test executable.
set(TEST_SRC
    hipGLGetDevices.cc
    hipGraphicsGLRegisterBuffer.cc
    hipGraphicsGLRegisterImage.cc
    hipGraphicsMapResources.cc
    hipGraphicsSubResourceGetMappedArray.cc
    hipGraphicsResourceGetMappedPointer.cc
    hipGraphicsUnmapResources.cc
    hipGraphicsUnregisterResource.cc
)

# OpenGL (with its OpenGL and EGL components) is optional: without it this directory is skipped.
find_package(OpenGL COMPONENTS OpenGL EGL)
message(STATUS "OpenGL_FOUND: ${OpenGL_FOUND}")
if(NOT OpenGL_FOUND)
  message(STATUS "OpenGL not found, OpenGL interop tests not enabled.")
  return()
endif()

# GLUT is optional as well and is only looked up once OpenGL has been found.
find_package(GLUT)
message(STATUS "GLUT_FOUND: ${GLUT_FOUND}")
if(NOT GLUT_FOUND)
  message(STATUS "GLUT not found, OpenGL interop tests not enabled.")
  return()
endif()

# Build the test executable as part of build_tests and link it against the GL, EGL and GLUT targets.
hip_add_exe_to_target(
  NAME GLInteropTest
  TEST_SRC ${TEST_SRC}
  TEST_TARGET_NAME build_tests
  COMPILE_OPTIONS -std=c++17
)
target_link_libraries(GLInteropTest
                      OpenGL::GL
                      OpenGL::EGL
                      GLUT::GLUT)
|
||||
@@ -0,0 +1,219 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <variant>
|
||||
|
||||
#define GL_GLEXT_PROTOTYPES
|
||||
#include <GL/freeglut.h>
|
||||
#include <GL/freeglut_ext.h>
|
||||
|
||||
#include <EGL/egl.h>
|
||||
#include <EGL/eglext.h>
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
|
||||
class GLBufferObject {
|
||||
public:
|
||||
static constexpr size_t kSize = 512 * 512 * 4 * sizeof(float);
|
||||
|
||||
GLBufferObject() {
|
||||
glGenBuffers(1, &vbo_);
|
||||
glBindBuffer(GL_ARRAY_BUFFER, vbo_);
|
||||
glBufferData(GL_ARRAY_BUFFER, kSize, 0, GL_DYNAMIC_DRAW);
|
||||
glBindBuffer(GL_ARRAY_BUFFER, 0);
|
||||
REQUIRE(glGetError() == GL_NO_ERROR);
|
||||
}
|
||||
|
||||
~GLBufferObject() { glDeleteBuffers(1, &vbo_); }
|
||||
|
||||
operator GLuint() const { return vbo_; }
|
||||
|
||||
private:
|
||||
GLuint vbo_;
|
||||
};
|
||||
|
||||
class GLImageObject {
|
||||
public:
|
||||
static constexpr size_t kWidth = 512, kHeight = 512;
|
||||
|
||||
GLImageObject() {
|
||||
glGenTextures(1, &tex_);
|
||||
glBindTexture(GL_TEXTURE_2D, tex_);
|
||||
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
|
||||
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
|
||||
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
|
||||
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
|
||||
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8UI_EXT, kWidth, kHeight, 0, GL_RGBA_INTEGER_EXT,
|
||||
GL_UNSIGNED_BYTE, NULL);
|
||||
REQUIRE(glGetError() == GL_NO_ERROR);
|
||||
}
|
||||
|
||||
~GLImageObject() { glDeleteTextures(1, &tex_); }
|
||||
|
||||
operator GLuint() const { return tex_; }
|
||||
|
||||
private:
|
||||
GLuint tex_;
|
||||
};
|
||||
|
||||
static std::once_flag glut_init_flag;
|
||||
|
||||
class GLUTContextScopeGuard {
|
||||
public:
|
||||
GLUTContextScopeGuard() {
|
||||
std::call_once(glut_init_flag, &GLUTContextScopeGuard::init);
|
||||
glut_window_ = glutCreateWindow("");
|
||||
}
|
||||
|
||||
~GLUTContextScopeGuard() { glutDestroyWindow(glut_window_); }
|
||||
|
||||
GLUTContextScopeGuard(const GLUTContextScopeGuard&) = delete;
|
||||
GLUTContextScopeGuard& operator=(const GLUTContextScopeGuard&) = delete;
|
||||
|
||||
GLUTContextScopeGuard(GLUTContextScopeGuard&&) = delete;
|
||||
GLUTContextScopeGuard& operator=(GLUTContextScopeGuard&&) = delete;
|
||||
|
||||
private:
|
||||
int glut_window_;
|
||||
|
||||
static void init() {
|
||||
static char proc_name[] = "";
|
||||
static std::array<char*, 2> glut_argv = {proc_name, nullptr};
|
||||
static int glut_argc = 1;
|
||||
|
||||
glutInit(&glut_argc, glut_argv.data());
|
||||
glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE | GLUT_DEPTH);
|
||||
glutInitWindowSize(512, 512);
|
||||
}
|
||||
};
|
||||
|
||||
class EGLContextScopeGuard {
|
||||
public:
|
||||
EGLContextScopeGuard() {
|
||||
// 1. Initialize EGL
|
||||
PFNEGLQUERYDEVICESEXTPROC eglQueryDevicesEXT =
|
||||
(PFNEGLQUERYDEVICESEXTPROC)eglGetProcAddress("eglQueryDevicesEXT");
|
||||
|
||||
eglQueryDevicesEXT(egl_devices_.max_size(), egl_devices_.data(), &num_devices_);
|
||||
|
||||
INFO("Detected " << num_devices_ << " devices");
|
||||
|
||||
PFNEGLGETPLATFORMDISPLAYEXTPROC eglGetPlatformDisplayEXT =
|
||||
(PFNEGLGETPLATFORMDISPLAYEXTPROC)eglGetProcAddress("eglGetPlatformDisplayEXT");
|
||||
|
||||
egl_display_ = eglGetPlatformDisplayEXT(EGL_PLATFORM_DEVICE_EXT, egl_devices_.at(0), 0);
|
||||
|
||||
REQUIRE(eglInitialize(egl_display_, &major_, &minor_));
|
||||
|
||||
// 2. Select an appropriate configuration
|
||||
REQUIRE(eglChooseConfig(egl_display_, kConfigAttribs, &egl_config_, 1, &num_configs_));
|
||||
|
||||
// 3. Create a surface
|
||||
egl_surface_ = eglCreatePbufferSurface(egl_display_, egl_config_, kPbufferAttribs);
|
||||
|
||||
// 4. Bind the API
|
||||
REQUIRE(eglBindAPI(EGL_OPENGL_API));
|
||||
|
||||
// 5. Create a context and make it current
|
||||
egl_context_ = eglCreateContext(egl_display_, egl_config_, EGL_NO_CONTEXT, NULL);
|
||||
|
||||
REQUIRE(eglMakeCurrent(egl_display_, egl_surface_, egl_surface_, egl_context_));
|
||||
}
|
||||
|
||||
~EGLContextScopeGuard() {
|
||||
// 6. Terminate EGL when finished
|
||||
eglTerminate(egl_display_);
|
||||
}
|
||||
|
||||
EGLContextScopeGuard(const EGLContextScopeGuard&) = delete;
|
||||
EGLContextScopeGuard& operator=(const EGLContextScopeGuard&) = delete;
|
||||
|
||||
EGLContextScopeGuard(EGLContextScopeGuard&&) = delete;
|
||||
EGLContextScopeGuard& operator=(EGLContextScopeGuard&&) = delete;
|
||||
|
||||
private:
|
||||
// clang-format off
|
||||
static constexpr EGLint kConfigAttribs[] = {
|
||||
EGL_SURFACE_TYPE,
|
||||
EGL_PBUFFER_BIT,
|
||||
EGL_BLUE_SIZE, 8,
|
||||
EGL_GREEN_SIZE, 8,
|
||||
EGL_RED_SIZE, 8,
|
||||
EGL_DEPTH_SIZE, 8,
|
||||
EGL_RENDERABLE_TYPE,
|
||||
EGL_OPENGL_BIT,
|
||||
EGL_NONE
|
||||
};
|
||||
// clang-format on
|
||||
|
||||
static constexpr int kPbufferWidth = 9;
|
||||
static constexpr int kPbufferHeight = 9;
|
||||
|
||||
static constexpr EGLint kPbufferAttribs[] = {
|
||||
EGL_WIDTH, kPbufferWidth, EGL_HEIGHT, kPbufferHeight, EGL_NONE,
|
||||
};
|
||||
|
||||
std::array<EGLDeviceEXT, 8> egl_devices_;
|
||||
EGLint num_devices_;
|
||||
EGLDisplay egl_display_;
|
||||
EGLint major_, minor_;
|
||||
EGLint num_configs_;
|
||||
EGLConfig egl_config_;
|
||||
EGLSurface egl_surface_;
|
||||
EGLContext egl_context_;
|
||||
};
|
||||
|
||||
class GLContextScopeGuard {
|
||||
public:
|
||||
using GLUTContextScopeGuardPtr = std::unique_ptr<GLUTContextScopeGuard>;
|
||||
using EGLContextScopeGuardPtr = std::unique_ptr<EGLContextScopeGuard>;
|
||||
using GLContextScopeGuardVariant =
|
||||
std::variant<GLUTContextScopeGuardPtr, EGLContextScopeGuardPtr>;
|
||||
|
||||
static constexpr char kEnvarName[] = "GL_CONTEXT_TYPE";
|
||||
|
||||
GLContextScopeGuard() {
|
||||
char* val = std::getenv(kEnvarName);
|
||||
std::string val_str = val == NULL ? "" : val;
|
||||
|
||||
if (val_str.empty() || val_str == "GLUT") {
|
||||
gl_context_ = std::make_unique<GLUTContextScopeGuard>();
|
||||
} else if (val_str == "EGL") {
|
||||
gl_context_ = std::make_unique<EGLContextScopeGuard>();
|
||||
} else {
|
||||
INFO("Unsupported " << kEnvarName << " value '" << val_str << "'");
|
||||
INFO("Supported values are ['GLUT', 'EGL']");
|
||||
REQUIRE(false);
|
||||
}
|
||||
}
|
||||
|
||||
GLContextScopeGuard(const GLContextScopeGuard&) = delete;
|
||||
GLContextScopeGuard& operator=(const GLContextScopeGuard&) = delete;
|
||||
|
||||
GLContextScopeGuard(GLContextScopeGuard&&) = delete;
|
||||
GLContextScopeGuard& operator=(GLContextScopeGuard&&) = delete;
|
||||
|
||||
private:
|
||||
GLContextScopeGuardVariant gl_context_;
|
||||
};
|
||||
@@ -0,0 +1,90 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <hip/hip_gl_interop.h>
|
||||
|
||||
#include "gl_interop_common.hh"
|
||||
|
||||
namespace {
|
||||
constexpr std::array<hipGLDeviceList, 3> kDeviceLists{
|
||||
hipGLDeviceListAll, hipGLDeviceListCurrentFrame, hipGLDeviceListNextFrame};
|
||||
} // anonymous namespace
|
||||
|
||||
// For each device list kind, hipGLGetDevices must succeed, report exactly one GL device and
// return device 0 in the first output slot.
TEST_CASE("Unit_hipGLGetDevices_Positive_Basic") {
  GLContextScopeGuard gl_context;

  const int device_count = HipTest::getDeviceCount();
  const auto device_list = GENERATE(from_range(kDeviceLists.begin(), kDeviceLists.end()));

  // Output slots start at -1 so an untouched entry can be told apart from device 0.
  std::vector<int> gl_devices(device_count, -1);
  unsigned int gl_device_count = 0;

  HIP_CHECK(hipGLGetDevices(&gl_device_count, gl_devices.data(), device_count, device_list));

  REQUIRE(gl_device_count == 1);
  REQUIRE(gl_devices.at(0) == 0);
}
|
||||
|
||||
// Checks the argument combinations hipGLGetDevices is expected to accept: either output pointer
// may be null, and a capacity of zero still reports the device count.
TEST_CASE("Unit_hipGLGetDevices_Positive_Parameters") {
  GLContextScopeGuard gl_context;

  const int device_count = HipTest::getDeviceCount();

  // Output slots start at -1 so an untouched entry can be told apart from device 0.
  std::vector<int> gl_devices(device_count, -1);
  unsigned int gl_device_count = 0;

  SECTION("pHipDeviceCount == nullptr") {
    // Without a count pointer the device array is still filled in.
    HIP_CHECK(hipGLGetDevices(nullptr, gl_devices.data(), device_count, hipGLDeviceListAll));
    REQUIRE(gl_devices.at(0) == 0);
  }

  SECTION("pHipDevices == nullptr") {
    // Without a device array the count is still reported.
    HIP_CHECK(hipGLGetDevices(&gl_device_count, nullptr, device_count, hipGLDeviceListAll));
    REQUIRE(gl_device_count == 1);
  }

  SECTION("hipDeviceCount == 0") {
    // With zero capacity the count is reported and the array is left untouched.
    HIP_CHECK(hipGLGetDevices(&gl_device_count, gl_devices.data(), 0, hipGLDeviceListAll));
    REQUIRE(gl_device_count == 1);
    REQUIRE(gl_devices.at(0) == -1);
  }
}
|
||||
|
||||
// An out-of-range device list value must be rejected with hipErrorInvalidValue, leaving both
// outputs at their initial values.
TEST_CASE("Unit_hipGLGetDevices_Negative_Parameters") {
  GLContextScopeGuard gl_context;

  const int device_count = HipTest::getDeviceCount();

  std::vector<int> gl_devices(device_count, -1);
  unsigned int gl_device_count = 0;

  SECTION("invalid deviceList") {
    HIP_CHECK_ERROR(hipGLGetDevices(&gl_device_count, gl_devices.data(), device_count,
                                    static_cast<hipGLDeviceList>(-1)),
                    hipErrorInvalidValue);
    REQUIRE(gl_device_count == 0);
    REQUIRE(gl_devices.at(0) == -1);
  }
}
|
||||
@@ -0,0 +1,98 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <hip/hip_gl_interop.h>
|
||||
|
||||
#include "gl_interop_common.hh"
|
||||
|
||||
namespace {
|
||||
constexpr std::array<unsigned int, 3> kFlags{hipGraphicsRegisterFlagsNone,
|
||||
hipGraphicsRegisterFlagsReadOnly,
|
||||
hipGraphicsRegisterFlagsWriteDiscard};
|
||||
} // anonymous namespace
|
||||
|
||||
// Registers a GL buffer with each flag in kFlags and unregisters it again; both calls must
// succeed.
TEST_CASE("Unit_hipGraphicsGLRegisterBuffer_Positive_Basic") {
  GLContextScopeGuard gl_context;

  const auto flags = GENERATE(from_range(kFlags.begin(), kFlags.end()));

  GLBufferObject vbo;
  hipGraphicsResource* vbo_resource;

  HIP_CHECK(hipGraphicsGLRegisterBuffer(&vbo_resource, vbo, flags));
  HIP_CHECK(hipGraphicsUnregisterResource(vbo_resource));
}
|
||||
|
||||
// The same GL buffer can be registered twice; each registration yields a resource that is
// unregistered separately.
TEST_CASE("Unit_hipGraphicsGLRegisterBuffer_Positive_Register_Twice") {
  GLContextScopeGuard gl_context;

  GLBufferObject vbo;

  hipGraphicsResource* vbo_resource_1;
  hipGraphicsResource* vbo_resource_2;

  // Register the buffer twice before releasing either registration.
  HIP_CHECK(hipGraphicsGLRegisterBuffer(&vbo_resource_1, vbo, hipGraphicsRegisterFlagsNone));
  HIP_CHECK(hipGraphicsGLRegisterBuffer(&vbo_resource_2, vbo, hipGraphicsRegisterFlagsNone));

  HIP_CHECK(hipGraphicsUnregisterResource(vbo_resource_1));
  HIP_CHECK(hipGraphicsUnregisterResource(vbo_resource_2));
}
|
||||
|
||||
// Every invalid argument to hipGraphicsGLRegisterBuffer must be rejected with
// hipErrorInvalidValue: a null output pointer, a zero buffer name, an all-ones flag value, and
// the two flags checked below (surface load/store and texture gather).
TEST_CASE("Unit_hipGraphicsGLRegisterBuffer_Negative_Parameters") {
  GLContextScopeGuard gl_context;

  GLBufferObject vbo;
  hipGraphicsResource* vbo_resource;

  SECTION("resource == nullptr") {
    HIP_CHECK_ERROR(hipGraphicsGLRegisterBuffer(nullptr, vbo, hipGraphicsRegisterFlagsNone),
                    hipErrorInvalidValue);
  }

  SECTION("invalid buffer") {
    // GLuint{} is the zero name, which the test treats as an invalid buffer.
    HIP_CHECK_ERROR(
        hipGraphicsGLRegisterBuffer(&vbo_resource, GLuint{}, hipGraphicsRegisterFlagsNone),
        hipErrorInvalidValue);
  }

  SECTION("invalid flags") {
    HIP_CHECK_ERROR(
        hipGraphicsGLRegisterBuffer(&vbo_resource, vbo, std::numeric_limits<unsigned int>::max()),
        hipErrorInvalidValue);
  }

  SECTION("flags == hipGraphicsRegisterFlagsSurfaceLoadStore") {
    HIP_CHECK_ERROR(
        hipGraphicsGLRegisterBuffer(&vbo_resource, vbo, hipGraphicsRegisterFlagsSurfaceLoadStore),
        hipErrorInvalidValue);
  }

  SECTION("flags == hipGraphicsRegisterFlagsTextureGather") {
    HIP_CHECK_ERROR(
        hipGraphicsGLRegisterBuffer(&vbo_resource, vbo, hipGraphicsRegisterFlagsTextureGather),
        hipErrorInvalidValue);
  }
}
|
||||
@@ -0,0 +1,102 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <hip/hip_gl_interop.h>
|
||||
|
||||
#include "gl_interop_common.hh"
|
||||
|
||||
namespace {
|
||||
constexpr std::array<unsigned int, 5> kFlags{
|
||||
hipGraphicsRegisterFlagsNone, hipGraphicsRegisterFlagsReadOnly,
|
||||
hipGraphicsRegisterFlagsWriteDiscard, hipGraphicsRegisterFlagsSurfaceLoadStore,
|
||||
hipGraphicsRegisterFlagsTextureGather};
|
||||
} // anonymous namespace
|
||||
|
||||
// Registers a 2D GL texture with each flag in kFlags and unregisters it again; both calls must
// succeed.
TEST_CASE("Unit_hipGraphicsGLRegisterImage_Positive_Basic") {
  GLContextScopeGuard gl_context;

  const auto flags = GENERATE(from_range(kFlags.begin(), kFlags.end()));

  GLImageObject tex;
  hipGraphicsResource* tex_resource;

  HIP_CHECK(hipGraphicsGLRegisterImage(&tex_resource, tex, GL_TEXTURE_2D, flags));
  HIP_CHECK(hipGraphicsUnregisterResource(tex_resource));
}
|
||||
|
||||
// The same GL texture can be registered twice; each registration yields a resource that is
// unregistered separately.
TEST_CASE("Unit_hipGraphicsGLRegisterImage_Positive_Register_Twice") {
  GLContextScopeGuard gl_context;

  GLImageObject tex;

  hipGraphicsResource* tex_resource_1;
  hipGraphicsResource* tex_resource_2;

  // Register the texture twice before releasing either registration.
  HIP_CHECK(hipGraphicsGLRegisterImage(&tex_resource_1, tex, GL_TEXTURE_2D,
                                       hipGraphicsRegisterFlagsNone));
  HIP_CHECK(hipGraphicsGLRegisterImage(&tex_resource_2, tex, GL_TEXTURE_2D,
                                       hipGraphicsRegisterFlagsNone));

  HIP_CHECK(hipGraphicsUnregisterResource(tex_resource_1));
  HIP_CHECK(hipGraphicsUnregisterResource(tex_resource_2));
}
|
||||
|
||||
// Every invalid argument to hipGraphicsGLRegisterImage must be rejected with
// hipErrorInvalidValue: a null output pointer, a zero texture name, an invalid target, a target
// that does not match the object, and an all-ones flag value.
TEST_CASE("Unit_hipGraphicsGLRegisterImage_Negative_Parameters") {
  GLContextScopeGuard gl_context;

  GLImageObject tex;
  hipGraphicsResource* tex_resource;

  SECTION("resource == nullptr") {
    HIP_CHECK_ERROR(
        hipGraphicsGLRegisterImage(nullptr, tex, GL_TEXTURE_2D, hipGraphicsRegisterFlagsNone),
        hipErrorInvalidValue);
  }

  SECTION("invalid image") {
    // GLuint{} is the zero name, which the test treats as an invalid image.
    HIP_CHECK_ERROR(hipGraphicsGLRegisterImage(&tex_resource, GLuint{}, GL_TEXTURE_2D,
                                               hipGraphicsRegisterFlagsNone),
                    hipErrorInvalidValue);
  }

  SECTION("invalid target") {
    HIP_CHECK_ERROR(
        hipGraphicsGLRegisterImage(&tex_resource, tex, GL_BUFFER, hipGraphicsRegisterFlagsNone),
        hipErrorInvalidValue);
  }

  SECTION("target does not match the object") {
    // tex is a 2D texture, so a renderbuffer target is a mismatch.
    HIP_CHECK_ERROR(hipGraphicsGLRegisterImage(&tex_resource, tex, GL_RENDERBUFFER,
                                               hipGraphicsRegisterFlagsNone),
                    hipErrorInvalidValue);
  }

  SECTION("invalid flags") {
    HIP_CHECK_ERROR(hipGraphicsGLRegisterImage(&tex_resource, tex, GL_TEXTURE_2D,
                                               std::numeric_limits<unsigned int>::max()),
                    hipErrorInvalidValue);
  }
}
|
||||
@@ -0,0 +1,93 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <hip/hip_gl_interop.h>
|
||||
|
||||
#include "gl_interop_common.hh"
|
||||
|
||||
// Maps and unmaps a registered buffer and a registered image together, in one call each, on a
// freshly created stream.
TEST_CASE("Unit_hipGraphicsMapResources_Positive_Basic") {
  GLContextScopeGuard gl_context;

  GLBufferObject vbo;
  GLImageObject tex;

  // Slot 0 holds the buffer resource, slot 1 the image resource.
  std::array<hipGraphicsResource_t, 2> resources;

  HIP_CHECK(hipGraphicsGLRegisterBuffer(&resources.at(0), vbo, hipGraphicsRegisterFlagsNone));
  HIP_CHECK(hipGraphicsGLRegisterImage(&resources.at(1), tex, GL_TEXTURE_2D,
                                       hipGraphicsRegisterFlagsNone));

  hipStream_t stream;
  HIP_CHECK(hipStreamCreate(&stream));

  HIP_CHECK(hipGraphicsMapResources(resources.size(), resources.data(), stream));
  HIP_CHECK(hipGraphicsUnmapResources(resources.size(), resources.data(), stream));

  HIP_CHECK(hipStreamDestroy(stream));

  HIP_CHECK(hipGraphicsUnregisterResource(resources.at(0)));
  HIP_CHECK(hipGraphicsUnregisterResource(resources.at(1)));
}
|
||||
|
||||
// Checks the error hipGraphicsMapResources returns for each kind of bad input. A buffer resource
// is registered up front for the sections to use and unregistered once the section has run.
TEST_CASE("Unit_hipGraphicsMapResources_Negative_Parameters") {
  GLContextScopeGuard gl_context;

  GLBufferObject vbo;
  hipGraphicsResource* vbo_resource;

  HIP_CHECK(hipGraphicsGLRegisterBuffer(&vbo_resource, vbo, hipGraphicsRegisterFlagsNone));

  SECTION("count == 0") {
    HIP_CHECK_ERROR(hipGraphicsMapResources(0, &vbo_resource, 0), hipErrorInvalidValue);
  }

  SECTION("resources == nullptr") {
    HIP_CHECK_ERROR(hipGraphicsMapResources(1, nullptr, 0), hipErrorInvalidValue);
  }

  SECTION("unregistered resource") {
    // Register and immediately unregister to obtain a handle that is no longer valid.
    hipGraphicsResource* unregistered_resource;
    HIP_CHECK(
        hipGraphicsGLRegisterBuffer(&unregistered_resource, vbo, hipGraphicsRegisterFlagsNone));
    HIP_CHECK(hipGraphicsUnregisterResource(unregistered_resource));
    HIP_CHECK_ERROR(hipGraphicsMapResources(1, &unregistered_resource, 0), hipErrorInvalidHandle);
  }

  SECTION("already mapped resource") {
    // The second map of the same resource must fail; the first mapping is undone afterwards.
    HIP_CHECK(hipGraphicsMapResources(1, &vbo_resource, 0));
    HIP_CHECK_ERROR(hipGraphicsMapResources(1, &vbo_resource, 0), hipErrorAlreadyMapped);
    HIP_CHECK(hipGraphicsUnmapResources(1, &vbo_resource, 0));
  }

  SECTION("invalid stream") {
    // A stream handle that has already been destroyed is used as the invalid stream.
    hipStream_t stream;
    HIP_CHECK(hipStreamCreate(&stream));
    HIP_CHECK(hipStreamDestroy(stream));
    HIP_CHECK_ERROR(hipGraphicsMapResources(1, &vbo_resource, stream), hipErrorContextIsDestroyed);
  }

  HIP_CHECK(hipGraphicsUnregisterResource(vbo_resource));
}
|
||||
@@ -0,0 +1,151 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <hip/hip_gl_interop.h>
|
||||
|
||||
#include "gl_interop_common.hh"
|
||||
|
||||
/**
 * Basic positive test for hipGraphicsResourceGetMappedPointer.
 *
 * Registers a GL buffer object with HIP, maps it on the null stream, and verifies that
 * the query returns a non-null pointer together with a size equal to the buffer's kSize.
 * The resource is unmapped and unregistered before the test case ends.
 */
TEST_CASE("Unit_hipGraphicsResourceGetMappedPointer_Positive_Basic") {
  GLContextScopeGuard gl_context;
  GLBufferObject vbo;

  // Register the GL buffer and map it so that a pointer can be queried.
  hipGraphicsResource* resource;
  HIP_CHECK(hipGraphicsGLRegisterBuffer(&resource, vbo, hipGraphicsRegisterFlagsNone));
  HIP_CHECK(hipGraphicsMapResources(1, &resource, 0));

  // Outputs of the query; initialised so that "left untouched" is detectable.
  size_t mapped_size = 0;
  float* mapped_ptr = nullptr;
  void** const mapped_ptr_out = reinterpret_cast<void**>(&mapped_ptr);

  HIP_CHECK(hipGraphicsResourceGetMappedPointer(mapped_ptr_out, &mapped_size, resource));

  REQUIRE(mapped_ptr != nullptr);
  REQUIRE(mapped_size == vbo.kSize);

  // Release in reverse order of acquisition.
  HIP_CHECK(hipGraphicsUnmapResources(1, &resource, 0));
  HIP_CHECK(hipGraphicsUnregisterResource(resource));
}
|
||||
|
||||
/**
 * Positive parameter test for hipGraphicsResourceGetMappedPointer.
 *
 * With a registered and mapped GL buffer, checks that the call succeeds when either of
 * the two output arguments is nullptr, and that the remaining output is still filled in.
 */
TEST_CASE("Unit_hipGraphicsResourceGetMappedPointer_Positive_Parameters") {
  GLContextScopeGuard gl_context;
  GLBufferObject vbo;

  hipGraphicsResource* resource;
  HIP_CHECK(hipGraphicsGLRegisterBuffer(&resource, vbo, hipGraphicsRegisterFlagsNone));
  HIP_CHECK(hipGraphicsMapResources(1, &resource, 0));

  // Only the size is requested.
  SECTION("devPtr == nullptr") {
    size_t mapped_size = 0;
    HIP_CHECK(hipGraphicsResourceGetMappedPointer(nullptr, &mapped_size, resource));
    REQUIRE(mapped_size == vbo.kSize);
  }

  // Only the pointer is requested.
  SECTION("size == nullptr") {
    float* mapped_ptr = nullptr;
    void** const mapped_ptr_out = reinterpret_cast<void**>(&mapped_ptr);
    HIP_CHECK(hipGraphicsResourceGetMappedPointer(mapped_ptr_out, nullptr, resource));
    REQUIRE(mapped_ptr != nullptr);
  }

  // Common teardown executed after whichever section ran.
  HIP_CHECK(hipGraphicsUnmapResources(1, &resource, 0));
  HIP_CHECK(hipGraphicsUnregisterResource(resource));
}
|
||||
|
||||
/**
 * Negative parameter tests for hipGraphicsResourceGetMappedPointer.
 *
 * A GL buffer is registered and mapped up front and released at the end. Each section
 * builds its own resource in a state the query is expected to reject and checks the
 * exact error code returned.
 */
TEST_CASE("Unit_hipGraphicsResourceGetMappedPointer_Negative_Parameters") {
  GLContextScopeGuard gl_context;

  GLBufferObject vbo;

  hipGraphicsResource* vbo_resource;

  HIP_CHECK(hipGraphicsGLRegisterBuffer(&vbo_resource, vbo, hipGraphicsRegisterFlagsNone));

  HIP_CHECK(hipGraphicsMapResources(1, &vbo_resource, 0));

  // Output arguments shared by all sections.
  float* buffer_devptr = nullptr;
  size_t size = 0;

  // A mapped GL image (texture) resource is queried through the pointer API.
  SECTION("non-pointer resource") {
    GLImageObject tex;
    hipGraphicsResource* tex_resource;

    HIP_CHECK(hipGraphicsGLRegisterImage(&tex_resource, tex, GL_TEXTURE_2D,
                                         hipGraphicsRegisterFlagsNone));
    HIP_CHECK(hipGraphicsMapResources(1, &tex_resource, 0));

    HIP_CHECK_ERROR(hipGraphicsResourceGetMappedPointer(reinterpret_cast<void**>(&buffer_devptr),
                                                        &size, tex_resource),
                    hipErrorNotMappedAsPointer);

    HIP_CHECK(hipGraphicsUnmapResources(1, &tex_resource, 0));
    HIP_CHECK(hipGraphicsUnregisterResource(tex_resource));
  }

  // The handle is used after it has been unregistered.
  SECTION("unregistered resource") {
    hipGraphicsResource* unregistered_resource;
    HIP_CHECK(
        hipGraphicsGLRegisterBuffer(&unregistered_resource, vbo, hipGraphicsRegisterFlagsNone));
    HIP_CHECK(hipGraphicsUnregisterResource(unregistered_resource));
    HIP_CHECK_ERROR(hipGraphicsResourceGetMappedPointer(reinterpret_cast<void**>(&buffer_devptr),
                                                        &size, unregistered_resource),
                    hipErrorContextIsDestroyed);
  }

  // The resource is registered but has never been mapped.
  SECTION("not mapped resource") {
    hipGraphicsResource* not_mapped_resource;
    HIP_CHECK(hipGraphicsGLRegisterBuffer(&not_mapped_resource, vbo, hipGraphicsRegisterFlagsNone));
    HIP_CHECK_ERROR(hipGraphicsResourceGetMappedPointer(reinterpret_cast<void**>(&buffer_devptr),
                                                        &size, not_mapped_resource),
                    hipErrorNotMapped);
    HIP_CHECK(hipGraphicsUnregisterResource(not_mapped_resource));
  }

  // The resource was mapped and then unmapped again before the query.
  SECTION("unmapped resource") {
    hipGraphicsResource* unmapped_resource;

    HIP_CHECK(hipGraphicsGLRegisterBuffer(&unmapped_resource, vbo, hipGraphicsRegisterFlagsNone));

    HIP_CHECK(hipGraphicsMapResources(1, &unmapped_resource, 0));
    HIP_CHECK(hipGraphicsUnmapResources(1, &unmapped_resource, 0));

    HIP_CHECK_ERROR(hipGraphicsResourceGetMappedPointer(reinterpret_cast<void**>(&buffer_devptr),
                                                        &size, unmapped_resource),
                    hipErrorNotMapped);

    HIP_CHECK(hipGraphicsUnregisterResource(unmapped_resource));
  }

  // Common teardown for the resource created before the sections.
  HIP_CHECK(hipGraphicsUnmapResources(1, &vbo_resource, 0));

  HIP_CHECK(hipGraphicsUnregisterResource(vbo_resource));
}
|
||||
@@ -0,0 +1,132 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <hip/hip_gl_interop.h>
|
||||
|
||||
#include "gl_interop_common.hh"
|
||||
|
||||
/**
 * Basic positive test for hipGraphicsSubResourceGetMappedArray.
 *
 * Registers a GL image as a GL_TEXTURE_2D resource, maps it on the null stream, and
 * verifies that querying array index 0 / mip level 0 produces a non-null hipArray
 * pointer. The resource is unmapped and unregistered afterwards.
 */
TEST_CASE("Unit_hipGraphicsSubResourceGetMappedArray_Positive_Basic") {
  GLContextScopeGuard gl_context;
  GLImageObject tex;

  hipGraphicsResource* resource;
  HIP_CHECK(
      hipGraphicsGLRegisterImage(&resource, tex, GL_TEXTURE_2D, hipGraphicsRegisterFlagsNone));
  HIP_CHECK(hipGraphicsMapResources(1, &resource, 0));

  // Starts out null so that a successful call which writes nothing would be caught.
  hipArray* mapped_array = nullptr;
  HIP_CHECK(hipGraphicsSubResourceGetMappedArray(&mapped_array, resource, 0, 0));
  REQUIRE(mapped_array != nullptr);

  // Release in reverse order of acquisition.
  HIP_CHECK(hipGraphicsUnmapResources(1, &resource, 0));
  HIP_CHECK(hipGraphicsUnregisterResource(resource));
}
|
||||
|
||||
/**
 * Parameter tests for hipGraphicsSubResourceGetMappedArray.
 *
 * A GL image is registered as a GL_TEXTURE_2D resource and mapped up front, then released
 * at the end. Each section passes an argument or resource state under test and checks
 * the resulting status code.
 */
TEST_CASE("Unit_hipGraphicsSubResourceGetMappedArray_Negative_Parameters") {
  GLContextScopeGuard gl_context;

  GLImageObject tex;

  hipGraphicsResource* tex_resource;

  HIP_CHECK(
      hipGraphicsGLRegisterImage(&tex_resource, tex, GL_TEXTURE_2D, hipGraphicsRegisterFlagsNone));

  HIP_CHECK(hipGraphicsMapResources(1, &tex_resource, 0));

  // Output argument shared by the sections below.
  hipArray* image_devptr = nullptr;

  // NOTE(review): this section expects success (HIP_CHECK) although the test case is
  // named "Negative" — confirm that a null output array is meant to be accepted.
  SECTION("array == nullptr") {
    HIP_CHECK(hipGraphicsSubResourceGetMappedArray(nullptr, tex_resource, 0, 0));
  }

  // A mapped GL buffer resource is queried through the array API.
  SECTION("non-texture resource") {
    GLBufferObject vbo;
    hipGraphicsResource* vbo_resource;

    HIP_CHECK(hipGraphicsGLRegisterBuffer(&vbo_resource, vbo, hipGraphicsRegisterFlagsNone));
    HIP_CHECK(hipGraphicsMapResources(1, &vbo_resource, 0));

    HIP_CHECK_ERROR(hipGraphicsSubResourceGetMappedArray(&image_devptr, vbo_resource, 0, 0),
                    hipErrorNotMappedAsArray);

    HIP_CHECK(hipGraphicsUnmapResources(1, &vbo_resource, 0));
    HIP_CHECK(hipGraphicsUnregisterResource(vbo_resource));
  }

  // The handle is used after it has been unregistered.
  SECTION("unregistered resource") {
    hipGraphicsResource* unregistered_resource;
    HIP_CHECK(hipGraphicsGLRegisterImage(&unregistered_resource, tex, GL_TEXTURE_2D,
                                         hipGraphicsRegisterFlagsNone));
    HIP_CHECK(hipGraphicsUnregisterResource(unregistered_resource));
    HIP_CHECK_ERROR(
        hipGraphicsSubResourceGetMappedArray(&image_devptr, unregistered_resource, 0, 0),
        hipErrorContextIsDestroyed);
  }

  // The resource is registered but has never been mapped.
  SECTION("not mapped resource") {
    hipGraphicsResource* not_mapped_resource;
    HIP_CHECK(hipGraphicsGLRegisterImage(&not_mapped_resource, tex, GL_TEXTURE_2D,
                                         hipGraphicsRegisterFlagsNone));
    HIP_CHECK_ERROR(hipGraphicsSubResourceGetMappedArray(&image_devptr, not_mapped_resource, 0, 0),
                    hipErrorNotMapped);
    HIP_CHECK(hipGraphicsUnregisterResource(not_mapped_resource));
  }

  // The resource was mapped and then unmapped again before the query.
  SECTION("unmapped resource") {
    hipGraphicsResource* unmapped_resource;

    HIP_CHECK(hipGraphicsGLRegisterImage(&unmapped_resource, tex, GL_TEXTURE_2D,
                                         hipGraphicsRegisterFlagsNone));

    HIP_CHECK(hipGraphicsMapResources(1, &unmapped_resource, 0));
    HIP_CHECK(hipGraphicsUnmapResources(1, &unmapped_resource, 0));

    HIP_CHECK_ERROR(hipGraphicsSubResourceGetMappedArray(&image_devptr, unmapped_resource, 0, 0),
                    hipErrorNotMapped);

    HIP_CHECK(hipGraphicsUnregisterResource(unmapped_resource));
  }

  // The largest int value is used as the array index.
  SECTION("invalid arrayIndex") {
    HIP_CHECK_ERROR(hipGraphicsSubResourceGetMappedArray(&image_devptr, tex_resource,
                                                         std::numeric_limits<int>::max(), 0),
                    hipErrorInvalidValue);
  }

  // The largest int value is used as the mip level.
  SECTION("invalid mipLevel") {
    HIP_CHECK_ERROR(hipGraphicsSubResourceGetMappedArray(&image_devptr, tex_resource, 0,
                                                         std::numeric_limits<int>::max()),
                    hipErrorInvalidValue);
  }

  // Common teardown for the resource created before the sections.
  HIP_CHECK(hipGraphicsUnmapResources(1, &tex_resource, 0));

  HIP_CHECK(hipGraphicsUnregisterResource(tex_resource));
}
|
||||
@@ -0,0 +1,66 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <hip/hip_gl_interop.h>
|
||||
|
||||
#include "gl_interop_common.hh"
|
||||
|
||||
/**
 * Negative parameter tests for hipGraphicsUnmapResources.
 *
 * A GL buffer is registered and mapped up front. Each section issues an unmap call that
 * is expected to fail and checks the exact error code; the valid resource is then
 * unmapped and unregistered in the common teardown.
 */
TEST_CASE("Unit_hipGraphicsUnmapResources_Negative_Parameters") {
  GLContextScopeGuard gl_context;

  GLBufferObject vbo;

  hipGraphicsResource* vbo_resource;

  HIP_CHECK(hipGraphicsGLRegisterBuffer(&vbo_resource, vbo, hipGraphicsRegisterFlagsNone));

  HIP_CHECK(hipGraphicsMapResources(1, &vbo_resource, 0));

  SECTION("count == 0") {
    HIP_CHECK_ERROR(hipGraphicsUnmapResources(0, &vbo_resource, 0), hipErrorInvalidValue);
  }

  SECTION("resources == nullptr") {
    HIP_CHECK_ERROR(hipGraphicsUnmapResources(1, nullptr, 0), hipErrorInvalidValue);
  }

  // The resource is registered but has never been mapped.
  SECTION("not mapped resource") {
    hipGraphicsResource* not_mapped_resource;
    HIP_CHECK(hipGraphicsGLRegisterBuffer(&not_mapped_resource, vbo, hipGraphicsRegisterFlagsNone));
    HIP_CHECK_ERROR(hipGraphicsUnmapResources(1, &not_mapped_resource, 0), hipErrorNotMapped);
    HIP_CHECK(hipGraphicsUnregisterResource(not_mapped_resource));
  }

  // The stream handle is passed after the stream has been destroyed.
  SECTION("invalid stream") {
    hipStream_t stream;
    HIP_CHECK(hipStreamCreate(&stream));
    HIP_CHECK(hipStreamDestroy(stream));
    HIP_CHECK_ERROR(hipGraphicsUnmapResources(1, &vbo_resource, stream),
                    hipErrorContextIsDestroyed);
  }

  // Common teardown: the failed calls above are expected to leave vbo_resource mapped.
  HIP_CHECK(hipGraphicsUnmapResources(1, &vbo_resource, 0));

  HIP_CHECK(hipGraphicsUnregisterResource(vbo_resource));
}
|
||||
@@ -0,0 +1,48 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <hip/hip_gl_interop.h>
|
||||
|
||||
#include "gl_interop_common.hh"
|
||||
|
||||
/**
 * Negative parameter tests for hipGraphicsUnregisterResource.
 *
 * Each section registers its own resource from a shared GL buffer, puts it into a state
 * in which unregistering is expected to fail, and checks the exact error code.
 */
TEST_CASE("Unit_hipGraphicsUnregisterResource_Negative_Parameters") {
  GLContextScopeGuard gl_context;
  GLBufferObject vbo;

  // Unregistering the same handle a second time.
  SECTION("already unregistered resource") {
    hipGraphicsResource* stale_resource;
    HIP_CHECK(hipGraphicsGLRegisterBuffer(&stale_resource, vbo, hipGraphicsRegisterFlagsNone));
    HIP_CHECK(hipGraphicsUnregisterResource(stale_resource));
    HIP_CHECK_ERROR(hipGraphicsUnregisterResource(stale_resource), hipErrorInvalidContext);
  }

  // Unregistering while the resource is still mapped.
  // NOTE(review): this section does not unmap or unregister the resource afterwards —
  // confirm whether cleanup is intentionally omitted here.
  SECTION("mapped resource") {
    hipGraphicsResource* live_resource;
    HIP_CHECK(hipGraphicsGLRegisterBuffer(&live_resource, vbo, hipGraphicsRegisterFlagsNone));
    HIP_CHECK(hipGraphicsMapResources(1, &live_resource, 0));
    HIP_CHECK_ERROR(hipGraphicsUnregisterResource(live_resource), hipErrorAlreadyMapped);
  }
}
|
||||
@@ -103,6 +103,7 @@ set(TEST_SRC
|
||||
hipGraphKernelNodeSetParams.cc
|
||||
hipGraphExecKernelNodeSetParams.cc
|
||||
hipGraphLaunch.cc
|
||||
hipGraphLaunch_old.cc
|
||||
hipGraphMemcpyNodeSetParams1D.cc
|
||||
hipGraphExecMemcpyNodeSetParamsToSymbol_old.cc
|
||||
hipGraphExecMemcpyNodeSetParamsToSymbol.cc
|
||||
|
||||
@@ -40,19 +40,26 @@ end. Instantiate and Launch the Graph. Wait for the event to complete.
|
||||
Verify that hipEventElapsedTime() returns error.
|
||||
6) Validate scenario 2 by running the graph multiple times in a loop
|
||||
(100 times) after instantiation.
|
||||
7) Negative Scenarios
|
||||
7) Validate that no error is reported when numDeps <= dependencies length
|
||||
8) Negative Scenarios
|
||||
- Output node is a nullptr.
|
||||
- Input graph is a nullptr.
|
||||
- Input dependencies is a nullptr.
|
||||
- Node in dependency is from different graph
|
||||
- Invalid numNodes
|
||||
- Duplicate node in dependencies
|
||||
- Input event is a nullptr.
|
||||
- Input graph is uninitialized.
|
||||
- Input event is uninitialized.
|
||||
*/
|
||||
#include <functional>
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip_test_checkers.hh>
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip_test_kernels.hh>
|
||||
|
||||
#include "graph_tests_common.hh"
|
||||
|
||||
/**
|
||||
* Scenario 1: Create a simple graph with just one event record
|
||||
* node and instantiate and launch the graph.
|
||||
@@ -66,8 +73,7 @@ TEST_CASE("Unit_hipGraphAddEventRecordNode_Functional_Simple") {
|
||||
hipEvent_t event;
|
||||
HIP_CHECK(hipEventCreate(&event));
|
||||
hipGraphNode_t eventrec;
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0,
|
||||
event));
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event));
|
||||
// Instantiate and launch the graph
|
||||
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
||||
HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
|
||||
@@ -82,8 +88,8 @@ TEST_CASE("Unit_hipGraphAddEventRecordNode_Functional_Simple") {
|
||||
/**
|
||||
* Local test function
|
||||
*/
|
||||
static void validateAddEventRecordNode(bool measureTime, bool withFlags,
|
||||
int nstep, unsigned flag = 0) {
|
||||
static void validateAddEventRecordNode(bool measureTime, bool withFlags, int nstep,
|
||||
unsigned flag = 0) {
|
||||
constexpr size_t N = 1024;
|
||||
constexpr size_t Nbytes = N * sizeof(int);
|
||||
constexpr auto blocksPerCU = 6; // to hide latency
|
||||
@@ -111,8 +117,7 @@ static void validateAddEventRecordNode(bool measureTime, bool withFlags,
|
||||
memsetParams.elementSize = sizeof(char);
|
||||
memsetParams.width = Nbytes;
|
||||
memsetParams.height = 1;
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0,
|
||||
&memsetParams));
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, &memsetParams));
|
||||
memset(&memsetParams, 0, sizeof(memsetParams));
|
||||
memsetParams.dst = reinterpret_cast<void*>(B_d);
|
||||
memsetParams.value = 0;
|
||||
@@ -120,38 +125,34 @@ static void validateAddEventRecordNode(bool measureTime, bool withFlags,
|
||||
memsetParams.elementSize = sizeof(char);
|
||||
memsetParams.width = Nbytes;
|
||||
memsetParams.height = 1;
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memset_B, graph, nullptr, 0,
|
||||
&memsetParams));
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memset_B, graph, nullptr, 0, &memsetParams));
|
||||
|
||||
void* kernelArgs1[] = {&C_d, &memsetVal, reinterpret_cast<void *>(&NElem)};
|
||||
kernelNodeParams.func =
|
||||
reinterpret_cast<void *>(HipTest::memsetReverse<int>);
|
||||
void* kernelArgs1[] = {&C_d, &memsetVal, reinterpret_cast<void*>(&NElem)};
|
||||
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::memsetReverse<int>);
|
||||
kernelNodeParams.gridDim = dim3(blocks);
|
||||
kernelNodeParams.blockDim = dim3(threadsPerBlock);
|
||||
kernelNodeParams.sharedMemBytes = 0;
|
||||
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs1);
|
||||
kernelNodeParams.extra = nullptr;
|
||||
HIP_CHECK(hipGraphAddKernelNode(&memsetKer_C, graph, nullptr, 0,
|
||||
&kernelNodeParams));
|
||||
HIP_CHECK(hipGraphAddKernelNode(&memsetKer_C, graph, nullptr, 0, &kernelNodeParams));
|
||||
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D_A, graph, nullptr, 0, A_d,
|
||||
A_h, Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D_A, graph, nullptr, 0, A_d, A_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D_B, graph, nullptr, 0, B_d,
|
||||
B_h, Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D_B, graph, nullptr, 0, B_d, B_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_C, graph, nullptr, 0, C_h,
|
||||
C_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_C, graph, nullptr, 0, C_h, C_d, Nbytes,
|
||||
hipMemcpyDeviceToHost));
|
||||
|
||||
void* kernelArgs2[] = {&A_d, &B_d, &C_d, reinterpret_cast<void *>(&NElem)};
|
||||
kernelNodeParams.func = reinterpret_cast<void *>(HipTest::vectorADD<int>);
|
||||
void* kernelArgs2[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
|
||||
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
|
||||
kernelNodeParams.gridDim = dim3(blocks);
|
||||
kernelNodeParams.blockDim = dim3(threadsPerBlock);
|
||||
kernelNodeParams.sharedMemBytes = 0;
|
||||
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs2);
|
||||
kernelNodeParams.extra = nullptr;
|
||||
HIP_CHECK(hipGraphAddKernelNode(&ker_vecAdd, graph, nullptr, 0,
|
||||
&kernelNodeParams));
|
||||
HIP_CHECK(hipGraphAddKernelNode(&ker_vecAdd, graph, nullptr, 0, &kernelNodeParams));
|
||||
hipEvent_t eventstart, eventend;
|
||||
if (withFlags) {
|
||||
HIP_CHECK(hipEventCreateWithFlags(&eventstart, flag));
|
||||
@@ -161,10 +162,8 @@ static void validateAddEventRecordNode(bool measureTime, bool withFlags,
|
||||
HIP_CHECK(hipEventCreate(&eventend));
|
||||
}
|
||||
hipGraphNode_t event_start, event_final;
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&event_start, graph, nullptr, 0,
|
||||
eventstart));
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&event_final, graph, nullptr, 0,
|
||||
eventend));
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&event_start, graph, nullptr, 0, eventstart));
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&event_final, graph, nullptr, 0, eventend));
|
||||
// Create dependencies
|
||||
HIP_CHECK(hipGraphAddDependencies(graph, &event_start, &memset_A, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph, &event_start, &memset_B, 1));
|
||||
@@ -260,7 +259,7 @@ TEST_CASE("Unit_hipGraphAddEventRecordNode_Functional_TimingDisabled") {
|
||||
HIP_CHECK(hipEventCreateWithFlags(&event_start, hipEventDisableTiming));
|
||||
HIP_CHECK(hipEventCreateWithFlags(&event_end, hipEventDisableTiming));
|
||||
// memset node
|
||||
char *A_d;
|
||||
char* A_d;
|
||||
hipGraphNode_t memset_A;
|
||||
hipMemsetParams memsetParams{};
|
||||
HIP_CHECK(hipMalloc(&A_d, Nbytes));
|
||||
@@ -271,14 +270,11 @@ TEST_CASE("Unit_hipGraphAddEventRecordNode_Functional_TimingDisabled") {
|
||||
memsetParams.elementSize = sizeof(char);
|
||||
memsetParams.width = Nbytes;
|
||||
memsetParams.height = 1;
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0,
|
||||
&memsetParams));
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, &memsetParams));
|
||||
|
||||
hipGraphNode_t event_node_start, event_node_end;
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&event_node_start, graph, nullptr, 0,
|
||||
event_start));
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&event_node_end, graph, nullptr, 0,
|
||||
event_end));
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&event_node_start, graph, nullptr, 0, event_start));
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&event_node_end, graph, nullptr, 0, event_end));
|
||||
// Add dependencies between nodes
|
||||
HIP_CHECK(hipGraphAddDependencies(graph, &event_node_start, &memset_A, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph, &memset_A, &event_node_end, 1));
|
||||
@@ -290,7 +286,7 @@ TEST_CASE("Unit_hipGraphAddEventRecordNode_Functional_TimingDisabled") {
|
||||
// Validate hipEventElapsedTime returns error code because timing is
|
||||
// disabled for start and end event nodes.
|
||||
float t;
|
||||
REQUIRE(hipSuccess != hipEventElapsedTime(&t, event_start, event_end));
|
||||
HIP_CHECK_ERROR(hipEventElapsedTime(&t, event_start, event_end), hipErrorInvalidHandle);
|
||||
|
||||
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
||||
HIP_CHECK(hipFree(A_d));
|
||||
@@ -301,44 +297,73 @@ TEST_CASE("Unit_hipGraphAddEventRecordNode_Functional_TimingDisabled") {
|
||||
}
|
||||
|
||||
/**
|
||||
* Scenario 7: All negative tests
|
||||
* Scenario 7: Positive parameter tests
|
||||
*/
|
||||
TEST_CASE("Unit_hipGraphAddEventRecordNode_Negative") {
|
||||
TEST_CASE("Unit_hipGraphAddEventRecordNode_Positive_Parameters") {
|
||||
hipGraph_t graph;
|
||||
HIP_CHECK(hipGraphCreate(&graph, 0));
|
||||
hipEvent_t event;
|
||||
HIP_CHECK(hipEventCreate(&event));
|
||||
hipGraphNode_t eventwait;
|
||||
SECTION("pGraphNode = nullptr") {
|
||||
REQUIRE(hipErrorInvalidValue == hipGraphAddEventRecordNode(nullptr,
|
||||
graph, nullptr, 0, event));
|
||||
hipGraphNode_t eventrec;
|
||||
|
||||
hipGraphNode_t dep_node = nullptr;
|
||||
hipGraphNode_t dep_node2 = nullptr;
|
||||
HIP_CHECK(hipGraphAddEmptyNode(&dep_node, graph, nullptr, 0));
|
||||
HIP_CHECK(hipGraphAddEmptyNode(&dep_node2, graph, nullptr, 0));
|
||||
hipGraphNode_t dep_nodes[] = {dep_node, dep_node2};
|
||||
|
||||
size_t numDeps = 0;
|
||||
SECTION("numDependencies is zero, dependencies is not nullptr") {
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, dep_nodes, 0, event));
|
||||
HIP_CHECK(hipGraphNodeGetDependencies(eventrec, nullptr, &numDeps));
|
||||
REQUIRE(numDeps == 0);
|
||||
}
|
||||
|
||||
SECTION("graph = nullptr") {
|
||||
REQUIRE(hipErrorInvalidValue == hipGraphAddEventRecordNode(&eventwait,
|
||||
nullptr, nullptr, 0, event));
|
||||
SECTION("numDependencies < dependencies length") {
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, dep_nodes, 1, event));
|
||||
HIP_CHECK(hipGraphNodeGetDependencies(eventrec, nullptr, &numDeps));
|
||||
REQUIRE(numDeps == 1);
|
||||
}
|
||||
|
||||
SECTION("pDependencies = nullptr and numDependencies != 0") {
|
||||
REQUIRE(hipErrorInvalidValue == hipGraphAddEventRecordNode(&eventwait,
|
||||
graph, nullptr, 1, event));
|
||||
}
|
||||
|
||||
SECTION("event = nullptr") {
|
||||
REQUIRE(hipErrorInvalidValue == hipGraphAddEventRecordNode(&eventwait,
|
||||
graph, nullptr, 0, nullptr));
|
||||
}
|
||||
|
||||
SECTION("graph is uninitialized") {
|
||||
hipGraph_t graph_uninit{};
|
||||
REQUIRE(hipErrorInvalidValue == hipGraphAddEventRecordNode(&eventwait,
|
||||
graph_uninit, nullptr, 0, nullptr));
|
||||
}
|
||||
|
||||
SECTION("event is uninitialized") {
|
||||
hipEvent_t event_uninit{};
|
||||
REQUIRE(hipErrorInvalidValue == hipGraphAddEventRecordNode(&eventwait,
|
||||
graph, nullptr, 0, event_uninit));
|
||||
SECTION("numDependencies == dependencies length") {
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, dep_nodes, 2, event));
|
||||
HIP_CHECK(hipGraphNodeGetDependencies(eventrec, nullptr, &numDeps));
|
||||
REQUIRE(numDeps == 2);
|
||||
}
|
||||
|
||||
HIP_CHECK(hipGraphDestroy(graph));
|
||||
HIP_CHECK(hipEventDestroy(event));
|
||||
}
|
||||
|
||||
/**
|
||||
* Scenario 8: All negative tests
|
||||
*/
|
||||
TEST_CASE("Unit_hipGraphAddEventRecordNode_Negative") {
|
||||
using namespace std::placeholders;
|
||||
hipGraph_t graph;
|
||||
HIP_CHECK(hipGraphCreate(&graph, 0));
|
||||
hipEvent_t event;
|
||||
HIP_CHECK(hipEventCreate(&event));
|
||||
hipGraphNode_t eventrec;
|
||||
|
||||
GraphAddNodeCommonNegativeTests(std::bind(hipGraphAddEventRecordNode, _1, _2, _3, _4, event),
|
||||
graph);
|
||||
|
||||
SECTION("event = nullptr") {
|
||||
HIP_CHECK_ERROR(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, nullptr),
|
||||
hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("graph is uninitialized") {
|
||||
hipGraph_t graph_uninit{};
|
||||
HIP_CHECK_ERROR(hipGraphAddEventRecordNode(&eventrec, graph_uninit, nullptr, 0, event),
|
||||
hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("event is uninitialized") {
|
||||
hipEvent_t event_uninit{};
|
||||
HIP_CHECK_ERROR(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event_uninit),
|
||||
hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
HIP_CHECK(hipGraphDestroy(graph));
|
||||
|
||||
@@ -32,20 +32,25 @@ both graphs.
|
||||
(100 times).
|
||||
4) Execute scenario 2 with stream1 = stream2.
|
||||
5) Repeat scenario 2 for different event flags.
|
||||
6) Negative Scenarios
|
||||
6) Validate that no error is reported when numDeps <= dependencies length
|
||||
7) Negative Scenarios
|
||||
- Pass input node parameter as nullptr.
|
||||
- Pass input graph parameter as nullptr.
|
||||
- Pass input dependency parameter as nullptr.
|
||||
- Node in dependency is from different graph
|
||||
- Invalid numNodes
|
||||
- Duplicate node in dependencies
|
||||
- Pass input event parameter as nullptr.
|
||||
- Pass uninitialized input graph parameter.
|
||||
- Pass uninitialized input event parameter.
|
||||
*/
|
||||
#include <functional>
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip_test_checkers.hh>
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip_test_kernels.hh>
|
||||
|
||||
#define LEN 512
|
||||
#include "graph_tests_common.hh"
|
||||
|
||||
/**
|
||||
* Scenario 1
|
||||
@@ -60,13 +65,10 @@ TEST_CASE("Unit_hipGraphAddEventWaitNode_Functional_Simple") {
|
||||
HIP_CHECK(hipEventCreate(&event));
|
||||
hipGraphNode_t event_rec_node, event_wait_node;
|
||||
// Create an event record node in graph
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&event_rec_node, graph, nullptr, 0,
|
||||
event));
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&event_rec_node, graph, nullptr, 0, event));
|
||||
// Create an event wait node in graph
|
||||
HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph, nullptr, 0,
|
||||
event));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph, &event_rec_node,
|
||||
&event_wait_node, 1));
|
||||
HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph, nullptr, 0, event));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph, &event_rec_node, &event_wait_node, 1));
|
||||
// Instantiate and launch the graph
|
||||
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
||||
HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
|
||||
@@ -80,13 +82,14 @@ TEST_CASE("Unit_hipGraphAddEventWaitNode_Functional_Simple") {
|
||||
/**
|
||||
* Local Function
|
||||
*/
|
||||
static void validate_hipGraphAddEventWaitNode_internodedep(int test,
|
||||
int nstep, unsigned flag = hipEventDefault) {
|
||||
size_t memsize = LEN * sizeof(int);
|
||||
static void validate_hipGraphAddEventWaitNode_internodedep(int test, int nstep,
|
||||
unsigned flag = hipEventDefault) {
|
||||
constexpr size_t N = 1024;
|
||||
size_t memsize = N * sizeof(int);
|
||||
constexpr auto blocksPerCU = 6; // to hide latency
|
||||
constexpr auto threadsPerBlock = 256;
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, LEN);
|
||||
size_t NElem{LEN};
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
||||
size_t NElem{N};
|
||||
hipGraph_t graph1, graph2;
|
||||
hipStream_t streamForGraph1, streamForGraph2;
|
||||
hipGraphExec_t graphExec1, graphExec2;
|
||||
@@ -114,68 +117,57 @@ static void validate_hipGraphAddEventWaitNode_internodedep(int test,
|
||||
HIP_CHECK(hipMalloc(&out_d_g1, memsize));
|
||||
HIP_CHECK(hipMalloc(&out_d_g2, memsize));
|
||||
// Initialize host buffer
|
||||
for (uint32_t i = 0; i < LEN; i++) {
|
||||
for (uint32_t i = 0; i < N; i++) {
|
||||
inp_h[i] = i;
|
||||
out_h_g1[i] = 0;
|
||||
out_h_g2[i] = 0;
|
||||
}
|
||||
// Graph1 creation ...........
|
||||
// Create event1 record node in graph1
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&event_rec_node, graph1, nullptr, 0,
|
||||
event1));
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&event_rec_node, graph1, nullptr, 0, event1));
|
||||
|
||||
// Create memcpy and kernel nodes for graph1
|
||||
hipGraphNode_t memcpyH2D, memcpyD2H_1, kernelnode_1;
|
||||
hipKernelNodeParams kernelNodeParams1{};
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph1, nullptr, 0, inp_d,
|
||||
inp_h, memsize, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_1, graph1, nullptr, 0,
|
||||
out_h_g1, out_d_g1, memsize, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph1, nullptr, 0, inp_d, inp_h, memsize,
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_1, graph1, nullptr, 0, out_h_g1, out_d_g1, memsize,
|
||||
hipMemcpyDeviceToHost));
|
||||
|
||||
void* kernelArgs1[] = {&inp_d, &out_d_g1, reinterpret_cast<void *>(&NElem)};
|
||||
kernelNodeParams1.func =
|
||||
reinterpret_cast<void *>(HipTest::vector_square<int>);
|
||||
void* kernelArgs1[] = {&inp_d, &out_d_g1, reinterpret_cast<void*>(&NElem)};
|
||||
kernelNodeParams1.func = reinterpret_cast<void*>(HipTest::vector_square<int>);
|
||||
kernelNodeParams1.gridDim = dim3(blocks);
|
||||
kernelNodeParams1.blockDim = dim3(threadsPerBlock);
|
||||
kernelNodeParams1.sharedMemBytes = 0;
|
||||
kernelNodeParams1.kernelParams = reinterpret_cast<void**>(kernelArgs1);
|
||||
kernelNodeParams1.extra = nullptr;
|
||||
HIP_CHECK(hipGraphAddKernelNode(&kernelnode_1, graph1, nullptr, 0,
|
||||
&kernelNodeParams1));
|
||||
HIP_CHECK(hipGraphAddKernelNode(&kernelnode_1, graph1, nullptr, 0, &kernelNodeParams1));
|
||||
// Create dependencies for graph1
|
||||
HIP_CHECK(hipGraphAddDependencies(graph1, &memcpyH2D,
|
||||
&event_rec_node, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph1, &event_rec_node,
|
||||
&kernelnode_1, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph1, &kernelnode_1,
|
||||
&memcpyD2H_1, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph1, &memcpyH2D, &event_rec_node, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph1, &event_rec_node, &kernelnode_1, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph1, &kernelnode_1, &memcpyD2H_1, 1));
|
||||
|
||||
// Graph2 creation ...........
|
||||
// Create event1 wait node in graph2
|
||||
HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph2, nullptr, 0,
|
||||
event1));
|
||||
HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph2, nullptr, 0, event1));
|
||||
|
||||
// Create memcpy and kernel nodes for graph2
|
||||
hipGraphNode_t memcpyD2H_2, kernelnode_2;
|
||||
hipKernelNodeParams kernelNodeParams2{};
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_2, graph2, nullptr, 0,
|
||||
out_h_g2, out_d_g2, memsize, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_2, graph2, nullptr, 0, out_h_g2, out_d_g2, memsize,
|
||||
hipMemcpyDeviceToHost));
|
||||
|
||||
void* kernelArgs2[] = {&inp_d, &out_d_g2, reinterpret_cast<void *>(&NElem)};
|
||||
kernelNodeParams2.func =
|
||||
reinterpret_cast<void *>(HipTest::vector_cubic<int>);
|
||||
void* kernelArgs2[] = {&inp_d, &out_d_g2, reinterpret_cast<void*>(&NElem)};
|
||||
kernelNodeParams2.func = reinterpret_cast<void*>(HipTest::vector_cubic<int>);
|
||||
kernelNodeParams2.gridDim = dim3(blocks);
|
||||
kernelNodeParams2.blockDim = dim3(threadsPerBlock);
|
||||
kernelNodeParams2.sharedMemBytes = 0;
|
||||
kernelNodeParams2.kernelParams = reinterpret_cast<void**>(kernelArgs2);
|
||||
kernelNodeParams2.extra = nullptr;
|
||||
HIP_CHECK(hipGraphAddKernelNode(&kernelnode_2, graph2, nullptr, 0,
|
||||
&kernelNodeParams2));
|
||||
HIP_CHECK(hipGraphAddKernelNode(&kernelnode_2, graph2, nullptr, 0, &kernelNodeParams2));
|
||||
// Create dependencies for graph2
|
||||
HIP_CHECK(hipGraphAddDependencies(graph2, &event_wait_node,
|
||||
&kernelnode_2, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph2, &kernelnode_2,
|
||||
&memcpyD2H_2, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph2, &event_wait_node, &kernelnode_2, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph2, &kernelnode_2, &memcpyD2H_2, 1));
|
||||
|
||||
// Instantiate and launch the graphs
|
||||
HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0));
|
||||
@@ -187,16 +179,16 @@ static void validate_hipGraphAddEventWaitNode_internodedep(int test,
|
||||
HIP_CHECK(hipStreamSynchronize(streamForGraph2));
|
||||
// Validate output
|
||||
bool btestPassed1 = true;
|
||||
for (uint32_t i = 0; i < LEN; i++) {
|
||||
if (out_h_g1[i] != (inp_h[i]*inp_h[i])) {
|
||||
for (uint32_t i = 0; i < N; i++) {
|
||||
if (out_h_g1[i] != (inp_h[i] * inp_h[i])) {
|
||||
btestPassed1 = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
REQUIRE(btestPassed1 == true);
|
||||
bool btestPassed2 = true;
|
||||
for (uint32_t i = 0; i < LEN; i++) {
|
||||
if (out_h_g2[i] != (inp_h[i]*inp_h[i]*inp_h[i])) {
|
||||
for (uint32_t i = 0; i < N; i++) {
|
||||
if (out_h_g2[i] != (inp_h[i] * inp_h[i] * inp_h[i])) {
|
||||
btestPassed2 = false;
|
||||
break;
|
||||
}
|
||||
@@ -247,55 +239,81 @@ TEST_CASE("Unit_hipGraphAddEventWaitNode_MultGraphOneStrmDependency") {
|
||||
*/
|
||||
TEST_CASE("Unit_hipGraphAddEventWaitNode_differentFlags") {
|
||||
SECTION("flag = hipEventBlockingSync") {
|
||||
validate_hipGraphAddEventWaitNode_internodedep(0, 1,
|
||||
hipEventBlockingSync);
|
||||
validate_hipGraphAddEventWaitNode_internodedep(0, 1, hipEventBlockingSync);
|
||||
}
|
||||
SECTION("graph = hipEventDisableTiming") {
|
||||
validate_hipGraphAddEventWaitNode_internodedep(0, 1,
|
||||
hipEventDisableTiming);
|
||||
validate_hipGraphAddEventWaitNode_internodedep(0, 1, hipEventDisableTiming);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Scenario 6
|
||||
* Scenario 6: Positive parameter tests
|
||||
*/
|
||||
TEST_CASE("Unit_hipGraphAddEventWaitNode_Negative") {
|
||||
TEST_CASE("Unit_hipGraphAddEventWaitNode_Positive_Parameters") {
|
||||
hipGraph_t graph;
|
||||
HIP_CHECK(hipGraphCreate(&graph, 0));
|
||||
hipEvent_t event;
|
||||
HIP_CHECK(hipEventCreate(&event));
|
||||
hipGraphNode_t eventwait;
|
||||
|
||||
SECTION("pGraphNode = nullptr") {
|
||||
REQUIRE(hipErrorInvalidValue == hipGraphAddEventWaitNode(nullptr,
|
||||
graph, nullptr, 0, event));
|
||||
hipGraphNode_t dep_node = nullptr;
|
||||
hipGraphNode_t dep_node2 = nullptr;
|
||||
HIP_CHECK(hipGraphAddEmptyNode(&dep_node, graph, nullptr, 0));
|
||||
HIP_CHECK(hipGraphAddEmptyNode(&dep_node2, graph, nullptr, 0));
|
||||
hipGraphNode_t dep_nodes[] = {dep_node, dep_node2};
|
||||
|
||||
size_t numDeps = 0;
|
||||
SECTION("numDependencies is zero, dependencies is not nullptr") {
|
||||
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, dep_nodes, 0, event));
|
||||
HIP_CHECK(hipGraphNodeGetDependencies(eventwait, nullptr, &numDeps));
|
||||
REQUIRE(numDeps == 0);
|
||||
}
|
||||
|
||||
SECTION("graph = nullptr") {
|
||||
REQUIRE(hipErrorInvalidValue == hipGraphAddEventWaitNode(&eventwait,
|
||||
nullptr, nullptr, 0, event));
|
||||
SECTION("numDependencies < dependencies length") {
|
||||
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, dep_nodes, 1, event));
|
||||
HIP_CHECK(hipGraphNodeGetDependencies(eventwait, nullptr, &numDeps));
|
||||
REQUIRE(numDeps == 1);
|
||||
}
|
||||
|
||||
SECTION("pDependencies = nullptr") {
|
||||
REQUIRE(hipErrorInvalidValue == hipGraphAddEventWaitNode(&eventwait,
|
||||
graph, nullptr, 1, event));
|
||||
}
|
||||
|
||||
SECTION("event = nullptr") {
|
||||
REQUIRE(hipErrorInvalidValue == hipGraphAddEventWaitNode(&eventwait,
|
||||
graph, nullptr, 0, nullptr));
|
||||
}
|
||||
|
||||
SECTION("graph is uninitialized") {
|
||||
hipGraph_t graph_uninit{};
|
||||
REQUIRE(hipErrorInvalidValue == hipGraphAddEventWaitNode(&eventwait,
|
||||
graph_uninit, nullptr, 0, event));
|
||||
}
|
||||
|
||||
SECTION("event is uninitialized") {
|
||||
hipEvent_t event_uninit{};
|
||||
REQUIRE(hipErrorInvalidValue == hipGraphAddEventWaitNode(&eventwait,
|
||||
graph, nullptr, 0, event_uninit));
|
||||
SECTION("numDependencies == dependencies length") {
|
||||
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, dep_nodes, 2, event));
|
||||
HIP_CHECK(hipGraphNodeGetDependencies(eventwait, nullptr, &numDeps));
|
||||
REQUIRE(numDeps == 2);
|
||||
}
|
||||
|
||||
HIP_CHECK(hipGraphDestroy(graph));
|
||||
HIP_CHECK(hipEventDestroy(event));
|
||||
}
|
||||
|
||||
/**
|
||||
* Scenario 7
|
||||
*/
|
||||
TEST_CASE("Unit_hipGraphAddEventWaitNode_Negative") {
|
||||
using namespace std::placeholders;
|
||||
hipGraph_t graph;
|
||||
HIP_CHECK(hipGraphCreate(&graph, 0));
|
||||
hipEvent_t event;
|
||||
HIP_CHECK(hipEventCreate(&event));
|
||||
hipGraphNode_t eventwait;
|
||||
|
||||
GraphAddNodeCommonNegativeTests(std::bind(hipGraphAddEventWaitNode, _1, _2, _3, _4, event),
|
||||
graph);
|
||||
|
||||
SECTION("event = nullptr") {
|
||||
HIP_CHECK_ERROR(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, nullptr),
|
||||
hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("graph is uninitialized") {
|
||||
hipGraph_t graph_uninit{};
|
||||
HIP_CHECK_ERROR(hipGraphAddEventWaitNode(&eventwait, graph_uninit, nullptr, 0, event),
|
||||
hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("event is uninitialized") {
|
||||
hipEvent_t event_uninit{};
|
||||
HIP_CHECK_ERROR(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event_uninit),
|
||||
hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
HIP_CHECK(hipGraphDestroy(graph));
|
||||
|
||||
@@ -26,11 +26,12 @@ with the event set in hipGraphAddEventRecordNode.
|
||||
- Output event is a nullptr.
|
||||
- Input node is an empty node.
|
||||
- Input node is a memset node.
|
||||
- Input node is an event wait node.
|
||||
- Input node is an uninitialized node.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip_test_checkers.hh>
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip_test_kernels.hh>
|
||||
|
||||
/**
|
||||
@@ -42,8 +43,7 @@ static void validateEventRecordNodeGetEvent(unsigned flag) {
|
||||
hipEvent_t event, event_out;
|
||||
HIP_CHECK(hipEventCreateWithFlags(&event, flag));
|
||||
hipGraphNode_t eventrec;
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0,
|
||||
event));
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event));
|
||||
HIP_CHECK(hipGraphEventRecordNodeGetEvent(eventrec, &event_out));
|
||||
// validate set event and get event are same
|
||||
REQUIRE(event == event_out);
|
||||
@@ -77,31 +77,32 @@ TEST_CASE("Unit_hipGraphEventRecordNodeGetEvent_Functional") {
|
||||
TEST_CASE("Unit_hipGraphEventRecordNodeGetEvent_Negative") {
|
||||
hipGraph_t graph;
|
||||
HIP_CHECK(hipGraphCreate(&graph, 0));
|
||||
hipEvent_t event, event_out;
|
||||
HIP_CHECK(hipEventCreate(&event));
|
||||
hipGraphNode_t eventrec;
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0,
|
||||
event));
|
||||
hipEvent_t event_out;
|
||||
hipEvent_t event1, event2;
|
||||
HIP_CHECK(hipEventCreate(&event1));
|
||||
HIP_CHECK(hipEventCreate(&event2));
|
||||
hipGraphNode_t eventrec, eventwait;
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1));
|
||||
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event2));
|
||||
|
||||
SECTION("node = nullptr") {
|
||||
REQUIRE(hipErrorInvalidValue == hipGraphEventRecordNodeGetEvent(nullptr,
|
||||
&event_out));
|
||||
HIP_CHECK_ERROR(hipGraphEventRecordNodeGetEvent(nullptr, &event_out), hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("event_out = nullptr") {
|
||||
REQUIRE(hipErrorInvalidValue == hipGraphEventRecordNodeGetEvent(eventrec,
|
||||
nullptr));
|
||||
HIP_CHECK_ERROR(hipGraphEventRecordNodeGetEvent(eventrec, nullptr), hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("input node is empty node") {
|
||||
hipGraphNode_t EmptyGraphNode;
|
||||
HIP_CHECK(hipGraphAddEmptyNode(&EmptyGraphNode, graph, nullptr, 0));
|
||||
REQUIRE(hipErrorInvalidValue ==
|
||||
hipGraphEventRecordNodeGetEvent(EmptyGraphNode, &event_out));
|
||||
HIP_CHECK_ERROR(hipGraphEventRecordNodeGetEvent(EmptyGraphNode, &event_out),
|
||||
hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("input node is memset node") {
|
||||
constexpr size_t Nbytes = 1024;
|
||||
char *A_d;
|
||||
char* A_d;
|
||||
hipGraphNode_t memset_A;
|
||||
hipMemsetParams memsetParams{};
|
||||
HIP_CHECK(hipMalloc(&A_d, Nbytes));
|
||||
@@ -112,19 +113,21 @@ TEST_CASE("Unit_hipGraphEventRecordNodeGetEvent_Negative") {
|
||||
memsetParams.elementSize = sizeof(char);
|
||||
memsetParams.width = Nbytes;
|
||||
memsetParams.height = 1;
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0,
|
||||
&memsetParams));
|
||||
REQUIRE(hipErrorInvalidValue ==
|
||||
hipGraphEventRecordNodeGetEvent(memset_A, &event_out));
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, &memsetParams));
|
||||
HIP_CHECK_ERROR(hipGraphEventRecordNodeGetEvent(memset_A, &event_out), hipErrorInvalidValue);
|
||||
HIP_CHECK(hipFree(A_d));
|
||||
}
|
||||
|
||||
SECTION("input node is event wait node") {
|
||||
HIP_CHECK_ERROR(hipGraphEventRecordNodeGetEvent(eventwait, &event_out), hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("input node is uninitialized node") {
|
||||
hipGraphNode_t node_unit{};
|
||||
REQUIRE(hipErrorInvalidValue ==
|
||||
hipGraphEventRecordNodeGetEvent(node_unit, &event_out));
|
||||
HIP_CHECK_ERROR(hipGraphEventRecordNodeGetEvent(node_unit, &event_out), hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
HIP_CHECK(hipGraphDestroy(graph));
|
||||
HIP_CHECK(hipEventDestroy(event));
|
||||
HIP_CHECK(hipEventDestroy(event1));
|
||||
HIP_CHECK(hipEventDestroy(event2));
|
||||
}
|
||||
|
||||
@@ -30,14 +30,16 @@ Testcase Scenarios :
|
||||
- Input event parameter is nullptr.
|
||||
- Empty node is passed as input node.
|
||||
- Memset node is passed as input node.
|
||||
- Event wait node is passed as input node.
|
||||
- Input node is an uninitialized node.
|
||||
- Input event is an uninitialized event.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip_test_checkers.hh>
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip_test_kernels.hh>
|
||||
|
||||
|
||||
/**
|
||||
* Local Function: Set Get test
|
||||
*/
|
||||
@@ -49,8 +51,7 @@ static void validateEventRecordNodeSetEvent(unsigned flag) {
|
||||
HIP_CHECK(hipEventCreate(&event1));
|
||||
HIP_CHECK(hipEventCreateWithFlags(&event2, flag));
|
||||
hipGraphNode_t eventrec;
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0,
|
||||
event1));
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1));
|
||||
// Set a different event
|
||||
HIP_CHECK(hipGraphEventRecordNodeSetEvent(eventrec, event2));
|
||||
HIP_CHECK(hipGraphEventRecordNodeGetEvent(eventrec, &event_out));
|
||||
@@ -73,11 +74,9 @@ static void setEventWaitNode() {
|
||||
HIP_CHECK(hipEventCreate(&event1));
|
||||
HIP_CHECK(hipEventCreate(&event2));
|
||||
hipGraphNode_t eventwait;
|
||||
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0,
|
||||
event1));
|
||||
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event1));
|
||||
// Try to set a different event on the event wait node using hipGraphEventRecordNodeSetEvent; this must fail
|
||||
REQUIRE(hipErrorInvalidValue ==
|
||||
hipGraphEventRecordNodeSetEvent(eventwait, event2));
|
||||
HIP_CHECK_ERROR(hipGraphEventRecordNodeSetEvent(eventwait, event2), hipErrorInvalidValue);
|
||||
// Free resources
|
||||
HIP_CHECK(hipGraphDestroy(graph));
|
||||
HIP_CHECK(hipEventDestroy(event1));
|
||||
@@ -98,13 +97,11 @@ TEST_CASE("Unit_hipGraphEventRecordNodeSetEvent_SetEventProperty") {
|
||||
HIP_CHECK(hipEventCreateWithFlags(&event2_end, hipEventDisableTiming));
|
||||
// Create nodes
|
||||
hipGraphNode_t event_start_rec, event_end_rec;
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&event_start_rec, graph, nullptr, 0,
|
||||
event1_start));
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&event_end_rec, graph, nullptr, 0,
|
||||
event1_end));
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&event_start_rec, graph, nullptr, 0, event1_start));
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&event_end_rec, graph, nullptr, 0, event1_end));
|
||||
// Create memset node
|
||||
constexpr size_t Nbytes = 1024;
|
||||
char *A_d;
|
||||
char* A_d;
|
||||
hipGraphNode_t memset_A;
|
||||
hipMemsetParams memsetParams{};
|
||||
HIP_CHECK(hipMalloc(&A_d, Nbytes));
|
||||
@@ -115,8 +112,7 @@ TEST_CASE("Unit_hipGraphEventRecordNodeSetEvent_SetEventProperty") {
|
||||
memsetParams.elementSize = sizeof(char);
|
||||
memsetParams.width = Nbytes;
|
||||
memsetParams.height = 1;
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0,
|
||||
&memsetParams));
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, &memsetParams));
|
||||
// Create dependencies
|
||||
// event_start_rec --> memset_A --> event_end_rec
|
||||
HIP_CHECK(hipGraphAddDependencies(graph, &event_start_rec, &memset_A, 1));
|
||||
@@ -132,8 +128,7 @@ TEST_CASE("Unit_hipGraphEventRecordNodeSetEvent_SetEventProperty") {
|
||||
// Validate by measuring time difference between event_end_rec &
|
||||
// event_start_rec
|
||||
float t = 0.0f;
|
||||
REQUIRE(hipSuccess == hipEventElapsedTime(&t, event1_start,
|
||||
event1_end));
|
||||
REQUIRE(hipSuccess == hipEventElapsedTime(&t, event1_start, event1_end));
|
||||
REQUIRE(t > 0.0f);
|
||||
// Change the event property after instantiation
|
||||
HIP_CHECK(hipGraphEventRecordNodeSetEvent(event_start_rec, event2_start));
|
||||
@@ -145,8 +140,7 @@ TEST_CASE("Unit_hipGraphEventRecordNodeSetEvent_SetEventProperty") {
|
||||
// hipErrorInvalidHandle when events are created using
|
||||
// hipEventDisableTiming flag.
|
||||
t = 0.0f;
|
||||
REQUIRE(hipErrorInvalidHandle ==
|
||||
hipEventElapsedTime(&t, event2_start, event2_end));
|
||||
HIP_CHECK_ERROR(hipEventElapsedTime(&t, event2_start, event2_end), hipErrorInvalidHandle);
|
||||
// Free resources
|
||||
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
||||
HIP_CHECK(hipStreamDestroy(streamForGraph));
|
||||
@@ -185,28 +179,24 @@ TEST_CASE("Unit_hipGraphEventRecordNodeSetEvent_Negative") {
|
||||
HIP_CHECK(hipEventCreate(&event1));
|
||||
HIP_CHECK(hipEventCreate(&event2));
|
||||
hipGraphNode_t eventrec;
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0,
|
||||
event1));
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1));
|
||||
SECTION("node = nullptr") {
|
||||
REQUIRE(hipErrorInvalidValue == hipGraphEventRecordNodeSetEvent(nullptr,
|
||||
event2));
|
||||
HIP_CHECK_ERROR(hipGraphEventRecordNodeSetEvent(nullptr, event2), hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("event_out = nullptr") {
|
||||
REQUIRE(hipErrorInvalidValue == hipGraphEventRecordNodeSetEvent(eventrec,
|
||||
nullptr));
|
||||
HIP_CHECK_ERROR(hipGraphEventRecordNodeSetEvent(eventrec, nullptr), hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("input node is empty node") {
|
||||
hipGraphNode_t EmptyGraphNode;
|
||||
HIP_CHECK(hipGraphAddEmptyNode(&EmptyGraphNode, graph, nullptr, 0));
|
||||
REQUIRE(hipErrorInvalidValue ==
|
||||
hipGraphEventRecordNodeSetEvent(EmptyGraphNode, event2));
|
||||
HIP_CHECK_ERROR(hipGraphEventRecordNodeSetEvent(EmptyGraphNode, event2), hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("input node is memset node") {
|
||||
constexpr size_t Nbytes = 1024;
|
||||
char *A_d;
|
||||
char* A_d;
|
||||
hipGraphNode_t memset_A;
|
||||
hipMemsetParams memsetParams{};
|
||||
HIP_CHECK(hipMalloc(&A_d, Nbytes));
|
||||
@@ -217,10 +207,8 @@ TEST_CASE("Unit_hipGraphEventRecordNodeSetEvent_Negative") {
|
||||
memsetParams.elementSize = sizeof(char);
|
||||
memsetParams.width = Nbytes;
|
||||
memsetParams.height = 1;
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0,
|
||||
&memsetParams));
|
||||
REQUIRE(hipErrorInvalidValue ==
|
||||
hipGraphEventRecordNodeSetEvent(memset_A, event2));
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, &memsetParams));
|
||||
HIP_CHECK_ERROR(hipGraphEventRecordNodeSetEvent(memset_A, event2), hipErrorInvalidValue);
|
||||
HIP_CHECK(hipFree(A_d));
|
||||
}
|
||||
|
||||
@@ -230,14 +218,12 @@ TEST_CASE("Unit_hipGraphEventRecordNodeSetEvent_Negative") {
|
||||
|
||||
SECTION("input node is uninitialized node") {
|
||||
hipGraphNode_t node_uninit{};
|
||||
REQUIRE(hipErrorInvalidValue ==
|
||||
hipGraphEventRecordNodeSetEvent(node_uninit, event2));
|
||||
HIP_CHECK_ERROR(hipGraphEventRecordNodeSetEvent(node_uninit, event2), hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("input event is uninitialized") {
|
||||
hipEvent_t event_uninit{};
|
||||
REQUIRE(hipErrorInvalidValue ==
|
||||
hipGraphEventRecordNodeSetEvent(eventrec, event_uninit));
|
||||
HIP_CHECK_ERROR(hipGraphEventRecordNodeSetEvent(eventrec, event_uninit), hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
HIP_CHECK(hipGraphDestroy(graph));
|
||||
|
||||
@@ -26,13 +26,15 @@ with the event set in hipGraphAddEventWaitNode.
|
||||
- Output event parameter is passed as nullptr.
|
||||
- Input node parameter is an empty node.
|
||||
- Input node parameter is a memset node.
|
||||
- Input node parameter is an event record node.
|
||||
- Input node parameter is an uninitialized node.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip_test_checkers.hh>
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip_test_kernels.hh>
|
||||
|
||||
|
||||
/**
|
||||
* Local Function
|
||||
*/
|
||||
@@ -42,8 +44,7 @@ static void validateEventWaitNodeGetEvent(unsigned flag) {
|
||||
hipEvent_t event, event_out;
|
||||
HIP_CHECK(hipEventCreateWithFlags(&event, flag));
|
||||
hipGraphNode_t eventwait;
|
||||
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0,
|
||||
event));
|
||||
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event));
|
||||
HIP_CHECK(hipGraphEventWaitNodeGetEvent(eventwait, &event_out));
|
||||
// validate set event and get event are same
|
||||
REQUIRE(event == event_out);
|
||||
@@ -77,31 +78,32 @@ TEST_CASE("Unit_hipGraphEventWaitNodeGetEvent_Functional") {
|
||||
TEST_CASE("Unit_hipGraphEventWaitNodeGetEvent_Negative") {
|
||||
hipGraph_t graph;
|
||||
HIP_CHECK(hipGraphCreate(&graph, 0));
|
||||
hipEvent_t event, event_out;
|
||||
HIP_CHECK(hipEventCreate(&event));
|
||||
hipGraphNode_t eventwait;
|
||||
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0,
|
||||
event));
|
||||
hipEvent_t event_out;
|
||||
hipEvent_t event1, event2;
|
||||
HIP_CHECK(hipEventCreate(&event1));
|
||||
HIP_CHECK(hipEventCreate(&event2));
|
||||
hipGraphNode_t eventrec, eventwait;
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1));
|
||||
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event2));
|
||||
|
||||
SECTION("node = nullptr") {
|
||||
REQUIRE(hipErrorInvalidValue == hipGraphEventWaitNodeGetEvent(nullptr,
|
||||
&event_out));
|
||||
HIP_CHECK_ERROR(hipGraphEventWaitNodeGetEvent(nullptr, &event_out), hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("event_out = nullptr") {
|
||||
REQUIRE(hipErrorInvalidValue == hipGraphEventWaitNodeGetEvent(eventwait,
|
||||
nullptr));
|
||||
HIP_CHECK_ERROR(hipGraphEventWaitNodeGetEvent(eventwait, nullptr), hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("input node is empty node") {
|
||||
hipGraphNode_t EmptyGraphNode;
|
||||
HIP_CHECK(hipGraphAddEmptyNode(&EmptyGraphNode, graph, nullptr, 0));
|
||||
REQUIRE(hipErrorInvalidValue ==
|
||||
hipGraphEventWaitNodeGetEvent(EmptyGraphNode, &event_out));
|
||||
HIP_CHECK_ERROR(hipGraphEventWaitNodeGetEvent(EmptyGraphNode, &event_out),
|
||||
hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("input node is memset node") {
|
||||
constexpr size_t Nbytes = 1024;
|
||||
char *A_d;
|
||||
char* A_d;
|
||||
hipGraphNode_t memset_A;
|
||||
hipMemsetParams memsetParams{};
|
||||
HIP_CHECK(hipMalloc(&A_d, Nbytes));
|
||||
@@ -112,19 +114,21 @@ TEST_CASE("Unit_hipGraphEventWaitNodeGetEvent_Negative") {
|
||||
memsetParams.elementSize = sizeof(char);
|
||||
memsetParams.width = Nbytes;
|
||||
memsetParams.height = 1;
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0,
|
||||
&memsetParams));
|
||||
REQUIRE(hipErrorInvalidValue ==
|
||||
hipGraphEventWaitNodeGetEvent(memset_A, &event_out));
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, &memsetParams));
|
||||
HIP_CHECK_ERROR(hipGraphEventWaitNodeGetEvent(memset_A, &event_out), hipErrorInvalidValue);
|
||||
HIP_CHECK(hipFree(A_d));
|
||||
}
|
||||
|
||||
SECTION("input node is event record node") {
|
||||
HIP_CHECK_ERROR(hipGraphEventWaitNodeGetEvent(eventrec, &event_out), hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("input node is uninitialized") {
|
||||
hipGraphNode_t node_uninit{};
|
||||
REQUIRE(hipErrorInvalidValue ==
|
||||
hipGraphEventWaitNodeGetEvent(node_uninit, &event_out));
|
||||
HIP_CHECK_ERROR(hipGraphEventWaitNodeGetEvent(node_uninit, &event_out), hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
HIP_CHECK(hipGraphDestroy(graph));
|
||||
HIP_CHECK(hipEventDestroy(event));
|
||||
HIP_CHECK(hipEventDestroy(event1));
|
||||
HIP_CHECK(hipEventDestroy(event2));
|
||||
}
|
||||
|
||||
@@ -37,11 +37,10 @@ Testcase Scenarios :
|
||||
- Input event is an uninitialized event.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip_test_checkers.hh>
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip_test_kernels.hh>
|
||||
|
||||
#define LEN 512
|
||||
|
||||
/**
|
||||
* Local Function
|
||||
@@ -54,8 +53,7 @@ static void validateEventWaitNodeSetEvent(unsigned flag) {
|
||||
HIP_CHECK(hipEventCreate(&event1));
|
||||
HIP_CHECK(hipEventCreateWithFlags(&event2, flag));
|
||||
hipGraphNode_t eventwait;
|
||||
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0,
|
||||
event1));
|
||||
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event1));
|
||||
// Set a different event
|
||||
HIP_CHECK(hipGraphEventWaitNodeSetEvent(eventwait, event2));
|
||||
HIP_CHECK(hipGraphEventWaitNodeGetEvent(eventwait, &event_out));
|
||||
@@ -78,11 +76,9 @@ static void setEventRecordNode() {
|
||||
HIP_CHECK(hipEventCreate(&event1));
|
||||
HIP_CHECK(hipEventCreate(&event2));
|
||||
hipGraphNode_t eventrec;
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0,
|
||||
event1));
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1));
|
||||
// Try to set a different event on the event record node using hipGraphEventWaitNodeSetEvent; this must fail
|
||||
REQUIRE(hipErrorInvalidValue ==
|
||||
hipGraphEventWaitNodeSetEvent(eventrec, event2));
|
||||
HIP_CHECK_ERROR(hipGraphEventWaitNodeSetEvent(eventrec, event2), hipErrorInvalidValue);
|
||||
// Free resources
|
||||
HIP_CHECK(hipGraphDestroy(graph));
|
||||
HIP_CHECK(hipEventDestroy(event1));
|
||||
@@ -93,11 +89,12 @@ static void setEventRecordNode() {
|
||||
* Scenario 2
|
||||
*/
|
||||
TEST_CASE("Unit_hipGraphEventWaitNodeSetEvent_SetProp") {
|
||||
size_t memsize = LEN * sizeof(int);
|
||||
constexpr size_t N = 512;
|
||||
size_t memsize = N * sizeof(int);
|
||||
constexpr auto blocksPerCU = 6; // to hide latency
|
||||
constexpr auto threadsPerBlock = 256;
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, LEN);
|
||||
size_t NElem{LEN};
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
||||
size_t NElem{N};
|
||||
hipGraph_t graph1, graph2;
|
||||
hipStream_t streamForGraph1, streamForGraph2;
|
||||
hipGraphExec_t graphExec1, graphExec2;
|
||||
@@ -123,67 +120,56 @@ TEST_CASE("Unit_hipGraphEventWaitNodeSetEvent_SetProp") {
|
||||
HIP_CHECK(hipMalloc(&out_d_g1, memsize));
|
||||
HIP_CHECK(hipMalloc(&out_d_g2, memsize));
|
||||
// Initialize host buffer
|
||||
for (uint32_t i = 0; i < LEN; i++) {
|
||||
for (uint32_t i = 0; i < N; i++) {
|
||||
inp_h[i] = i;
|
||||
out_h_g1[i] = 0;
|
||||
out_h_g2[i] = 0;
|
||||
}
|
||||
// Graph1 creation ...........
|
||||
// Create event1 record node in graph1
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&event_rec_node, graph1, nullptr, 0,
|
||||
event1));
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&event_rec_node, graph1, nullptr, 0, event1));
|
||||
|
||||
// Create memcpy and kernel nodes for graph1
|
||||
hipGraphNode_t memcpyH2D, memcpyD2H_1, kernelnode_1;
|
||||
hipKernelNodeParams kernelNodeParams1{};
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph1, nullptr, 0, inp_d,
|
||||
inp_h, memsize, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_1, graph1, nullptr, 0,
|
||||
out_h_g1, out_d_g1, memsize, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph1, nullptr, 0, inp_d, inp_h, memsize,
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_1, graph1, nullptr, 0, out_h_g1, out_d_g1, memsize,
|
||||
hipMemcpyDeviceToHost));
|
||||
|
||||
void* kernelArgs1[] = {&inp_d, &out_d_g1, reinterpret_cast<void *>(&NElem)};
|
||||
kernelNodeParams1.func =
|
||||
reinterpret_cast<void *>(HipTest::vector_square<int>);
|
||||
void* kernelArgs1[] = {&inp_d, &out_d_g1, reinterpret_cast<void*>(&NElem)};
|
||||
kernelNodeParams1.func = reinterpret_cast<void*>(HipTest::vector_square<int>);
|
||||
kernelNodeParams1.gridDim = dim3(blocks);
|
||||
kernelNodeParams1.blockDim = dim3(threadsPerBlock);
|
||||
kernelNodeParams1.sharedMemBytes = 0;
|
||||
kernelNodeParams1.kernelParams = reinterpret_cast<void**>(kernelArgs1);
|
||||
kernelNodeParams1.extra = nullptr;
|
||||
HIP_CHECK(hipGraphAddKernelNode(&kernelnode_1, graph1, nullptr, 0,
|
||||
&kernelNodeParams1));
|
||||
HIP_CHECK(hipGraphAddKernelNode(&kernelnode_1, graph1, nullptr, 0, &kernelNodeParams1));
|
||||
// Create dependencies for graph1
|
||||
HIP_CHECK(hipGraphAddDependencies(graph1, &memcpyH2D,
|
||||
&event_rec_node, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph1, &event_rec_node,
|
||||
&kernelnode_1, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph1, &kernelnode_1,
|
||||
&memcpyD2H_1, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph1, &memcpyH2D, &event_rec_node, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph1, &event_rec_node, &kernelnode_1, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph1, &kernelnode_1, &memcpyD2H_1, 1));
|
||||
|
||||
// Graph2 creation ...........
|
||||
// Create event1 wait node in graph2
|
||||
HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph2, nullptr, 0,
|
||||
event1));
|
||||
HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph2, nullptr, 0, event1));
|
||||
// Create memcpy and kernel nodes for graph2
|
||||
hipGraphNode_t memcpyD2H_2, kernelnode_2;
|
||||
hipKernelNodeParams kernelNodeParams2{};
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_2, graph2, nullptr, 0,
|
||||
out_h_g2, out_d_g2, memsize, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_2, graph2, nullptr, 0, out_h_g2, out_d_g2, memsize,
|
||||
hipMemcpyDeviceToHost));
|
||||
|
||||
void* kernelArgs2[] = {&inp_d, &out_d_g2, reinterpret_cast<void *>(&NElem)};
|
||||
kernelNodeParams2.func =
|
||||
reinterpret_cast<void *>(HipTest::vector_cubic<int>);
|
||||
void* kernelArgs2[] = {&inp_d, &out_d_g2, reinterpret_cast<void*>(&NElem)};
|
||||
kernelNodeParams2.func = reinterpret_cast<void*>(HipTest::vector_cubic<int>);
|
||||
kernelNodeParams2.gridDim = dim3(blocks);
|
||||
kernelNodeParams2.blockDim = dim3(threadsPerBlock);
|
||||
kernelNodeParams2.sharedMemBytes = 0;
|
||||
kernelNodeParams2.kernelParams = reinterpret_cast<void**>(kernelArgs2);
|
||||
kernelNodeParams2.extra = nullptr;
|
||||
HIP_CHECK(hipGraphAddKernelNode(&kernelnode_2, graph2, nullptr, 0,
|
||||
&kernelNodeParams2));
|
||||
HIP_CHECK(hipGraphAddKernelNode(&kernelnode_2, graph2, nullptr, 0, &kernelNodeParams2));
|
||||
// Create dependencies for graph2
|
||||
HIP_CHECK(hipGraphAddDependencies(graph2, &event_wait_node,
|
||||
&kernelnode_2, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph2, &kernelnode_2,
|
||||
&memcpyD2H_2, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph2, &event_wait_node, &kernelnode_2, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph2, &kernelnode_2, &memcpyD2H_2, 1));
|
||||
|
||||
// Instantiate and launch the graphs
|
||||
HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0));
|
||||
@@ -198,16 +184,16 @@ TEST_CASE("Unit_hipGraphEventWaitNodeSetEvent_SetProp") {
|
||||
HIP_CHECK(hipStreamSynchronize(streamForGraph2));
|
||||
// Validate output
|
||||
bool btestPassed1 = true;
|
||||
for (uint32_t i = 0; i < LEN; i++) {
|
||||
if (out_h_g1[i] != (inp_h[i]*inp_h[i])) {
|
||||
for (uint32_t i = 0; i < N; i++) {
|
||||
if (out_h_g1[i] != (inp_h[i] * inp_h[i])) {
|
||||
btestPassed1 = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
REQUIRE(btestPassed1 == true);
|
||||
bool btestPassed2 = true;
|
||||
for (uint32_t i = 0; i < LEN; i++) {
|
||||
if (out_h_g2[i] != (inp_h[i]*inp_h[i]*inp_h[i])) {
|
||||
for (uint32_t i = 0; i < N; i++) {
|
||||
if (out_h_g2[i] != (inp_h[i] * inp_h[i] * inp_h[i])) {
|
||||
btestPassed2 = false;
|
||||
break;
|
||||
}
|
||||
@@ -256,28 +242,24 @@ TEST_CASE("Unit_hipGraphEventWaitNodeSetEvent_Negative") {
|
||||
HIP_CHECK(hipEventCreate(&event1));
|
||||
HIP_CHECK(hipEventCreate(&event2));
|
||||
hipGraphNode_t eventwait;
|
||||
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0,
|
||||
event1));
|
||||
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event1));
|
||||
SECTION("node = nullptr") {
|
||||
REQUIRE(hipErrorInvalidValue == hipGraphEventWaitNodeSetEvent(
|
||||
nullptr, event2));
|
||||
HIP_CHECK_ERROR(hipGraphEventWaitNodeSetEvent(nullptr, event2), hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("event = nullptr") {
|
||||
REQUIRE(hipErrorInvalidValue == hipGraphEventWaitNodeSetEvent(
|
||||
eventwait, nullptr));
|
||||
HIP_CHECK_ERROR(hipGraphEventWaitNodeSetEvent(eventwait, nullptr), hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("input node is empty node") {
|
||||
hipGraphNode_t EmptyGraphNode;
|
||||
HIP_CHECK(hipGraphAddEmptyNode(&EmptyGraphNode, graph, nullptr, 0));
|
||||
REQUIRE(hipErrorInvalidValue ==
|
||||
hipGraphEventWaitNodeSetEvent(EmptyGraphNode, event2));
|
||||
HIP_CHECK_ERROR(hipGraphEventWaitNodeSetEvent(EmptyGraphNode, event2), hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("input node is memset node") {
|
||||
constexpr size_t Nbytes = 1024;
|
||||
char *A_d;
|
||||
char* A_d;
|
||||
hipGraphNode_t memset_A;
|
||||
hipMemsetParams memsetParams{};
|
||||
HIP_CHECK(hipMalloc(&A_d, Nbytes));
|
||||
@@ -288,10 +270,8 @@ TEST_CASE("Unit_hipGraphEventWaitNodeSetEvent_Negative") {
|
||||
memsetParams.elementSize = sizeof(char);
|
||||
memsetParams.width = Nbytes;
|
||||
memsetParams.height = 1;
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0,
|
||||
&memsetParams));
|
||||
REQUIRE(hipErrorInvalidValue ==
|
||||
hipGraphEventWaitNodeSetEvent(memset_A, event2));
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, &memsetParams));
|
||||
HIP_CHECK_ERROR(hipGraphEventWaitNodeSetEvent(memset_A, event2), hipErrorInvalidValue);
|
||||
HIP_CHECK(hipFree(A_d));
|
||||
}
|
||||
|
||||
@@ -301,14 +281,12 @@ TEST_CASE("Unit_hipGraphEventWaitNodeSetEvent_Negative") {
|
||||
|
||||
SECTION("input node is uninitialized node") {
|
||||
hipGraphNode_t node_uninit{};
|
||||
REQUIRE(hipErrorInvalidValue ==
|
||||
hipGraphEventWaitNodeSetEvent(node_uninit, event2));
|
||||
HIP_CHECK_ERROR(hipGraphEventWaitNodeSetEvent(node_uninit, event2), hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("input event is uninitialized") {
|
||||
hipEvent_t event_uninit{};
|
||||
REQUIRE(hipErrorInvalidValue == hipGraphEventWaitNodeSetEvent(
|
||||
eventwait, event_uninit));
|
||||
HIP_CHECK_ERROR(hipGraphEventWaitNodeSetEvent(eventwait, event_uninit), hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
HIP_CHECK(hipGraphDestroy(graph));
|
||||
|
||||
@@ -20,26 +20,51 @@ THE SOFTWARE.
|
||||
#include <hip_test_common.hh>
|
||||
|
||||
/**
|
||||
Negative Testcase Scenarios :
|
||||
1) Pass hipGraphExecDestroy with nullptr.
|
||||
2) Pass hipGraphExecDestroy with an uninitialized structure.
|
||||
3) Destroy graph before exec-graph destroyed and verify no crash occurs.
|
||||
*/
|
||||
* @addtogroup hipGraphExecDestroy hipGraphExecDestroy
|
||||
* @{
|
||||
* @ingroup GraphTest
|
||||
* `hipGraphExecDestroy(hipGraphExec_t graphExec)` -
|
||||
* Destroys an executable graph
|
||||
*/
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Test to verify API behavior with invalid arguments:
|
||||
* -# GraphExec is nullptr
|
||||
* -# GraphExec is uninitialized
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - unit/graph/hipGraphExecDestroy.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Unit_hipGraphExecDestroy_Negative_Parameters") {
|
||||
|
||||
TEST_CASE("Unit_hipGraphExecDestroy_Negative") {
|
||||
hipError_t ret;
|
||||
SECTION("Pass hipGraphExecDestroy with nullptr") {
|
||||
ret = hipGraphExecDestroy(nullptr);
|
||||
REQUIRE(hipErrorInvalidValue == ret);
|
||||
HIP_CHECK_ERROR(hipGraphExecDestroy(nullptr), hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("Pass hipGraphExecDestroy with un-initilze structure") {
|
||||
hipGraphExec_t graphExec{};
|
||||
ret = hipGraphExecDestroy(graphExec);
|
||||
REQUIRE(hipErrorInvalidValue == ret);
|
||||
hipGraphExec_t graph_exec{};
|
||||
HIP_CHECK_ERROR(hipGraphExecDestroy(graph_exec), hipErrorInvalidValue);
|
||||
}
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_hipGraphExecDestroy_Sequence") {
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Basic positive test for hipGraphExecDestroy
|
||||
* - create an executable graph and then destroy it
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - unit/graph/hipGraphExecDestroy.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Unit_hipGraphExecDestroy_Positive_Basic") {
|
||||
hipGraph_t graph;
|
||||
hipGraphExec_t graphExec;
|
||||
hipStream_t streamForGraph;
|
||||
@@ -70,4 +95,3 @@ TEST_CASE("Unit_hipGraphExecDestroy_Sequence") {
|
||||
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
||||
HIP_CHECK(hipStreamDestroy(streamForGraph));
|
||||
}
|
||||
|
||||
|
||||
@@ -33,7 +33,12 @@ Testcase Scenarios :
|
||||
the graph to create an executable graph. Change the event in the
|
||||
executable graph to event2. Verify that the event record node still
|
||||
contains event1.
|
||||
3) Negative Scenarios
|
||||
3) Scenario to verify that hipGraphExecEventRecordNodeSetEvent can set an event
|
||||
created on a different device. Create an event record node with event1 and add it to the graph.
|
||||
Instantiate the graph to create an executable graph. Call the API to change the event in the
|
||||
executable graph to event2, which has been created on a different device. Verify that the graph
|
||||
can be launched and no error is reported.
|
||||
4) Negative Scenarios
|
||||
- Input executable graph is a nullptr.
|
||||
- Input node is a nullptr.
|
||||
- Input event to set is a nullptr.
|
||||
@@ -45,27 +50,26 @@ Testcase Scenarios :
|
||||
- Input node is a event wait node.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip_test_checkers.hh>
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip_test_kernels.hh>
|
||||
|
||||
#define GRID_DIM 512
|
||||
#define BLK_DIM 512
|
||||
#define LEN (GRID_DIM * BLK_DIM)
|
||||
|
||||
/**
|
||||
* Kernel Functions to copy.
|
||||
*/
|
||||
static __global__ void copy_ker_func(int* a, int* b) {
|
||||
int tx = blockIdx.x*blockDim.x + threadIdx.x;
|
||||
if (tx < LEN) b[tx] = a[tx];
|
||||
static __global__ void copy_ker_func(int* a, int* b, size_t N) {
|
||||
int tx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (tx < N) b[tx] = a[tx];
|
||||
}
|
||||
|
||||
/**
|
||||
* Scenario 1: Functional scenario (See description Above)
|
||||
*/
|
||||
TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Functional") {
|
||||
size_t memsize = LEN*sizeof(int);
|
||||
constexpr size_t gridSize = 512;
|
||||
constexpr size_t blockSize = 512;
|
||||
constexpr size_t N = gridSize * blockSize;
|
||||
size_t memsize = N * sizeof(int);
|
||||
hipGraph_t graph;
|
||||
HIP_CHECK(hipGraphCreate(&graph, 0));
|
||||
// Create events
|
||||
@@ -75,10 +79,8 @@ TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Functional") {
|
||||
HIP_CHECK(hipEventCreate(&event2_end));
|
||||
// Create nodes with event_start and event1_end
|
||||
hipGraphNode_t event_start_rec, event_end_rec;
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&event_start_rec, graph, nullptr, 0,
|
||||
event_start));
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&event_end_rec, graph, nullptr, 0,
|
||||
event1_end));
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&event_start_rec, graph, nullptr, 0, event_start));
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&event_end_rec, graph, nullptr, 0, event1_end));
|
||||
int *inp_h, *inp_d, *out_h, *out_d;
|
||||
// Allocate host buffers
|
||||
inp_h = reinterpret_cast<int*>(malloc(memsize));
|
||||
@@ -89,7 +91,7 @@ TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Functional") {
|
||||
HIP_CHECK(hipMalloc(&inp_d, memsize));
|
||||
HIP_CHECK(hipMalloc(&out_d, memsize));
|
||||
// Initialize host buffer
|
||||
for (uint32_t i = 0; i < LEN; i++) {
|
||||
for (uint32_t i = 0; i < N; i++) {
|
||||
inp_h[i] = i;
|
||||
out_h[i] = 0;
|
||||
}
|
||||
@@ -97,44 +99,39 @@ TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Functional") {
|
||||
// Create memcpy and kernel nodes for graph
|
||||
hipGraphNode_t memcpyH2D, memcpyD2H, kernelnode;
|
||||
hipKernelNodeParams kernelNodeParams{};
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph, nullptr, 0, inp_d,
|
||||
inp_h, memsize, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H, graph, nullptr, 0,
|
||||
out_h, out_d, memsize, hipMemcpyDeviceToHost));
|
||||
void* kernelArgs1[] = {&inp_d, &out_d};
|
||||
kernelNodeParams.func = reinterpret_cast<void *>(copy_ker_func);
|
||||
kernelNodeParams.gridDim = dim3(GRID_DIM);
|
||||
kernelNodeParams.blockDim = dim3(BLK_DIM);
|
||||
size_t NElem{N};
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph, nullptr, 0, inp_d, inp_h, memsize,
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H, graph, nullptr, 0, out_h, out_d, memsize,
|
||||
hipMemcpyDeviceToHost));
|
||||
void* kernelArgs1[] = {&inp_d, &out_d, reinterpret_cast<void*>(&NElem)};
|
||||
kernelNodeParams.func = reinterpret_cast<void*>(copy_ker_func);
|
||||
kernelNodeParams.gridDim = dim3(gridSize);
|
||||
kernelNodeParams.blockDim = dim3(blockSize);
|
||||
kernelNodeParams.sharedMemBytes = 0;
|
||||
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs1);
|
||||
kernelNodeParams.extra = nullptr;
|
||||
HIP_CHECK(hipGraphAddKernelNode(&kernelnode, graph, nullptr, 0,
|
||||
&kernelNodeParams));
|
||||
HIP_CHECK(hipGraphAddKernelNode(&kernelnode, graph, nullptr, 0, &kernelNodeParams));
|
||||
|
||||
// Create dependencies for graph
|
||||
HIP_CHECK(hipGraphAddDependencies(graph, &event_start_rec,
|
||||
&memcpyH2D, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph, &memcpyH2D,
|
||||
&kernelnode, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph, &kernelnode,
|
||||
&memcpyD2H, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph, &memcpyD2H,
|
||||
&event_end_rec, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph, &event_start_rec, &memcpyH2D, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph, &memcpyH2D, &kernelnode, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph, &kernelnode, &memcpyD2H, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph, &memcpyD2H, &event_end_rec, 1));
|
||||
// Instantiate and launch the graph
|
||||
hipStream_t streamForGraph;
|
||||
hipGraphExec_t graphExec;
|
||||
HIP_CHECK(hipStreamCreate(&streamForGraph));
|
||||
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
||||
// Change the event at event_end_rec node to event2_end
|
||||
HIP_CHECK(hipGraphExecEventRecordNodeSetEvent(graphExec,
|
||||
event_end_rec, event2_end));
|
||||
HIP_CHECK(hipGraphExecEventRecordNodeSetEvent(graphExec, event_end_rec, event2_end));
|
||||
|
||||
HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
|
||||
// Wait for graph to complete
|
||||
HIP_CHECK(hipStreamSynchronize(streamForGraph));
|
||||
// Validate output
|
||||
bool btestPassed = true;
|
||||
for (uint32_t i = 0; i < LEN; i++) {
|
||||
for (uint32_t i = 0; i < N; i++) {
|
||||
if (out_h[i] != inp_h[i]) {
|
||||
btestPassed = false;
|
||||
break;
|
||||
@@ -147,8 +144,7 @@ TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Functional") {
|
||||
REQUIRE(t > 0.0f);
|
||||
// Since event1_end is never recorded, hipEventElapsedTime
|
||||
// should return error code.
|
||||
REQUIRE(hipErrorInvalidResourceHandle ==
|
||||
hipEventElapsedTime(&t, event_start, event1_end));
|
||||
HIP_CHECK_ERROR(hipEventElapsedTime(&t, event_start, event1_end), hipErrorInvalidResourceHandle);
|
||||
// Free resources
|
||||
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
||||
HIP_CHECK(hipStreamDestroy(streamForGraph));
|
||||
@@ -173,12 +169,10 @@ TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_VerifyEventNotChanged") {
|
||||
HIP_CHECK(hipEventCreate(&event1));
|
||||
HIP_CHECK(hipEventCreate(&event2));
|
||||
hipGraphNode_t eventrec;
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0,
|
||||
event1));
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1));
|
||||
hipGraphExec_t graphExec;
|
||||
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
||||
HIP_CHECK(hipGraphExecEventRecordNodeSetEvent(graphExec,
|
||||
eventrec, event2));
|
||||
HIP_CHECK(hipGraphExecEventRecordNodeSetEvent(graphExec, eventrec, event2));
|
||||
HIP_CHECK(hipGraphEventRecordNodeGetEvent(eventrec, &event_out));
|
||||
// Validate that the event record node in the original graph still holds event1
|
||||
REQUIRE(event1 == event_out);
|
||||
@@ -190,7 +184,48 @@ TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_VerifyEventNotChanged") {
|
||||
}
|
||||
|
||||
/**
|
||||
* Scenario 3: Negative Tests
|
||||
* Scenario 3: This test verifies that the event in a node of the executable graph can be changed
|
||||
* to an event created on a different device
|
||||
*/
|
||||
TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Positive_DifferentDevices") {
  // The scenario uses one event per device, so at least two devices are required.
  const auto device_count = HipTest::getDeviceCount();
  if (device_count < 2) {
    HipTest::HIP_SKIP_TEST("Skipping because devices < 2");
    return;
  }

  hipGraphExec_t graphExec;
  hipStream_t streamForGraph;
  hipGraph_t graph;
  hipEvent_t event1, event2;

  // event1 is created while device 0 is current, event2 while device 1 is current.
  HIP_CHECK(hipSetDevice(0));
  HIP_CHECK(hipEventCreate(&event1));
  HIP_CHECK(hipSetDevice(1));
  HIP_CHECK(hipEventCreate(&event2));

  // Switch back to device 0 before creating the graph, the stream and the executable graph.
  HIP_CHECK(hipSetDevice(0));
  hipGraphNode_t eventrec;
  HIP_CHECK(hipGraphCreate(&graph, 0));
  HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1));

  // Verify event on different device can be set in graphExec
  // Instantiate and launch the graph
  HIP_CHECK(hipStreamCreate(&streamForGraph));
  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
  // Replace event1 with event2 (created under device 1) in the executable graph only.
  HIP_CHECK(hipGraphExecEventRecordNodeSetEvent(graphExec, eventrec, event2));
  HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
  // Wait for graph to complete
  HIP_CHECK(hipStreamSynchronize(streamForGraph));

  // Free resources
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipStreamDestroy(streamForGraph));
  HIP_CHECK(hipGraphDestroy(graph));
  HIP_CHECK(hipEventDestroy(event2));
  HIP_CHECK(hipEventDestroy(event1));
}
|
||||
|
||||
/**
|
||||
* Scenario 4: Negative Parameter Tests
|
||||
*/
|
||||
TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Negative") {
|
||||
hipGraph_t graph;
|
||||
@@ -199,11 +234,10 @@ TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Negative") {
|
||||
HIP_CHECK(hipEventCreate(&event1));
|
||||
HIP_CHECK(hipEventCreate(&event2));
|
||||
hipGraphNode_t eventrec;
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0,
|
||||
event1));
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1));
|
||||
// Create memset
|
||||
constexpr size_t Nbytes = 1024;
|
||||
char *A_d;
|
||||
char* A_d;
|
||||
hipGraphNode_t memset_A;
|
||||
hipMemsetParams memsetParams{};
|
||||
HIP_CHECK(hipMalloc(&A_d, Nbytes));
|
||||
@@ -219,66 +253,61 @@ TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Negative") {
|
||||
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
||||
|
||||
SECTION("hGraphExec = nullptr") {
|
||||
REQUIRE(hipErrorInvalidValue ==
|
||||
hipGraphExecEventRecordNodeSetEvent(nullptr, eventrec, event2));
|
||||
HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(nullptr, eventrec, event2),
|
||||
hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("hNode = nullptr") {
|
||||
REQUIRE(hipErrorInvalidValue ==
|
||||
hipGraphExecEventRecordNodeSetEvent(graphExec, nullptr, event2));
|
||||
HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(graphExec, nullptr, event2),
|
||||
hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("event = nullptr") {
|
||||
REQUIRE(hipErrorInvalidValue ==
|
||||
hipGraphExecEventRecordNodeSetEvent(graphExec, eventrec, nullptr));
|
||||
HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(graphExec, eventrec, nullptr),
|
||||
hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("hGraphExec is uninitialized") {
|
||||
hipGraphExec_t graphExec1{};
|
||||
REQUIRE(hipErrorInvalidValue ==
|
||||
hipGraphExecEventRecordNodeSetEvent(graphExec1, eventrec, event2));
|
||||
HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(graphExec1, eventrec, event2),
|
||||
hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("hNode is uninitialized") {
|
||||
hipGraphNode_t dummy{};
|
||||
REQUIRE(hipErrorInvalidValue ==
|
||||
hipGraphExecEventRecordNodeSetEvent(graphExec, dummy, event2));
|
||||
HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(graphExec, dummy, event2),
|
||||
hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("event is uninitialized") {
|
||||
hipEvent_t event_dummy{};
|
||||
REQUIRE(hipErrorInvalidValue ==
|
||||
hipGraphExecEventRecordNodeSetEvent(graphExec, eventrec,
|
||||
event_dummy));
|
||||
HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(graphExec, eventrec, event_dummy),
|
||||
hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("event record node does not exist") {
|
||||
hipGraph_t graph1;
|
||||
HIP_CHECK(hipGraphCreate(&graph1, 0));
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph1, nullptr, 0,
|
||||
&memsetParams));
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph1, nullptr, 0, &memsetParams));
|
||||
hipGraphExec_t graphExec1;
|
||||
HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0));
|
||||
REQUIRE(hipErrorInvalidValue ==
|
||||
hipGraphExecEventRecordNodeSetEvent(graphExec1, eventrec, event2));
|
||||
HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(graphExec1, eventrec, event2),
|
||||
hipErrorInvalidValue);
|
||||
HIP_CHECK(hipGraphExecDestroy(graphExec1));
|
||||
HIP_CHECK(hipGraphDestroy(graph1));
|
||||
}
|
||||
|
||||
SECTION("pass memset node as hNode") {
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0,
|
||||
&memsetParams));
|
||||
REQUIRE(hipErrorInvalidValue ==
|
||||
hipGraphExecEventRecordNodeSetEvent(graphExec, memset_A, event2));
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, &memsetParams));
|
||||
HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(graphExec, memset_A, event2),
|
||||
hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("pass event wait node as hNode") {
|
||||
hipGraphNode_t event_wait_node;
|
||||
HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph, nullptr, 0,
|
||||
event1));
|
||||
REQUIRE(hipErrorInvalidValue ==
|
||||
hipGraphExecEventRecordNodeSetEvent(graphExec, event_wait_node,
|
||||
event2));
|
||||
HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph, nullptr, 0, event1));
|
||||
HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(graphExec, event_wait_node, event2),
|
||||
hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
HIP_CHECK(hipFree(A_d));
|
||||
|
||||
@@ -47,33 +47,30 @@ Testcase Scenarios :
|
||||
- Pass event record node as input node.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip_test_checkers.hh>
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip_test_kernels.hh>
|
||||
|
||||
#define GRID_DIM 64
|
||||
#define BLK_DIM 256
|
||||
#define LEN (GRID_DIM * BLK_DIM)
|
||||
#define DELAY_IN_MS 2000
|
||||
|
||||
/**
|
||||
* Kernel Functions to perform square and introduce delay in device.
|
||||
*/
|
||||
static __global__ void sqr_ker_func(int* a, int* b, int clockrate) {
|
||||
int tx = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x;
|
||||
if (tx < LEN) b[tx] = a[tx]*a[tx];
|
||||
uint64_t wait_t = DELAY_IN_MS,
|
||||
start = clock64()/clockrate, cur;
|
||||
do { cur = clock64()/clockrate - start;}while (cur < wait_t);
|
||||
static __global__ void sqr_ker_func(int* a, int* b, size_t N, int clockrate, size_t delayMs) {
|
||||
int tx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
|
||||
if (tx < N) b[tx] = a[tx] * a[tx];
|
||||
uint64_t wait_t = delayMs, start = clock64() / clockrate, cur;
|
||||
do {
|
||||
cur = clock64() / clockrate - start;
|
||||
} while (cur < wait_t);
|
||||
}
|
||||
|
||||
static __global__ void sqr_ker_func_gfx11(int* a, int* b, int clockrate) {
|
||||
static __global__ void sqr_ker_func_gfx11(int* a, int* b, size_t N, int clockrate, size_t delayMs) {
|
||||
#if HT_AMD
|
||||
int tx = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x;
|
||||
if (tx < LEN) b[tx] = a[tx]*a[tx];
|
||||
uint64_t wait_t = DELAY_IN_MS,
|
||||
start = wall_clock64()/clockrate, cur;
|
||||
do { cur = wall_clock64()/clockrate - start;}while (cur < wait_t);
|
||||
int tx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
|
||||
if (tx < N) b[tx] = a[tx] * a[tx];
|
||||
uint64_t wait_t = delayMs, start = wall_clock64() / clockrate, cur;
|
||||
do {
|
||||
cur = wall_clock64() / clockrate - start;
|
||||
} while (cur < wait_t);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -81,7 +78,10 @@ static __global__ void sqr_ker_func_gfx11(int* a, int* b, int clockrate) {
|
||||
* Scenario 1: Test to validate setting different events in executable graph.
|
||||
*/
|
||||
TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") {
|
||||
size_t memsize = LEN*sizeof(int);
|
||||
constexpr size_t gridSize = 64;
|
||||
constexpr size_t blockSize = 256;
|
||||
constexpr size_t N = gridSize * blockSize;
|
||||
size_t memsize = N * sizeof(int);
|
||||
hipGraph_t graph1, graph2;
|
||||
HIP_CHECK(hipGraphCreate(&graph1, 0));
|
||||
HIP_CHECK(hipGraphCreate(&graph2, 0));
|
||||
@@ -91,8 +91,7 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") {
|
||||
HIP_CHECK(hipEventCreate(&event2));
|
||||
// Create an event record node with event1 in graph1
|
||||
hipGraphNode_t event_rec;
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&event_rec, graph1, nullptr, 0,
|
||||
event1));
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&event_rec, graph1, nullptr, 0, event1));
|
||||
int *inp_h, *inp_d, *out_h, *out_d;
|
||||
// Allocate host buffers
|
||||
inp_h = reinterpret_cast<int*>(malloc(memsize));
|
||||
@@ -103,7 +102,7 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") {
|
||||
HIP_CHECK(hipMalloc(&inp_d, memsize));
|
||||
HIP_CHECK(hipMalloc(&out_d, memsize));
|
||||
// Initialize host buffer
|
||||
for (uint32_t i = 0; i < LEN; i++) {
|
||||
for (uint32_t i = 0; i < N; i++) {
|
||||
inp_h[i] = i;
|
||||
out_h[i] = 0;
|
||||
}
|
||||
@@ -112,10 +111,12 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") {
|
||||
// MemcpyH2D -> kernel1 -> event_rec
|
||||
hipGraphNode_t memcpyH2D, kernelnode1;
|
||||
hipKernelNodeParams kernelNodeParams1{};
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph1, nullptr, 0, inp_d,
|
||||
inp_h, memsize, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph1, nullptr, 0, inp_d, inp_h, memsize,
|
||||
hipMemcpyHostToDevice));
|
||||
// Get device clock rate
|
||||
int clkRate = 0;
|
||||
size_t NElem{N};
|
||||
size_t delayMs{2000};
|
||||
if (IsGfx11()) {
|
||||
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
|
||||
} else {
|
||||
@@ -123,29 +124,25 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") {
|
||||
}
|
||||
// kernel1
|
||||
auto sqr_ker_func_used = IsGfx11() ? sqr_ker_func_gfx11 : sqr_ker_func;
|
||||
void* kernelArgs[] = {&inp_d, &out_d, reinterpret_cast<void *>(&clkRate)};
|
||||
kernelNodeParams1.func = reinterpret_cast<void *>(sqr_ker_func_used);
|
||||
kernelNodeParams1.gridDim = dim3(GRID_DIM);
|
||||
kernelNodeParams1.blockDim = dim3(BLK_DIM);
|
||||
void* kernelArgs[] = {&inp_d, &out_d, reinterpret_cast<void*>(&NElem),
|
||||
reinterpret_cast<void*>(&clkRate), reinterpret_cast<void*>(&delayMs)};
|
||||
kernelNodeParams1.func = reinterpret_cast<void*>(sqr_ker_func_used);
|
||||
kernelNodeParams1.gridDim = dim3(gridSize);
|
||||
kernelNodeParams1.blockDim = dim3(blockSize);
|
||||
kernelNodeParams1.sharedMemBytes = 0;
|
||||
kernelNodeParams1.kernelParams = reinterpret_cast<void**>(kernelArgs);
|
||||
kernelNodeParams1.extra = nullptr;
|
||||
HIP_CHECK(hipGraphAddKernelNode(&kernelnode1, graph1, nullptr, 0,
|
||||
&kernelNodeParams1));
|
||||
HIP_CHECK(hipGraphAddKernelNode(&kernelnode1, graph1, nullptr, 0, &kernelNodeParams1));
|
||||
// Create dependencies for graph1
|
||||
HIP_CHECK(hipGraphAddDependencies(graph1, &memcpyH2D,
|
||||
&kernelnode1, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph1, &kernelnode1,
|
||||
&event_rec, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph1, &memcpyH2D, &kernelnode1, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph1, &kernelnode1, &event_rec, 1));
|
||||
// graph2 creation ...........
|
||||
// waitnode(event1) -> MemcpyD2H
|
||||
hipGraphNode_t event_wait_node, memcpyD2H;
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H, graph2, nullptr, 0,
|
||||
out_h, out_d, memsize, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph2, nullptr, 0,
|
||||
event1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph2, &event_wait_node,
|
||||
&memcpyD2H, 1));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H, graph2, nullptr, 0, out_h, out_d, memsize,
|
||||
hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph2, nullptr, 0, event1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph2, &event_wait_node, &memcpyD2H, 1));
|
||||
// Instantiate graph1 and graph2
|
||||
hipStream_t streamForGraph1, streamForGraph2;
|
||||
hipGraphExec_t graphExec1, graphExec2;
|
||||
@@ -160,8 +157,8 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") {
|
||||
HIP_CHECK(hipStreamSynchronize(streamForGraph2));
|
||||
// Validate output
|
||||
bool btestPassed = true;
|
||||
for (uint32_t i = 0; i < LEN; i++) {
|
||||
if (out_h[i] != (inp_h[i]*inp_h[i])) {
|
||||
for (uint32_t i = 0; i < N; i++) {
|
||||
if (out_h[i] != (inp_h[i] * inp_h[i])) {
|
||||
btestPassed = false;
|
||||
break;
|
||||
}
|
||||
@@ -170,10 +167,8 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") {
|
||||
// hipGraphExecEventWaitNodeSetEvent() TEST
|
||||
// Change the event at event_wait_node node to event2 and
|
||||
// the event at event_rec node to event2.
|
||||
HIP_CHECK(hipGraphExecEventRecordNodeSetEvent(graphExec1,
|
||||
event_rec, event2));
|
||||
HIP_CHECK(hipGraphExecEventWaitNodeSetEvent(graphExec2,
|
||||
event_wait_node, event2));
|
||||
HIP_CHECK(hipGraphExecEventRecordNodeSetEvent(graphExec1, event_rec, event2));
|
||||
HIP_CHECK(hipGraphExecEventWaitNodeSetEvent(graphExec2, event_wait_node, event2));
|
||||
// Launch graph1 and graph2
|
||||
HIP_CHECK(hipGraphLaunch(graphExec1, streamForGraph1));
|
||||
HIP_CHECK(hipGraphLaunch(graphExec2, streamForGraph2));
|
||||
@@ -181,8 +176,8 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") {
|
||||
HIP_CHECK(hipStreamSynchronize(streamForGraph2));
|
||||
// Validate output
|
||||
btestPassed = true;
|
||||
for (uint32_t i = 0; i < LEN; i++) {
|
||||
if (out_h[i] != (inp_h[i]*inp_h[i])) {
|
||||
for (uint32_t i = 0; i < N; i++) {
|
||||
if (out_h[i] != (inp_h[i] * inp_h[i])) {
|
||||
btestPassed = false;
|
||||
break;
|
||||
}
|
||||
@@ -214,12 +209,10 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_VerifyEventNotChanged") {
|
||||
HIP_CHECK(hipEventCreate(&event1));
|
||||
HIP_CHECK(hipEventCreate(&event2));
|
||||
hipGraphNode_t eventwait;
|
||||
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0,
|
||||
event1));
|
||||
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event1));
|
||||
hipGraphExec_t graphExec;
|
||||
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
||||
HIP_CHECK(hipGraphExecEventWaitNodeSetEvent(graphExec,
|
||||
eventwait, event2));
|
||||
HIP_CHECK(hipGraphExecEventWaitNodeSetEvent(graphExec, eventwait, event2));
|
||||
HIP_CHECK(hipGraphEventWaitNodeGetEvent(eventwait, &event_out));
|
||||
// Validate that the event wait node in the original graph still holds event1
|
||||
REQUIRE(event1 == event_out);
|
||||
@@ -240,13 +233,11 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_Negative") {
|
||||
HIP_CHECK(hipEventCreate(&event1));
|
||||
HIP_CHECK(hipEventCreate(&event2));
|
||||
hipGraphNode_t eventrec, eventwait;
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0,
|
||||
event1));
|
||||
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0,
|
||||
event1));
|
||||
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1));
|
||||
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event1));
|
||||
// Create memset
|
||||
constexpr size_t Nbytes = 1024;
|
||||
char *A_d;
|
||||
char* A_d;
|
||||
hipGraphNode_t memset_A;
|
||||
hipMemsetParams memsetParams{};
|
||||
HIP_CHECK(hipMalloc(&A_d, Nbytes));
|
||||
@@ -262,62 +253,59 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_Negative") {
|
||||
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
||||
|
||||
SECTION("hGraphExec = nullptr") {
|
||||
REQUIRE(hipErrorInvalidValue ==
|
||||
hipGraphExecEventWaitNodeSetEvent(nullptr, eventwait, event2));
|
||||
HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(nullptr, eventwait, event2),
|
||||
hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("hNode = nullptr") {
|
||||
REQUIRE(hipErrorInvalidValue ==
|
||||
hipGraphExecEventWaitNodeSetEvent(graphExec, nullptr, event2));
|
||||
HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(graphExec, nullptr, event2),
|
||||
hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("event = nullptr") {
|
||||
REQUIRE(hipErrorInvalidValue ==
|
||||
hipGraphExecEventWaitNodeSetEvent(graphExec, eventwait, nullptr));
|
||||
HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(graphExec, eventwait, nullptr),
|
||||
hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("hGraphExec is uninitialized") {
|
||||
hipGraphExec_t graphExec1{};
|
||||
REQUIRE(hipErrorInvalidValue ==
|
||||
hipGraphExecEventWaitNodeSetEvent(graphExec1, eventwait, event2));
|
||||
HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(graphExec1, eventwait, event2),
|
||||
hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("hNode is uninitialized") {
|
||||
hipGraphNode_t dummy{};
|
||||
REQUIRE(hipErrorInvalidValue ==
|
||||
hipGraphExecEventWaitNodeSetEvent(graphExec, dummy, event2));
|
||||
HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(graphExec, dummy, event2),
|
||||
hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("event is uninitialized") {
|
||||
hipEvent_t event_dummy{};
|
||||
REQUIRE(hipErrorInvalidValue ==
|
||||
hipGraphExecEventWaitNodeSetEvent(graphExec, eventwait,
|
||||
event_dummy));
|
||||
HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(graphExec, eventwait, event_dummy),
|
||||
hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("event wait node does not exist") {
|
||||
hipGraph_t graph1;
|
||||
HIP_CHECK(hipGraphCreate(&graph1, 0));
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph1, nullptr, 0,
|
||||
&memsetParams));
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph1, nullptr, 0, &memsetParams));
|
||||
hipGraphExec_t graphExec1;
|
||||
HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0));
|
||||
REQUIRE(hipErrorInvalidValue ==
|
||||
hipGraphExecEventWaitNodeSetEvent(graphExec1, eventwait, event2));
|
||||
HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(graphExec1, eventwait, event2),
|
||||
hipErrorInvalidValue);
|
||||
HIP_CHECK(hipGraphExecDestroy(graphExec1));
|
||||
HIP_CHECK(hipGraphDestroy(graph1));
|
||||
}
|
||||
|
||||
SECTION("pass memset node as hNode") {
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0,
|
||||
&memsetParams));
|
||||
REQUIRE(hipErrorInvalidValue ==
|
||||
hipGraphExecEventWaitNodeSetEvent(graphExec, memset_A, event2));
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, &memsetParams));
|
||||
HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(graphExec, memset_A, event2),
|
||||
hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
SECTION("pass event record node as hNode") {
|
||||
REQUIRE(hipErrorInvalidValue ==
|
||||
hipGraphExecEventWaitNodeSetEvent(graphExec, eventrec, event2));
|
||||
HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(graphExec, eventrec, event2),
|
||||
hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
HIP_CHECK(hipFree(A_d));
|
||||
|
||||
@@ -27,22 +27,6 @@ THE SOFTWARE.
|
||||
* and perform the update if possible.
|
||||
*/
|
||||
|
||||
/**
|
||||
Testcase Scenarios :
|
||||
Functional-
|
||||
1) Make a clone of the created graph and update the executable-graph from a clone or same graph again.
|
||||
2) Update the executable-graph from a graph and make sure they are taking effect.
|
||||
Negative-
|
||||
1) When Pass hGraphExec as nullptr and verify api returns error code.
|
||||
2) When Pass hGraph as nullptr and verify api returns error code.
|
||||
3) When Pass hErrorNode_out as nullptr and verify api returns error code.
|
||||
4) When Pass updateResult_out as nullptr and verify api returns error code.
|
||||
5) When a graphExec is updated with a different type of node, verify the API returns an error code.
|
||||
6) When a node is deleted in hGraph but not its pair from hGraphExec and verify api returns error code.
|
||||
7) When a node is deleted in hGraphExec but not its pair from hGraph and verify api returns error code.
|
||||
8) When graph dependencies differ but the graphs have the same nodes, verify the API returns an error code.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip_test_checkers.hh>
|
||||
#include <hip_test_kernels.hh>
|
||||
@@ -65,13 +49,11 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_Basic") {
|
||||
hipGraphNode_t hErrorNode_out{};
|
||||
hipGraphExecUpdateResult updateResult_out{};
|
||||
SECTION("Pass hGraphExec as nullptr") {
|
||||
ret = hipGraphExecUpdate(nullptr, graph, &hErrorNode_out,
|
||||
&updateResult_out);
|
||||
ret = hipGraphExecUpdate(nullptr, graph, &hErrorNode_out, &updateResult_out);
|
||||
REQUIRE(hipErrorInvalidValue == ret);
|
||||
}
|
||||
SECTION("Pass hGraph as nullptr") {
|
||||
ret = hipGraphExecUpdate(graphExec, nullptr, &hErrorNode_out,
|
||||
&updateResult_out);
|
||||
ret = hipGraphExecUpdate(graphExec, nullptr, &hErrorNode_out, &updateResult_out);
|
||||
REQUIRE(hipErrorInvalidValue == ret);
|
||||
}
|
||||
SECTION("Pass hErrorNode_out as nullptr") {
|
||||
@@ -101,10 +83,9 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_TypeChange") {
|
||||
constexpr size_t N = 1024;
|
||||
constexpr size_t Nbytes = N * sizeof(char);
|
||||
constexpr size_t val = 0;
|
||||
char *devData;
|
||||
char* devData;
|
||||
int *A_d, *A_h;
|
||||
HipTest::initArrays<int>(&A_d, nullptr, nullptr,
|
||||
&A_h, nullptr, nullptr, N, false);
|
||||
HipTest::initArrays<int>(&A_d, nullptr, nullptr, &A_h, nullptr, nullptr, N, false);
|
||||
HIP_CHECK(hipMalloc(&devData, Nbytes));
|
||||
hipGraph_t graph, graph2;
|
||||
hipGraphExec_t graphExec;
|
||||
@@ -122,18 +103,16 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_TypeChange") {
|
||||
memsetParams.elementSize = sizeof(char);
|
||||
memsetParams.width = Nbytes;
|
||||
memsetParams.height = 1;
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0,
|
||||
&memsetParams));
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0, &memsetParams));
|
||||
std::vector<hipGraphNode_t> dependencies;
|
||||
dependencies.push_back(memsetNode);
|
||||
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
||||
HIP_CHECK(hipGraphCreate(&graph2, 0));
|
||||
HIP_CHECK(hipStreamCreate(&streamForGraph));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h,
|
||||
Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
// graphExec was created before memcpyTemp was added to graph.
|
||||
ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out,
|
||||
&updateResult_out);
|
||||
ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, &updateResult_out);
|
||||
REQUIRE(hipGraphExecUpdateErrorNodeTypeChanged == updateResult_out);
|
||||
REQUIRE(hipErrorGraphExecUpdateFailure == ret);
|
||||
HIP_CHECK(hipFree(devData));
|
||||
@@ -164,7 +143,7 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_CountDiffer") {
|
||||
int *A_d, *B_d, *C_d;
|
||||
int *A_h, *B_h, *C_h;
|
||||
size_t NElem{N};
|
||||
int *hData = reinterpret_cast<int*>(malloc(Nbytes));
|
||||
int* hData = reinterpret_cast<int*>(malloc(Nbytes));
|
||||
REQUIRE(hData != nullptr);
|
||||
memset(hData, 0, Nbytes);
|
||||
hipGraphNode_t memcpy_A, memcpy_B, memcpy_C, memcpyTemp;
|
||||
@@ -180,57 +159,52 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_CountDiffer") {
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
||||
HIP_CHECK(hipGraphCreate(&graph1, 0));
|
||||
HIP_CHECK(hipStreamCreate(&streamForGraph));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h,
|
||||
Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h,
|
||||
Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph1, nullptr, 0, C_h, C_d,
|
||||
Nbytes, hipMemcpyDeviceToHost));
|
||||
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void *>(&NElem)};
|
||||
kernelNodeParams.func = reinterpret_cast<void *>(HipTest::vectorADD<int>);
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph1, nullptr, 0, C_h, C_d, Nbytes,
|
||||
hipMemcpyDeviceToHost));
|
||||
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
|
||||
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
|
||||
kernelNodeParams.gridDim = dim3(blocks);
|
||||
kernelNodeParams.blockDim = dim3(threadsPerBlock);
|
||||
kernelNodeParams.sharedMemBytes = 0;
|
||||
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
|
||||
kernelNodeParams.extra = nullptr;
|
||||
HIP_CHECK(hipGraphAddKernelNode(&kernel_vecAdd, graph1, nullptr, 0,
|
||||
&kernelNodeParams));
|
||||
HIP_CHECK(hipGraphAddKernelNode(&kernel_vecAdd, graph1, nullptr, 0, &kernelNodeParams));
|
||||
// Create dependencies
|
||||
HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_A, &kernel_vecAdd, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_B, &kernel_vecAdd, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph1, &kernel_vecAdd, &memcpy_C, 1));
|
||||
// Create a cloned graph and add an extra node to it
|
||||
HIP_CHECK(hipGraphClone(&graph2, graph1));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyTemp, graph2, nullptr, 0,
|
||||
C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyTemp, graph2, nullptr, 0, C_h, C_d, Nbytes,
|
||||
hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0));
|
||||
HIP_CHECK(hipGraphInstantiate(&graphExec2, graph2, nullptr, nullptr, 0));
|
||||
SECTION("When a node deleted from Graph but not from its pair GraphExec") {
|
||||
ret = hipGraphExecUpdate(graphExec2, graph1, &hErrorNode_out,
|
||||
&updateResult_out);
|
||||
ret = hipGraphExecUpdate(graphExec2, graph1, &hErrorNode_out, &updateResult_out);
|
||||
REQUIRE(hipErrorGraphExecUpdateFailure == ret);
|
||||
}
|
||||
SECTION("When a node deleted from GraphExec but not from its pair Graph") {
|
||||
ret = hipGraphExecUpdate(graphExec1, graph2, &hErrorNode_out,
|
||||
&updateResult_out);
|
||||
ret = hipGraphExecUpdate(graphExec1, graph2, &hErrorNode_out, &updateResult_out);
|
||||
REQUIRE(hipErrorGraphExecUpdateFailure == ret);
|
||||
}
|
||||
SECTION("When the dependent nodes of a pair differ") {
|
||||
HIP_CHECK(hipGraphCreate(&graph3, 0));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph3, nullptr, 0, A_d, A_h,
|
||||
Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph3, nullptr, 0, B_d, B_h,
|
||||
Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph3, nullptr, 0, C_h, C_d,
|
||||
Nbytes, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipGraphAddKernelNode(&kernel_vecAdd, graph3, nullptr, 0,
|
||||
&kernelNodeParams));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph3, nullptr, 0, A_d, A_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph3, nullptr, 0, B_d, B_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph3, nullptr, 0, C_h, C_d, Nbytes,
|
||||
hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipGraphAddKernelNode(&kernel_vecAdd, graph3, nullptr, 0, &kernelNodeParams));
|
||||
// Create dependencies
|
||||
HIP_CHECK(hipGraphAddDependencies(graph3, &memcpy_A, &kernel_vecAdd, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph3, &memcpy_B, &kernel_vecAdd, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph3, &memcpy_C, &kernel_vecAdd, 1));
|
||||
ret = hipGraphExecUpdate(graphExec1, graph3, &hErrorNode_out,
|
||||
&updateResult_out);
|
||||
ret = hipGraphExecUpdate(graphExec1, graph3, &hErrorNode_out, &updateResult_out);
|
||||
REQUIRE(hipErrorGraphExecUpdateFailure == ret);
|
||||
HIP_CHECK(hipGraphDestroy(graph3));
|
||||
}
|
||||
@@ -265,7 +239,7 @@ TEST_CASE("Unit_hipGraphExecUpdate_Functional") {
|
||||
int *A_d, *B_d, *C_d;
|
||||
int *A_h, *B_h, *C_h;
|
||||
size_t NElem{N};
|
||||
int *hData = reinterpret_cast<int*>(malloc(Nbytes));
|
||||
int* hData = reinterpret_cast<int*>(malloc(Nbytes));
|
||||
REQUIRE(hData != nullptr);
|
||||
memset(hData, 0, Nbytes);
|
||||
hipGraphNode_t memcpy_A, memcpy_B, memcpy_C;
|
||||
@@ -280,22 +254,20 @@ TEST_CASE("Unit_hipGraphExecUpdate_Functional") {
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
||||
HIP_CHECK(hipGraphCreate(&graph, 0));
|
||||
HIP_CHECK(hipStreamCreate(&streamForGraph));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph, nullptr, 0, A_d, A_h,
|
||||
Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph, nullptr, 0, B_d, B_h,
|
||||
Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph, nullptr, 0, C_h, C_d,
|
||||
Nbytes, hipMemcpyDeviceToHost));
|
||||
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void *>(&NElem)};
|
||||
kernelNodeParams.func =
|
||||
reinterpret_cast<void *>(HipTest::vector_square<int>);
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph, nullptr, 0, A_d, A_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph, nullptr, 0, B_d, B_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph, nullptr, 0, C_h, C_d, Nbytes,
|
||||
hipMemcpyDeviceToHost));
|
||||
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
|
||||
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vector_square<int>);
|
||||
kernelNodeParams.gridDim = dim3(blocks);
|
||||
kernelNodeParams.blockDim = dim3(threadsPerBlock);
|
||||
kernelNodeParams.sharedMemBytes = 0;
|
||||
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
|
||||
kernelNodeParams.extra = nullptr;
|
||||
HIP_CHECK(hipGraphAddKernelNode(&kernel_vecSquare, graph, nullptr, 0,
|
||||
&kernelNodeParams));
|
||||
HIP_CHECK(hipGraphAddKernelNode(&kernel_vecSquare, graph, nullptr, 0, &kernelNodeParams));
|
||||
// Create dependencies
|
||||
HIP_CHECK(hipGraphAddDependencies(graph, &memcpy_A, &kernel_vecSquare, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph, &memcpy_B, &kernel_vecSquare, 1));
|
||||
@@ -304,36 +276,32 @@ TEST_CASE("Unit_hipGraphExecUpdate_Functional") {
|
||||
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
||||
SECTION("Update graphExec with clone graph") {
|
||||
HIP_CHECK(hipGraphClone(&clonedgraph, graph));
|
||||
HIP_CHECK(hipGraphExecUpdate(graphExec, clonedgraph, &hErrorNode_out,
|
||||
&updateResult_out));
|
||||
HIP_CHECK(hipGraphExecUpdate(graphExec, clonedgraph, &hErrorNode_out, &updateResult_out));
|
||||
}
|
||||
// Code for new graph creation with similar node setup
|
||||
HIP_CHECK(hipGraphCreate(&graph2, 0));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h,
|
||||
Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h,
|
||||
Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_h, C_d,
|
||||
Nbytes, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipGraphMemcpyNodeSetParams1D(memcpy_C, hData, C_d, Nbytes,
|
||||
hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_h, C_d, Nbytes,
|
||||
hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipGraphMemcpyNodeSetParams1D(memcpy_C, hData, C_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
memset(&kernelNodeParams, 0, sizeof(hipKernelNodeParams));
|
||||
void* kernelArgs2[] = {&A_d, &B_d, &C_d, reinterpret_cast<void *>(&NElem)};
|
||||
kernelNodeParams.func = reinterpret_cast<void *>(HipTest::vectorADD<int>);
|
||||
void* kernelArgs2[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
|
||||
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
|
||||
kernelNodeParams.gridDim = dim3(blocks);
|
||||
kernelNodeParams.blockDim = dim3(threadsPerBlock);
|
||||
kernelNodeParams.sharedMemBytes = 0;
|
||||
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs2);
|
||||
kernelNodeParams.extra = nullptr;
|
||||
HIP_CHECK(hipGraphAddKernelNode(&kernel_vecAdd, graph2, nullptr, 0,
|
||||
&kernelNodeParams));
|
||||
HIP_CHECK(hipGraphAddKernelNode(&kernel_vecAdd, graph2, nullptr, 0, &kernelNodeParams));
|
||||
// Create dependencies
|
||||
HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_A, &kernel_vecAdd, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_B, &kernel_vecAdd, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph2, &kernel_vecAdd, &memcpy_C, 1));
|
||||
// Update the graphExec graph from graph -> graph2
|
||||
HIP_CHECK(hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out,
|
||||
&updateResult_out));
|
||||
HIP_CHECK(hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, &updateResult_out));
|
||||
REQUIRE(updateResult_out == hipGraphExecUpdateSuccess);
|
||||
HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
|
||||
HIP_CHECK(hipStreamSynchronize(streamForGraph));
|
||||
@@ -380,24 +348,22 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_Functional_ParametersChanged") {
|
||||
hipGraphExecUpdateResult updateResult_out;
|
||||
HipTest::initArrays<int>(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
||||
HIP_CHECK(hipGraphCreate(&graph1, 0));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h,
|
||||
Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0));
|
||||
SECTION("Update graphExec with similar graph and verify") {
|
||||
HIP_CHECK(hipGraphCreate(&graph2, 0));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h,
|
||||
Nbytes, hipMemcpyHostToDevice));
|
||||
ret = hipGraphExecUpdate(graphExec1, graph2, &hErrorNode_out,
|
||||
&updateResult_out);
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
ret = hipGraphExecUpdate(graphExec1, graph2, &hErrorNode_out, &updateResult_out);
|
||||
REQUIRE(hipSuccess == ret);
|
||||
HIP_CHECK(hipGraphDestroy(graph2));
|
||||
}
|
||||
SECTION("Update graphExec with similar graph and verify") {
|
||||
HIP_CHECK(hipGraphCreate(&graph3, 0));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph3, nullptr, 0, B_h, B_d,
|
||||
Nbytes, hipMemcpyDeviceToHost));
|
||||
ret = hipGraphExecUpdate(graphExec1, graph3, &hErrorNode_out,
|
||||
&updateResult_out);
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph3, nullptr, 0, B_h, B_d, Nbytes,
|
||||
hipMemcpyDeviceToHost));
|
||||
ret = hipGraphExecUpdate(graphExec1, graph3, &hErrorNode_out, &updateResult_out);
|
||||
|
||||
REQUIRE(hipErrorGraphExecUpdateFailure == ret);
|
||||
REQUIRE(hipGraphExecUpdateErrorParametersChanged == updateResult_out);
|
||||
@@ -437,16 +403,15 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_Functional_CountDiffer_1") {
|
||||
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
||||
HIP_CHECK(hipGraphCreate(&graph1, 0));
|
||||
HIP_CHECK(hipGraphCreate(&graph2, 0));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h,
|
||||
Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h,
|
||||
Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0));
|
||||
// When count of nodes directly differ in graphExec1 and graph2
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_h, C_d,
|
||||
Nbytes, hipMemcpyDeviceToHost));
|
||||
ret = hipGraphExecUpdate(graphExec1, graph2, &hErrorNode_out,
|
||||
&updateResult_out);
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_h, C_d, Nbytes,
|
||||
hipMemcpyDeviceToHost));
|
||||
ret = hipGraphExecUpdate(graphExec1, graph2, &hErrorNode_out, &updateResult_out);
|
||||
|
||||
REQUIRE(hipErrorGraphExecUpdateFailure == ret);
|
||||
#if HT_NVIDIA
|
||||
@@ -495,16 +460,15 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_Functional_CountDiffer_2") {
|
||||
hipGraphExecUpdateResult updateResult_out;
|
||||
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
||||
HIP_CHECK(hipGraphCreate(&graph1, 0));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h,
|
||||
Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h,
|
||||
Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0));
|
||||
// Delete a node from the graph
|
||||
HIP_CHECK(hipGraphDestroyNode(memcpy_B));
|
||||
SECTION("When a node deleted from Graph but not from its pair GraphExec") {
|
||||
ret = hipGraphExecUpdate(graphExec1, graph1, &hErrorNode_out,
|
||||
&updateResult_out);
|
||||
ret = hipGraphExecUpdate(graphExec1, graph1, &hErrorNode_out, &updateResult_out);
|
||||
REQUIRE(hipErrorGraphExecUpdateFailure == ret);
|
||||
REQUIRE(hipGraphExecUpdateErrorTopologyChanged == updateResult_out);
|
||||
#if HT_NVIDIA
|
||||
@@ -513,11 +477,10 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_Functional_CountDiffer_2") {
|
||||
}
|
||||
SECTION("Update the GraphExec with similar graph where a node get deleted") {
|
||||
HIP_CHECK(hipGraphCreate(&graph2, 0));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_d, C_h,
|
||||
Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_d, C_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphInstantiate(&graphExec2, graph2, nullptr, nullptr, 0));
|
||||
ret = hipGraphExecUpdate(graphExec2, graph1, &hErrorNode_out,
|
||||
&updateResult_out);
|
||||
ret = hipGraphExecUpdate(graphExec2, graph1, &hErrorNode_out, &updateResult_out);
|
||||
#if HT_NVIDIA
|
||||
REQUIRE(hipErrorGraphExecUpdateFailure == ret);
|
||||
REQUIRE(hipGraphExecUpdateErrorNotSupported == updateResult_out);
|
||||
@@ -529,13 +492,12 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_Functional_CountDiffer_2") {
|
||||
}
|
||||
SECTION("When A node is deleted in GraphExec but not its pair from Graph") {
|
||||
HIP_CHECK(hipGraphCreate(&graph3, 0));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph3, nullptr, 0, A_d, A_h,
|
||||
Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph3, nullptr, 0, A_d, A_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphInstantiate(&graphExec3, graph3, nullptr, nullptr, 0));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph3, nullptr, 0, B_d, B_h,
|
||||
Nbytes, hipMemcpyHostToDevice));
|
||||
ret = hipGraphExecUpdate(graphExec3, graph3, &hErrorNode_out,
|
||||
&updateResult_out);
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph3, nullptr, 0, B_d, B_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
ret = hipGraphExecUpdate(graphExec3, graph3, &hErrorNode_out, &updateResult_out);
|
||||
REQUIRE(hipErrorGraphExecUpdateFailure == ret);
|
||||
#if HT_NVIDIA
|
||||
REQUIRE(hipGraphExecUpdateErrorNotSupported == updateResult_out);
|
||||
@@ -581,27 +543,26 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_Dependent_NodesDiffer") {
|
||||
hipGraphExecUpdateResult updateResult_out;
|
||||
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
||||
HIP_CHECK(hipGraphCreate(&graph1, 0));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h,
|
||||
Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h,
|
||||
Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph1, nullptr, 0, C_d, C_h,
|
||||
Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph1, nullptr, 0, C_d, C_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_A, &memcpy_B, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_B, &memcpy_C, 1));
|
||||
HIP_CHECK(hipGraphInstantiate(&graphExec, graph1, nullptr, nullptr, 0));
|
||||
|
||||
HIP_CHECK(hipGraphCreate(&graph2, 0));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h,
|
||||
Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h,
|
||||
Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_d, C_h,
|
||||
Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_d, C_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_A, &memcpy_C, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_B, &memcpy_C, 1));
|
||||
ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out,
|
||||
&updateResult_out);
|
||||
ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, &updateResult_out);
|
||||
|
||||
REQUIRE(hipErrorGraphExecUpdateFailure == ret);
|
||||
REQUIRE(hipGraphExecUpdateErrorTopologyChanged == updateResult_out);
|
||||
@@ -642,10 +603,10 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_NodeType_Changed") {
|
||||
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
||||
|
||||
HIP_CHECK(hipGraphCreate(&graph1, 0));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h,
|
||||
Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h,
|
||||
Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_A, &memcpy_B, 1));
|
||||
HIP_CHECK(hipGraphInstantiate(&graphExec, graph1, nullptr, nullptr, 0));
|
||||
|
||||
@@ -658,13 +619,11 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_NodeType_Changed") {
|
||||
memsetParams.elementSize = sizeof(char);
|
||||
memsetParams.width = Nbytes;
|
||||
memsetParams.height = 1;
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph2, nullptr, 0,
|
||||
&memsetParams));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h,
|
||||
Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph2, nullptr, 0, &memsetParams));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_A, &memsetNode, 1));
|
||||
ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out,
|
||||
&updateResult_out);
|
||||
ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, &updateResult_out);
|
||||
REQUIRE(hipErrorGraphExecUpdateFailure == ret);
|
||||
#if HT_NVIDIA
|
||||
REQUIRE(hipGraphExecUpdateErrorTopologyChanged == updateResult_out);
|
||||
@@ -726,22 +685,21 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_MultiDevice_Context_Changed") {
|
||||
hipStream_t stream;
|
||||
HIP_CHECK(hipStreamCreate(&stream));
|
||||
HIP_CHECK(hipGraphCreate(&graph1, 0));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h,
|
||||
Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h,
|
||||
Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph1, nullptr, 0, C_h, C_d,
|
||||
Nbytes, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph1, nullptr, 0, C_h, C_d, Nbytes,
|
||||
hipMemcpyDeviceToHost));
|
||||
hipKernelNodeParams kernelNodeParams{};
|
||||
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void *>(&NElem)};
|
||||
kernelNodeParams.func = reinterpret_cast<void *>(HipTest::vectorADD<int>);
|
||||
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
|
||||
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
|
||||
kernelNodeParams.gridDim = dim3(blocks);
|
||||
kernelNodeParams.blockDim = dim3(threadsPerBlock);
|
||||
kernelNodeParams.sharedMemBytes = 0;
|
||||
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
|
||||
kernelNodeParams.extra = nullptr;
|
||||
HIP_CHECK(hipGraphAddKernelNode(&kernel_vecADD, graph1, nullptr, 0,
|
||||
&kernelNodeParams));
|
||||
HIP_CHECK(hipGraphAddKernelNode(&kernel_vecADD, graph1, nullptr, 0, &kernelNodeParams));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_A, &kernel_vecADD, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_B, &kernel_vecADD, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph1, &kernel_vecADD, &memcpy_C, 1));
|
||||
@@ -750,27 +708,25 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_MultiDevice_Context_Changed") {
|
||||
|
||||
HIP_CHECK(hipSetDevice(1));
|
||||
HIP_CHECK(hipGraphCreate(&graph2, 0));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h,
|
||||
Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h,
|
||||
Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_h, C_d,
|
||||
Nbytes, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_h, C_d, Nbytes,
|
||||
hipMemcpyDeviceToHost));
|
||||
memset(&kernelNodeParams, 0x00, sizeof(hipKernelNodeParams));
|
||||
void* kernelArgs1[] = {&A_d, &B_d, &C_d, reinterpret_cast<void *>(&NElem)};
|
||||
kernelNodeParams.func = reinterpret_cast<void *>(HipTest::vectorSUB<int>);
|
||||
void* kernelArgs1[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
|
||||
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorSUB<int>);
|
||||
kernelNodeParams.gridDim = dim3(blocks);
|
||||
kernelNodeParams.blockDim = dim3(threadsPerBlock);
|
||||
kernelNodeParams.sharedMemBytes = 0;
|
||||
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs1);
|
||||
kernelNodeParams.extra = nullptr;
|
||||
HIP_CHECK(hipGraphAddKernelNode(&kernel_vecSUB, graph2, nullptr, 0,
|
||||
&kernelNodeParams));
|
||||
HIP_CHECK(hipGraphAddKernelNode(&kernel_vecSUB, graph2, nullptr, 0, &kernelNodeParams));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_A, &kernel_vecSUB, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_B, &kernel_vecSUB, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph2, &kernel_vecSUB, &memcpy_C, 1));
|
||||
ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out,
|
||||
&updateResult_out);
|
||||
ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, &updateResult_out);
|
||||
|
||||
REQUIRE(hipErrorGraphExecUpdateFailure == ret);
|
||||
REQUIRE(hipGraphExecUpdateErrorUnsupportedFunctionChange == updateResult_out);
|
||||
@@ -819,49 +775,46 @@ TEST_CASE("Unit_hipGraphExecUpdate_Functional_KernelFunction_Changed") {
|
||||
hipStream_t stream;
|
||||
HIP_CHECK(hipStreamCreate(&stream));
|
||||
HIP_CHECK(hipGraphCreate(&graph1, 0));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h,
|
||||
Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h,
|
||||
Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph1, nullptr, 0, C_h, C_d,
|
||||
Nbytes, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph1, nullptr, 0, C_h, C_d, Nbytes,
|
||||
hipMemcpyDeviceToHost));
|
||||
hipKernelNodeParams kernelNodeParams{};
|
||||
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void *>(&NElem)};
|
||||
kernelNodeParams.func = reinterpret_cast<void *>(HipTest::vectorADD<int>);
|
||||
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
|
||||
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
|
||||
kernelNodeParams.gridDim = dim3(blocks);
|
||||
kernelNodeParams.blockDim = dim3(threadsPerBlock);
|
||||
kernelNodeParams.sharedMemBytes = 0;
|
||||
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
|
||||
kernelNodeParams.extra = nullptr;
|
||||
HIP_CHECK(hipGraphAddKernelNode(&kernel_vecADD, graph1, nullptr, 0,
|
||||
&kernelNodeParams));
|
||||
HIP_CHECK(hipGraphAddKernelNode(&kernel_vecADD, graph1, nullptr, 0, &kernelNodeParams));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_A, &kernel_vecADD, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_B, &kernel_vecADD, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph1, &kernel_vecADD, &memcpy_C, 1));
|
||||
HIP_CHECK(hipGraphInstantiate(&graphExec, graph1, nullptr, nullptr, 0));
|
||||
|
||||
HIP_CHECK(hipGraphCreate(&graph2, 0));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h,
|
||||
Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h,
|
||||
Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_h, C_d,
|
||||
Nbytes, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h, Nbytes,
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_h, C_d, Nbytes,
|
||||
hipMemcpyDeviceToHost));
|
||||
memset(&kernelNodeParams, 0x00, sizeof(hipKernelNodeParams));
|
||||
void* kernelArgs1[] = {&A_d, &B_d, &C_d, reinterpret_cast<void *>(&NElem)};
|
||||
kernelNodeParams.func = reinterpret_cast<void *>(HipTest::vectorSUB<int>);
|
||||
void* kernelArgs1[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
|
||||
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorSUB<int>);
|
||||
kernelNodeParams.gridDim = dim3(blocks);
|
||||
kernelNodeParams.blockDim = dim3(threadsPerBlock);
|
||||
kernelNodeParams.sharedMemBytes = 0;
|
||||
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs1);
|
||||
kernelNodeParams.extra = nullptr;
|
||||
HIP_CHECK(hipGraphAddKernelNode(&kernel_vecSUB, graph2, nullptr, 0,
|
||||
&kernelNodeParams));
|
||||
HIP_CHECK(hipGraphAddKernelNode(&kernel_vecSUB, graph2, nullptr, 0, &kernelNodeParams));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_A, &kernel_vecSUB, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_B, &kernel_vecSUB, 1));
|
||||
HIP_CHECK(hipGraphAddDependencies(graph2, &kernel_vecSUB, &memcpy_C, 1));
|
||||
ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out,
|
||||
&updateResult_out);
|
||||
ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, &updateResult_out);
|
||||
REQUIRE(hipSuccess == ret);
|
||||
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
|
||||
@@ -19,394 +19,127 @@ THE SOFTWARE.
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip_test_checkers.hh>
|
||||
#include <hip_test_kernels.hh>
|
||||
/* Test verifies hipGraphLaunch API
|
||||
Negative scenarios -
|
||||
1) Pass graphExec as nullptr and verify api returns error code.
|
||||
2) Pass pGraphExec as nullptr and stream as hipStreamPerThread and verify api returns error code.
|
||||
3) Pass pGraphExec as empty object and verify api returns error code.
|
||||
4) Destroy the executable graph and try to launch it. Make sure the api does not crash and returns an error code.
|
||||
5) Destroy the stream and try to launch the respective executable graph. Make sure the api does not crash and returns an error code.
|
||||
6) Destroy actual graph created and try to launch respective executable graph.
|
||||
Check api should execute properly without crash or error code.
|
||||
Functional Scenario -
|
||||
1) Check basic functionality with stream as hipStreamPerThread
|
||||
2) Test hipGraphLaunch call on multiple devices.
|
||||
3) Create a graph with multiple nodes. Create an executable graph.
|
||||
Launch the executable graph 3 times in stream simultaneously.
|
||||
Wait for stream. Validate the output. No issues should be observed
|
||||
4) Create a graph with multiple nodes. Create an executable graph.
|
||||
Verify that an executable graph can be launched on the null stream.
|
||||
*/
|
||||
|
||||
#define SIZE 1024
|
||||
#define TEST_LOOP_SIZE 3
|
||||
/**
|
||||
* @addtogroup hipGraphLaunch hipGraphLaunch
|
||||
* @{
|
||||
* @ingroup GraphTest
|
||||
* `hipGraphLaunch(hipGraphExec_t graphExec, hipStream_t stream)` -
|
||||
* Launches an executable graph in a stream
|
||||
*/
|
||||
|
||||
TEST_CASE("Unit_hipGraphLaunch_Negative") {
|
||||
hipError_t ret;
|
||||
SECTION("Pass pGraphExec as nullptr") {
|
||||
hipStream_t stream{};
|
||||
/**
 * Host callback that stores 0 into the int `arg` points to.
 *
 * @param arg Type-erased pointer to the int to reset; it is dereferenced
 *            unconditionally, so it must not be null.
 */
static void HostFunctionSetToZero(void* arg) {
  // void* -> int* is a plain static_cast; avoid the C-style cast.
  int* test_number = static_cast<int*>(arg);
  *test_number = 0;
}
|
||||
|
||||
/**
 * Host callback that increments the int `arg` points to by one.
 *
 * @param arg Type-erased pointer to the int to increment; it is dereferenced
 *            unconditionally, so it must not be null.
 */
static void HostFunctionAddOne(void* arg) {
  // void* -> int* is a plain static_cast; avoid the C-style cast.
  int* test_number = static_cast<int*>(arg);
  *test_number += 1;
}
|
||||
|
||||
/* create an executable graph that will set an integer pointed to by 'number' to one*/
|
||||
static void CreateTestExecutableGraph(hipGraphExec_t* graph_exec, int* number) {
|
||||
hipGraph_t graph;
|
||||
hipGraphNode_t node_error;
|
||||
|
||||
hipGraphNode_t node_set_zero;
|
||||
hipHostNodeParams params_set_to_zero = {HostFunctionSetToZero, number};
|
||||
|
||||
hipGraphNode_t node_add_one;
|
||||
hipHostNodeParams params_set_add_one = {HostFunctionAddOne, number};
|
||||
|
||||
HIP_CHECK(hipGraphCreate(&graph, 0));
|
||||
|
||||
HIP_CHECK(hipGraphAddHostNode(&node_set_zero, graph, nullptr, 0, ¶ms_set_to_zero));
|
||||
HIP_CHECK(hipGraphAddHostNode(&node_add_one, graph, &node_set_zero, 1, ¶ms_set_add_one));
|
||||
|
||||
HIP_CHECK(hipGraphInstantiate(graph_exec, graph, &node_error, nullptr, 0));
|
||||
HIP_CHECK(hipGraphDestroy(graph));
|
||||
}
|
||||
|
||||
// Launches the two-node host graph built by CreateTestExecutableGraph on
// `stream`, waits for the stream, and checks that the graph left the
// integer at 1.
static void HipGraphLaunch_Positive_Simple(hipStream_t stream) {
  // Initial value differs from both 0 and 1, so the check below only passes
  // if both host nodes actually ran.
  int number = 5;
  hipGraphExec_t exec_graph;

  CreateTestExecutableGraph(&exec_graph, &number);
  HIP_CHECK(hipGraphLaunch(exec_graph, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  REQUIRE(number == 1);

  HIP_CHECK(hipGraphExecDestroy(exec_graph));
}
|
||||
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Basic positive test for hipGraphLaunch
|
||||
* -# stream as a created stream
|
||||
* -# with stream as hipStreamPerThread
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - unit/graph/hipGraphLaunch.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Unit_hipGraphLaunch_Positive") {
  // Variant 1: launch on a stream created (and destroyed) by this section.
  SECTION("stream as a created stream") {
    hipStream_t created_stream;
    HIP_CHECK(hipStreamCreate(&created_stream));

    HipGraphLaunch_Positive_Simple(created_stream);

    HIP_CHECK(hipStreamDestroy(created_stream));
  }

  // Variant 2: launch on hipStreamPerThread; nothing to create or destroy.
  SECTION("with stream as hipStreamPerThread") {
    HipGraphLaunch_Positive_Simple(hipStreamPerThread);
  }
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Negative parameter test for hipGraphLaunch
|
||||
* -# graphExec is nullptr and stream is a created stream
|
||||
* -# graphExec is nullptr and stream is hipStreamPerThread
|
||||
* -# graphExec is an empty object
|
||||
* -# graphExec is destroyed before calling hipGraphLaunch
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - unit/graph/hipGraphLaunch.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
TEST_CASE("Unit_hipGraphLaunch_Negative_Parameters") {
|
||||
SECTION("graphExec is nullptr and stream is a created stream") {
|
||||
hipStream_t stream;
|
||||
hipError_t ret;
|
||||
HIP_CHECK(hipStreamCreate(&stream));
|
||||
ret = hipGraphLaunch(nullptr, stream);
|
||||
REQUIRE(hipErrorInvalidValue == ret);
|
||||
}
|
||||
SECTION("Pass pGraphExec as nullptr and stream as hipStreamPerThread") {
|
||||
ret = hipGraphLaunch(nullptr, hipStreamPerThread);
|
||||
REQUIRE(hipErrorInvalidValue == ret);
|
||||
}
|
||||
SECTION("Pass pGraphExec as empty object") {
|
||||
hipGraphExec_t graphExec{};
|
||||
hipStream_t stream{};
|
||||
ret = hipGraphLaunch(graphExec, stream);
|
||||
REQUIRE(hipErrorInvalidValue == ret);
|
||||
}
|
||||
SECTION("Destroy executable graph and try to launch it") {
|
||||
constexpr size_t Nbytes = 1024;
|
||||
hipGraph_t graph;
|
||||
hipGraphExec_t graphExec;
|
||||
hipStream_t stream;
|
||||
hipGraphNode_t memsetNode;
|
||||
|
||||
char *devData;
|
||||
HIP_CHECK(hipMalloc(&devData, Nbytes));
|
||||
|
||||
HIP_CHECK(hipGraphCreate(&graph, 0));
|
||||
HIP_CHECK(hipStreamCreate(&stream));
|
||||
|
||||
hipMemsetParams memsetParams{};
|
||||
memset(&memsetParams, 0, sizeof(memsetParams));
|
||||
memsetParams.dst = reinterpret_cast<void*>(devData);
|
||||
memsetParams.value = 0;
|
||||
memsetParams.pitch = 0;
|
||||
memsetParams.elementSize = sizeof(char);
|
||||
memsetParams.width = Nbytes;
|
||||
memsetParams.height = 1;
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0,
|
||||
&memsetParams));
|
||||
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
||||
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
|
||||
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
||||
// Launch again after destroy graph exec object.
|
||||
ret = hipGraphLaunch(graphExec, stream);
|
||||
REQUIRE(hipErrorInvalidValue == ret);
|
||||
|
||||
HIP_CHECK(hipFree(devData));
|
||||
HIP_CHECK(hipGraphDestroy(graph));
|
||||
HIP_CHECK(hipStreamDestroy(stream));
|
||||
REQUIRE(ret == hipErrorInvalidValue);
|
||||
}
|
||||
/* In this case in CUDA setup this api call is giving - unknown error (999)
|
||||
So enabling this test for both AMD and CUDA by checking with hipSuccess */
|
||||
SECTION("Destroy stream and try to launch respective executable graph") {
|
||||
constexpr size_t Nbytes = 1024;
|
||||
hipGraph_t graph;
|
||||
hipGraphExec_t graphExec;
|
||||
hipStream_t stream;
|
||||
hipGraphNode_t memsetNode;
|
||||
|
||||
char *devData;
|
||||
HIP_CHECK(hipMalloc(&devData, Nbytes));
|
||||
|
||||
HIP_CHECK(hipGraphCreate(&graph, 0));
|
||||
HIP_CHECK(hipStreamCreate(&stream));
|
||||
|
||||
hipMemsetParams memsetParams{};
|
||||
memset(&memsetParams, 0, sizeof(memsetParams));
|
||||
memsetParams.dst = reinterpret_cast<void*>(devData);
|
||||
memsetParams.value = 0;
|
||||
memsetParams.pitch = 0;
|
||||
memsetParams.elementSize = sizeof(char);
|
||||
memsetParams.width = Nbytes;
|
||||
memsetParams.height = 1;
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0,
|
||||
&memsetParams));
|
||||
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
||||
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
|
||||
HIP_CHECK(hipStreamDestroy(stream));
|
||||
// Launch again after destroy stream
|
||||
ret = hipGraphLaunch(graphExec, stream);
|
||||
REQUIRE(hipSuccess != ret);
|
||||
|
||||
HIP_CHECK(hipFree(devData));
|
||||
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
||||
HIP_CHECK(hipGraphDestroy(graph));
|
||||
SECTION("graphExec is nullptr and stream is hipStreamPerThread") {
|
||||
HIP_CHECK_ERROR(hipGraphLaunch(nullptr, hipStreamPerThread), hipErrorInvalidValue);
|
||||
}
|
||||
SECTION("Destroy graph and try to launch respective executable graph") {
|
||||
constexpr size_t Nbytes = 1024;
|
||||
hipGraph_t graph;
|
||||
hipGraphExec_t graphExec;
|
||||
hipStream_t stream;
|
||||
hipGraphNode_t memsetNode;
|
||||
|
||||
char *devData;
|
||||
HIP_CHECK(hipMalloc(&devData, Nbytes));
|
||||
SECTION("graphExec is an empty object") {
|
||||
hipGraphExec_t graph_exec{};
|
||||
HIP_CHECK_ERROR(hipGraphLaunch(graph_exec, hipStreamPerThread), hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
HIP_CHECK(hipGraphCreate(&graph, 0));
|
||||
HIP_CHECK(hipStreamCreate(&stream));
|
||||
|
||||
hipMemsetParams memsetParams{};
|
||||
memset(&memsetParams, 0, sizeof(memsetParams));
|
||||
memsetParams.dst = reinterpret_cast<void*>(devData);
|
||||
memsetParams.value = 0;
|
||||
memsetParams.pitch = 0;
|
||||
memsetParams.elementSize = sizeof(char);
|
||||
memsetParams.width = Nbytes;
|
||||
memsetParams.height = 1;
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0,
|
||||
&memsetParams));
|
||||
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
||||
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
|
||||
HIP_CHECK(hipGraphDestroy(graph));
|
||||
// Launch again after destroy graph
|
||||
ret = hipGraphLaunch(graphExec, stream);
|
||||
REQUIRE(hipSuccess == ret);
|
||||
|
||||
HIP_CHECK(hipFree(devData));
|
||||
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
||||
HIP_CHECK(hipStreamDestroy(stream));
|
||||
SECTION("graphExec is destroyed") {
|
||||
int number = 5;
|
||||
hipGraphExec_t graph_exec;
|
||||
CreateTestExecutableGraph(&graph_exec, &number);
|
||||
HIP_CHECK(hipGraphLaunch(graph_exec, hipStreamPerThread));
|
||||
HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));
|
||||
REQUIRE(number == 1);
|
||||
HIP_CHECK(hipGraphExecDestroy(graph_exec));
|
||||
HIP_CHECK_ERROR(hipGraphLaunch(graph_exec, hipStreamPerThread), hipErrorInvalidValue);
|
||||
}
|
||||
}
|
||||
|
||||
// Builds a graph of two chained memset nodes, launches it on
// hipStreamPerThread and verifies that the second memset filled A_d with
// `updateVal`.
TEST_CASE("Unit_hipGraphLaunch_Functional_hipStreamPerThread") {
  constexpr size_t N = 1024;
  constexpr size_t Nbytes = N * sizeof(char);
  constexpr size_t val = 0;
  constexpr size_t updateVal = 2;

  char* A_d{nullptr};
  char* B_d{nullptr};
  char* C_d{nullptr};
  char* A_h{nullptr};
  char* B_h{nullptr};
  HipTest::initArrays<char>(&A_d, &B_d, &C_d, &A_h, &B_h, nullptr, N, false);

  hipGraph_t graph;
  hipGraphExec_t graphExec;
  hipGraphNode_t memsetNode;
  HIP_CHECK(hipGraphCreate(&graph, 0));

  // Root node: memset C_d to `val`.
  hipMemsetParams memsetParams{};
  memset(&memsetParams, 0, sizeof(memsetParams));
  memsetParams.dst = reinterpret_cast<void*>(C_d);
  memsetParams.value = val;
  memsetParams.pitch = 0;
  memsetParams.elementSize = sizeof(char);
  memsetParams.width = Nbytes;
  memsetParams.height = 1;
  HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0, &memsetParams));

  std::vector<hipGraphNode_t> dependencies{memsetNode};

  // Second node, dependent on the first: memset A_d to `updateVal`.
  memset(&memsetParams, 0, sizeof(memsetParams));
  memsetParams.dst = reinterpret_cast<void*>(A_d);
  memsetParams.value = updateVal;
  memsetParams.pitch = 0;
  memsetParams.elementSize = sizeof(char);
  memsetParams.width = Nbytes;
  memsetParams.height = 1;
  HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, dependencies.data(), dependencies.size(),
                                  &memsetParams));
  // Re-applies the same parameters to the node that was just added.
  HIP_CHECK(hipGraphMemsetNodeSetParams(memsetNode, &memsetParams));
  dependencies.push_back(memsetNode);

  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
  HIP_CHECK(hipGraphLaunch(graphExec, hipStreamPerThread));
  HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));

  HIP_CHECK(hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost));

  // Validating the result: fail on the first byte that is not `updateVal`.
  for (size_t i = 0; i < Nbytes; i++) {
    if (A_h[i] == updateVal) {
      continue;
    }
    WARN("Validation failed at- " << i << " A_h[i] " << A_h[i]);
    REQUIRE(false);
  }

  HipTest::freeArrays<char>(A_d, B_d, C_d, A_h, B_h, nullptr, false);
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipGraphDestroy(graph));
}
|
||||
|
||||
static void hipGraphLaunch_test() {
|
||||
constexpr size_t N = 1024;
|
||||
constexpr size_t Nbytes = N * sizeof(char);
|
||||
constexpr size_t val = 0;
|
||||
constexpr size_t updateVal = 1;
|
||||
char *A_d{nullptr}, *B_d{nullptr}, *C_d{nullptr};
|
||||
char *A_h{nullptr}, *B_h{nullptr};
|
||||
|
||||
HipTest::initArrays<char>(&A_d, &B_d, &C_d,
|
||||
&A_h, &B_h, nullptr, N, false);
|
||||
|
||||
hipGraph_t graph;
|
||||
hipGraphExec_t graphExec;
|
||||
hipStream_t streamForGraph;
|
||||
hipGraphNode_t memsetNode;
|
||||
|
||||
HIP_CHECK(hipGraphCreate(&graph, 0));
|
||||
HIP_CHECK(hipStreamCreate(&streamForGraph));
|
||||
|
||||
hipMemsetParams memsetParams{};
|
||||
memset(&memsetParams, 0, sizeof(memsetParams));
|
||||
memsetParams.dst = reinterpret_cast<void*>(C_d);
|
||||
memsetParams.value = val;
|
||||
memsetParams.pitch = 0;
|
||||
memsetParams.elementSize = sizeof(char);
|
||||
memsetParams.width = Nbytes;
|
||||
memsetParams.height = 1;
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0,
|
||||
&memsetParams));
|
||||
|
||||
std::vector<hipGraphNode_t> dependencies;
|
||||
dependencies.push_back(memsetNode);
|
||||
|
||||
memset(&memsetParams, 0, sizeof(memsetParams));
|
||||
memsetParams.dst = reinterpret_cast<void*>(A_d);
|
||||
memsetParams.value = updateVal;
|
||||
memsetParams.pitch = 0;
|
||||
memsetParams.elementSize = sizeof(char);
|
||||
memsetParams.width = Nbytes;
|
||||
memsetParams.height = 1;
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, dependencies.data(),
|
||||
dependencies.size(), &memsetParams));
|
||||
HIP_CHECK(hipGraphMemsetNodeSetParams(memsetNode, &memsetParams));
|
||||
dependencies.push_back(memsetNode);
|
||||
|
||||
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
||||
HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
|
||||
HIP_CHECK(hipStreamSynchronize(streamForGraph));
|
||||
|
||||
HIP_CHECK(hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
|
||||
// Validating the result
|
||||
for (size_t i = 0; i < Nbytes; i++) {
|
||||
if (A_h[i] != updateVal) {
|
||||
WARN("Validation failed at- " << i << " A_h[i] " << A_h[i]);
|
||||
REQUIRE(false);
|
||||
}
|
||||
}
|
||||
|
||||
HipTest::freeArrays<char>(A_d, B_d, C_d,
|
||||
A_h, B_h, nullptr, false);
|
||||
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
||||
HIP_CHECK(hipGraphDestroy(graph));
|
||||
HIP_CHECK(hipStreamDestroy(streamForGraph));
|
||||
}
|
||||
|
||||
// Runs hipGraphLaunch_test() once per visible device, selecting each device
// in turn with hipSetDevice.
TEST_CASE("Unit_hipGraphLaunch_Functional_multidevice_test") {
  int numDevices = 0;
  HIP_CHECK(hipGetDeviceCount(&numDevices));

  // Nothing to iterate over: report the skip and leave.
  if (numDevices <= 0) {
    SUCCEED("Skipped the testcase as there is no device to test.");
    return;
  }

  for (int dev = 0; dev < numDevices; ++dev) {
    HIP_CHECK(hipSetDevice(dev));
    hipGraphLaunch_test();
  }
}
|
||||
|
||||
// Function to fill input data
|
||||
static void fillRandInpData(int *A1_h, int *A2_h, size_t N) {
|
||||
unsigned int seed = time(nullptr);
|
||||
for (size_t i = 0; i < N; i++) {
|
||||
A1_h[i] = (HipTest::RAND_R(&seed) & 0xFF);
|
||||
A2_h[i] = (HipTest::RAND_R(&seed) & 0xFF);
|
||||
}
|
||||
}
|
||||
// Function to validate result
|
||||
static void validateOutData(int *A1_h, int *A2_h, size_t N) {
|
||||
for (size_t i = 0; i < N; i++) {
|
||||
int result = (A1_h[i]*A1_h[i]);
|
||||
REQUIRE(result == A2_h[i]);
|
||||
}
|
||||
}
|
||||
/*
|
||||
* 1.Create a graph with multiple nodes. Create an executable graph.
|
||||
* Launch the executable graph 3 times in stream simultaneously.
|
||||
* Wait for stream. Validate the output. No issues should be observed
|
||||
* 2.Create a graph with multiple nodes. Create an executable graph.
|
||||
* Verify that an executable graph can be launched on the null stream.
|
||||
*/
|
||||
// Builds a three-node graph (H2D copy -> vector_square kernel -> D2H copy),
// instantiates it once, and launches the same executable graph
// TEST_LOOP_SIZE times, either on a created stream or on the null stream,
// validating the output after each launch.
TEST_CASE("Unit_hipGraphLaunch_Functional_MultipleLaunch") {
  size_t memSize = SIZE;
  constexpr auto blocksPerCU = 6;  // to hide latency
  constexpr auto threadsPerBlock = 256;
  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, SIZE);
  hipGraph_t graph;
  std::vector<hipGraphNode_t> nodeDependencies;

  HIP_CHECK(hipGraphCreate(&graph, 0));
  int *A_h{nullptr}, *A_d{nullptr}, *C_d{nullptr}, *C_h{nullptr};

  HipTest::initArrays<int>(&A_d, &C_d, nullptr, &A_h, &C_h, nullptr, SIZE, false);

  hipGraphNode_t memcpyH2D, memcpyD2H, kernelNode;

  // Create memcpy H2D nodes
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph, nullptr, 0, A_d, A_h,
                                    (sizeof(int) * SIZE), hipMemcpyHostToDevice));
  nodeDependencies.push_back(memcpyH2D);

  // Creating kernel node. Value-initialize the parameter struct, as the other
  // kernel-node tests in this file do, so no field is ever left indeterminate.
  hipKernelNodeParams kerNodeParams{};
  void* kernelArgs[] = {reinterpret_cast<void*>(&A_d), reinterpret_cast<void*>(&C_d),
                        reinterpret_cast<void*>(&memSize)};
  kerNodeParams.func = reinterpret_cast<void*>(HipTest::vector_square<int>);
  kerNodeParams.gridDim = dim3(blocks);
  kerNodeParams.blockDim = dim3(threadsPerBlock);
  kerNodeParams.sharedMemBytes = 0;
  kerNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
  kerNodeParams.extra = nullptr;
  HIP_CHECK(hipGraphAddKernelNode(&kernelNode, graph, nodeDependencies.data(),
                                  nodeDependencies.size(), &kerNodeParams));
  nodeDependencies.clear();
  nodeDependencies.push_back(kernelNode);

  // Create memcpy D2H nodes
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H, graph, nodeDependencies.data(),
                                    nodeDependencies.size(), C_h, C_d, (sizeof(int) * SIZE),
                                    hipMemcpyDeviceToHost));
  nodeDependencies.clear();

  // Create executable graph
  hipStream_t streamForGraph;
  hipGraphExec_t graphExec{nullptr};
  HIP_CHECK(hipStreamCreate(&streamForGraph));
  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));

  // Execute graph
  SECTION("Multiple Graph Launch") {
    for (int iter = 0; iter < TEST_LOOP_SIZE; iter++) {
      // Fresh input on every iteration, so a stale result cannot pass.
      fillRandInpData(A_h, C_h, SIZE);
      HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
      HIP_CHECK(hipStreamSynchronize(streamForGraph));
      validateOutData(A_h, C_h, SIZE);
    }
  }
  SECTION("Graph launch on Null stream") {
    for (int iter = 0; iter < TEST_LOOP_SIZE; iter++) {
      fillRandInpData(A_h, C_h, SIZE);
      // The null stream is spelled nullptr rather than the literal 0.
      HIP_CHECK(hipGraphLaunch(graphExec, nullptr));
      HIP_CHECK(hipStreamSynchronize(nullptr));
      validateOutData(A_h, C_h, SIZE);
    }
  }

  HIP_CHECK(hipGraphDestroy(graph));
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipStreamDestroy(streamForGraph));

  // Free
  HipTest::freeArrays<int>(A_d, C_d, nullptr, A_h, C_h, nullptr, false);
}
|
||||
|
||||
@@ -0,0 +1,412 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip_test_checkers.hh>
|
||||
#include <hip_test_kernels.hh>
|
||||
/* Test verifies hipGraphLaunch API
|
||||
Negative scenarios -
|
||||
1) Pass graphExec as nullptr and verify api returns error code.
|
||||
2) Pass pGraphExec as nullptr and stream as hipStreamPerThread and verify api returns error code.
|
||||
3) Pass pGraphExec as empty object and verify api returns error code.
|
||||
4) Destroy the executable graph and try to launch it. Make sure the api does not crash and returns an error code.
|
||||
5) Destroy the stream and try to launch the respective executable graph. Make sure the api does not crash and returns an error code.
|
||||
6) Destroy actual graph created and try to launch respective executable graph.
|
||||
Check api should execute properly without crash or error code.
|
||||
Functional Scenario -
|
||||
1) Check basic functionality with stream as hipStreamPerThread
|
||||
2) Test hipGraphLaunch call on multiple devices.
|
||||
3) Create a graph with multiple nodes. Create an executable graph.
|
||||
Launch the executable graph 3 times in stream simultaneously.
|
||||
Wait for stream. Validate the output. No issues should be observed
|
||||
4) Create a graph with multiple nodes. Create an executable graph.
|
||||
Verify that an executable graph can be launched on the null stream.
|
||||
*/
|
||||
|
||||
#define SIZE 1024
|
||||
#define TEST_LOOP_SIZE 3
|
||||
|
||||
TEST_CASE("Unit_hipGraphLaunch_Negative") {
|
||||
hipError_t ret;
|
||||
SECTION("Pass pGraphExec as nullptr") {
|
||||
hipStream_t stream{};
|
||||
ret = hipGraphLaunch(nullptr, stream);
|
||||
REQUIRE(hipErrorInvalidValue == ret);
|
||||
}
|
||||
SECTION("Pass pGraphExec as nullptr and stream as hipStreamPerThread") {
|
||||
ret = hipGraphLaunch(nullptr, hipStreamPerThread);
|
||||
REQUIRE(hipErrorInvalidValue == ret);
|
||||
}
|
||||
SECTION("Pass pGraphExec as empty object") {
|
||||
hipGraphExec_t graphExec{};
|
||||
hipStream_t stream{};
|
||||
ret = hipGraphLaunch(graphExec, stream);
|
||||
REQUIRE(hipErrorInvalidValue == ret);
|
||||
}
|
||||
SECTION("Destroy executable graph and try to launch it") {
|
||||
constexpr size_t Nbytes = 1024;
|
||||
hipGraph_t graph;
|
||||
hipGraphExec_t graphExec;
|
||||
hipStream_t stream;
|
||||
hipGraphNode_t memsetNode;
|
||||
|
||||
char *devData;
|
||||
HIP_CHECK(hipMalloc(&devData, Nbytes));
|
||||
|
||||
HIP_CHECK(hipGraphCreate(&graph, 0));
|
||||
HIP_CHECK(hipStreamCreate(&stream));
|
||||
|
||||
hipMemsetParams memsetParams{};
|
||||
memset(&memsetParams, 0, sizeof(memsetParams));
|
||||
memsetParams.dst = reinterpret_cast<void*>(devData);
|
||||
memsetParams.value = 0;
|
||||
memsetParams.pitch = 0;
|
||||
memsetParams.elementSize = sizeof(char);
|
||||
memsetParams.width = Nbytes;
|
||||
memsetParams.height = 1;
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0,
|
||||
&memsetParams));
|
||||
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
||||
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
|
||||
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
||||
// Launch again after destroy graph exec object.
|
||||
ret = hipGraphLaunch(graphExec, stream);
|
||||
REQUIRE(hipErrorInvalidValue == ret);
|
||||
|
||||
HIP_CHECK(hipFree(devData));
|
||||
HIP_CHECK(hipGraphDestroy(graph));
|
||||
HIP_CHECK(hipStreamDestroy(stream));
|
||||
}
|
||||
/* In this case in CUDA setup this api call is giving - unknown error (999)
|
||||
So enabling this test for both AMD and CUDA by checking with hipSuccess */
|
||||
SECTION("Destroy stream and try to launch respective executable graph") {
|
||||
constexpr size_t Nbytes = 1024;
|
||||
hipGraph_t graph;
|
||||
hipGraphExec_t graphExec;
|
||||
hipStream_t stream;
|
||||
hipGraphNode_t memsetNode;
|
||||
|
||||
char *devData;
|
||||
HIP_CHECK(hipMalloc(&devData, Nbytes));
|
||||
|
||||
HIP_CHECK(hipGraphCreate(&graph, 0));
|
||||
HIP_CHECK(hipStreamCreate(&stream));
|
||||
|
||||
hipMemsetParams memsetParams{};
|
||||
memset(&memsetParams, 0, sizeof(memsetParams));
|
||||
memsetParams.dst = reinterpret_cast<void*>(devData);
|
||||
memsetParams.value = 0;
|
||||
memsetParams.pitch = 0;
|
||||
memsetParams.elementSize = sizeof(char);
|
||||
memsetParams.width = Nbytes;
|
||||
memsetParams.height = 1;
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0,
|
||||
&memsetParams));
|
||||
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
||||
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
|
||||
HIP_CHECK(hipStreamDestroy(stream));
|
||||
// Launch again after destroy stream
|
||||
ret = hipGraphLaunch(graphExec, stream);
|
||||
REQUIRE(hipSuccess != ret);
|
||||
|
||||
HIP_CHECK(hipFree(devData));
|
||||
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
||||
HIP_CHECK(hipGraphDestroy(graph));
|
||||
}
|
||||
SECTION("Destroy graph and try to launch respective executable graph") {
|
||||
constexpr size_t Nbytes = 1024;
|
||||
hipGraph_t graph;
|
||||
hipGraphExec_t graphExec;
|
||||
hipStream_t stream;
|
||||
hipGraphNode_t memsetNode;
|
||||
|
||||
char *devData;
|
||||
HIP_CHECK(hipMalloc(&devData, Nbytes));
|
||||
|
||||
HIP_CHECK(hipGraphCreate(&graph, 0));
|
||||
HIP_CHECK(hipStreamCreate(&stream));
|
||||
|
||||
hipMemsetParams memsetParams{};
|
||||
memset(&memsetParams, 0, sizeof(memsetParams));
|
||||
memsetParams.dst = reinterpret_cast<void*>(devData);
|
||||
memsetParams.value = 0;
|
||||
memsetParams.pitch = 0;
|
||||
memsetParams.elementSize = sizeof(char);
|
||||
memsetParams.width = Nbytes;
|
||||
memsetParams.height = 1;
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0,
|
||||
&memsetParams));
|
||||
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
||||
HIP_CHECK(hipGraphLaunch(graphExec, stream));
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
|
||||
HIP_CHECK(hipGraphDestroy(graph));
|
||||
// Launch again after destroy graph
|
||||
ret = hipGraphLaunch(graphExec, stream);
|
||||
REQUIRE(hipSuccess == ret);
|
||||
|
||||
HIP_CHECK(hipFree(devData));
|
||||
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
||||
HIP_CHECK(hipStreamDestroy(stream));
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Functional check of hipGraphLaunch on hipStreamPerThread.
 *
 * Builds a graph of two chained memset nodes (C_d is set to `val`, then A_d is
 * set to `updateVal`), instantiates it, launches it on hipStreamPerThread and
 * verifies on the host that every byte copied back from A_d equals `updateVal`.
 */
TEST_CASE("Unit_hipGraphLaunch_Functional_hipStreamPerThread") {
  constexpr size_t N = 1024;
  constexpr size_t Nbytes = N * sizeof(char);
  constexpr size_t val = 0;
  constexpr size_t updateVal = 2;
  char *A_d{nullptr}, *B_d{nullptr}, *C_d{nullptr};
  char *A_h{nullptr}, *B_h{nullptr};

  HipTest::initArrays<char>(&A_d, &B_d, &C_d,
                            &A_h, &B_h, nullptr, N, false);

  hipGraph_t graph;
  hipGraphExec_t graphExec;
  hipGraphNode_t memsetNode;

  HIP_CHECK(hipGraphCreate(&graph, 0));

  // First node: memset C_d to `val`. It has no dependencies.
  hipMemsetParams memsetParams{};
  memset(&memsetParams, 0, sizeof(memsetParams));
  memsetParams.dst = reinterpret_cast<void*>(C_d);
  memsetParams.value = val;
  memsetParams.pitch = 0;
  memsetParams.elementSize = sizeof(char);
  memsetParams.width = Nbytes;
  memsetParams.height = 1;
  HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0,
                                  &memsetParams));

  std::vector<hipGraphNode_t> dependencies;
  dependencies.push_back(memsetNode);

  // Second node: memset A_d to `updateVal`, ordered after the first node.
  memset(&memsetParams, 0, sizeof(memsetParams));
  memsetParams.dst = reinterpret_cast<void*>(A_d);
  memsetParams.value = updateVal;
  memsetParams.pitch = 0;
  memsetParams.elementSize = sizeof(char);
  memsetParams.width = Nbytes;
  memsetParams.height = 1;
  HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, dependencies.data(),
                                  dependencies.size(), &memsetParams));
  // Re-applies the same parameters through the setter on the node just added.
  HIP_CHECK(hipGraphMemsetNodeSetParams(memsetNode, &memsetParams));

  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
  HIP_CHECK(hipGraphLaunch(graphExec, hipStreamPerThread));
  HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));

  HIP_CHECK(hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost));

  // Validating the result
  for (size_t i = 0; i < Nbytes; i++) {
    if (A_h[i] != updateVal) {
      // Report the byte as a number: streaming the raw char would print an
      // unreadable control character for small values.
      WARN("Validation failed at- " << i << " A_h[i] " << static_cast<int>(A_h[i]));
      REQUIRE(false);
    }
  }

  HipTest::freeArrays<char>(A_d, B_d, C_d,
                            A_h, B_h, nullptr, false);
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipGraphDestroy(graph));
}
|
||||
|
||||
static void hipGraphLaunch_test() {
|
||||
constexpr size_t N = 1024;
|
||||
constexpr size_t Nbytes = N * sizeof(char);
|
||||
constexpr size_t val = 0;
|
||||
constexpr size_t updateVal = 1;
|
||||
char *A_d{nullptr}, *B_d{nullptr}, *C_d{nullptr};
|
||||
char *A_h{nullptr}, *B_h{nullptr};
|
||||
|
||||
HipTest::initArrays<char>(&A_d, &B_d, &C_d,
|
||||
&A_h, &B_h, nullptr, N, false);
|
||||
|
||||
hipGraph_t graph;
|
||||
hipGraphExec_t graphExec;
|
||||
hipStream_t streamForGraph;
|
||||
hipGraphNode_t memsetNode;
|
||||
|
||||
HIP_CHECK(hipGraphCreate(&graph, 0));
|
||||
HIP_CHECK(hipStreamCreate(&streamForGraph));
|
||||
|
||||
hipMemsetParams memsetParams{};
|
||||
memset(&memsetParams, 0, sizeof(memsetParams));
|
||||
memsetParams.dst = reinterpret_cast<void*>(C_d);
|
||||
memsetParams.value = val;
|
||||
memsetParams.pitch = 0;
|
||||
memsetParams.elementSize = sizeof(char);
|
||||
memsetParams.width = Nbytes;
|
||||
memsetParams.height = 1;
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0,
|
||||
&memsetParams));
|
||||
|
||||
std::vector<hipGraphNode_t> dependencies;
|
||||
dependencies.push_back(memsetNode);
|
||||
|
||||
memset(&memsetParams, 0, sizeof(memsetParams));
|
||||
memsetParams.dst = reinterpret_cast<void*>(A_d);
|
||||
memsetParams.value = updateVal;
|
||||
memsetParams.pitch = 0;
|
||||
memsetParams.elementSize = sizeof(char);
|
||||
memsetParams.width = Nbytes;
|
||||
memsetParams.height = 1;
|
||||
HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, dependencies.data(),
|
||||
dependencies.size(), &memsetParams));
|
||||
HIP_CHECK(hipGraphMemsetNodeSetParams(memsetNode, &memsetParams));
|
||||
dependencies.push_back(memsetNode);
|
||||
|
||||
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
||||
HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
|
||||
HIP_CHECK(hipStreamSynchronize(streamForGraph));
|
||||
|
||||
HIP_CHECK(hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
|
||||
// Validating the result
|
||||
for (size_t i = 0; i < Nbytes; i++) {
|
||||
if (A_h[i] != updateVal) {
|
||||
WARN("Validation failed at- " << i << " A_h[i] " << A_h[i]);
|
||||
REQUIRE(false);
|
||||
}
|
||||
}
|
||||
|
||||
HipTest::freeArrays<char>(A_d, B_d, C_d,
|
||||
A_h, B_h, nullptr, false);
|
||||
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
||||
HIP_CHECK(hipGraphDestroy(graph));
|
||||
HIP_CHECK(hipStreamDestroy(streamForGraph));
|
||||
}
|
||||
|
||||
/*
 * Runs hipGraphLaunch_test() once on every visible device.
 *
 * The device that was current on entry is restored afterwards so that test
 * cases executed later in the same process are not silently moved to the last
 * enumerated device.
 */
TEST_CASE("Unit_hipGraphLaunch_Functional_multidevice_test") {
  int numDevices = 0;
  HIP_CHECK(hipGetDeviceCount(&numDevices));

  if (numDevices <= 0) {
    SUCCEED("Skipped the testcase as there is no device to test.");
    return;
  }

  int originalDevice = 0;
  HIP_CHECK(hipGetDevice(&originalDevice));

  for (int i = 0; i < numDevices; i++) {
    HIP_CHECK(hipSetDevice(i));
    hipGraphLaunch_test();
  }

  HIP_CHECK(hipSetDevice(originalDevice));
}
|
||||
|
||||
// Function to fill input data
|
||||
static void fillRandInpData(int *A1_h, int *A2_h, size_t N) {
|
||||
unsigned int seed = time(nullptr);
|
||||
for (size_t i = 0; i < N; i++) {
|
||||
A1_h[i] = (HipTest::RAND_R(&seed) & 0xFF);
|
||||
A2_h[i] = (HipTest::RAND_R(&seed) & 0xFF);
|
||||
}
|
||||
}
|
||||
// Function to validate result
|
||||
static void validateOutData(int *A1_h, int *A2_h, size_t N) {
|
||||
for (size_t i = 0; i < N; i++) {
|
||||
int result = (A1_h[i]*A1_h[i]);
|
||||
REQUIRE(result == A2_h[i]);
|
||||
}
|
||||
}
|
||||
/*
|
||||
* 1.Create a graph with multiple nodes. Create an executable graph.
|
||||
* Launch the executable graph 3 times in stream simultaneously.
|
||||
* Wait for stream. Validate the output. No issues should be observed
|
||||
* 2.Create a graph with multiple nodes. Create an executable graph.
|
||||
* Verify if an executable graph be launched on null stream.
|
||||
*/
|
||||
TEST_CASE("Unit_hipGraphLaunch_Functional_MultipleLaunch") {
|
||||
size_t memSize = SIZE;
|
||||
constexpr auto blocksPerCU = 6; // to hide latency
|
||||
constexpr auto threadsPerBlock = 256;
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU,
|
||||
threadsPerBlock, SIZE);
|
||||
hipGraph_t graph;
|
||||
std::vector<hipGraphNode_t> nodeDependencies;
|
||||
|
||||
HIP_CHECK(hipGraphCreate(&graph, 0));
|
||||
int *A_h{nullptr}, *A_d{nullptr}, *C_d{nullptr}, *C_h{nullptr};
|
||||
|
||||
HipTest::initArrays<int>(&A_d, &C_d, nullptr,
|
||||
&A_h, &C_h, nullptr, SIZE, false);
|
||||
|
||||
hipGraphNode_t memcpyH2D, memcpyD2H, kernelNode;
|
||||
|
||||
// Create memcpy H2D nodes
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph, nullptr,
|
||||
0, A_d, A_h, (sizeof(int)*SIZE), hipMemcpyHostToDevice));
|
||||
nodeDependencies.push_back(memcpyH2D);
|
||||
// Creating kernel node
|
||||
hipKernelNodeParams kerNodeParams;
|
||||
void* kernelArgs[] = {reinterpret_cast<void*>(&A_d),
|
||||
reinterpret_cast<void*>(&C_d),
|
||||
reinterpret_cast<void*>(&memSize)};
|
||||
kerNodeParams.func = reinterpret_cast<void*>(HipTest::vector_square<int>);
|
||||
kerNodeParams.gridDim = dim3(blocks);
|
||||
kerNodeParams.blockDim = dim3(threadsPerBlock);
|
||||
kerNodeParams.sharedMemBytes = 0;
|
||||
kerNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
|
||||
kerNodeParams.extra = nullptr;
|
||||
HIP_CHECK(hipGraphAddKernelNode(&kernelNode, graph, nodeDependencies.data(),
|
||||
nodeDependencies.size(), &kerNodeParams));
|
||||
nodeDependencies.clear();
|
||||
nodeDependencies.push_back(kernelNode);
|
||||
|
||||
// Create memcpy D2H nodes
|
||||
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H, graph, nodeDependencies.data(),
|
||||
nodeDependencies.size(), C_h, C_d, (sizeof(int)*SIZE),
|
||||
hipMemcpyDeviceToHost));
|
||||
nodeDependencies.clear();
|
||||
|
||||
// Create executable graph
|
||||
hipStream_t streamForGraph;
|
||||
hipGraphExec_t graphExec{nullptr};
|
||||
HIP_CHECK(hipStreamCreate(&streamForGraph));
|
||||
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr,
|
||||
nullptr, 0));
|
||||
// Execute graph
|
||||
SECTION("Multiple Graph Launch") {
|
||||
for (int iter = 0; iter < TEST_LOOP_SIZE; iter++) {
|
||||
fillRandInpData(A_h, C_h, SIZE);
|
||||
HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
|
||||
HIP_CHECK(hipStreamSynchronize(streamForGraph));
|
||||
validateOutData(A_h, C_h, SIZE);
|
||||
}
|
||||
}
|
||||
SECTION("Graph launch on Null stream") {
|
||||
for (int iter = 0; iter < TEST_LOOP_SIZE; iter++) {
|
||||
fillRandInpData(A_h, C_h, SIZE);
|
||||
HIP_CHECK(hipGraphLaunch(graphExec, 0));
|
||||
HIP_CHECK(hipStreamSynchronize(0));
|
||||
validateOutData(A_h, C_h, SIZE);
|
||||
}
|
||||
}
|
||||
|
||||
HIP_CHECK(hipGraphDestroy(graph));
|
||||
HIP_CHECK(hipGraphExecDestroy(graphExec));
|
||||
HIP_CHECK(hipStreamDestroy(streamForGraph));
|
||||
|
||||
// Free
|
||||
HipTest::freeArrays<int>(A_d, C_d, nullptr, A_h, C_h, nullptr, false);
|
||||
}
|
||||
@@ -261,9 +261,10 @@ TEST_CASE("Unit_hipGraphUpload_Functional_With_Priority_Stream") {
|
||||
1) Pass graphExec node as nullptr.
|
||||
2) Pass graphExec node as an uninitialized object
|
||||
3) Pass stream as an uninitialized object
|
||||
4) Graphexec is destroyed before upload
|
||||
*/
|
||||
|
||||
TEST_CASE("Unit_hipGraphUpload_Negative_Argument_Check") {
|
||||
TEST_CASE("Unit_hipGraphUpload_Negative_Parameters") {
|
||||
hipGraphExec_t graphExec{};
|
||||
hipError_t ret;
|
||||
|
||||
@@ -271,21 +272,30 @@ TEST_CASE("Unit_hipGraphUpload_Negative_Argument_Check") {
|
||||
HIP_CHECK(hipStreamCreate(&stream));
|
||||
|
||||
SECTION("Pass graphExec node as nullptr") {
|
||||
ret = hipGraphUpload(nullptr, stream);
|
||||
REQUIRE(hipErrorInvalidValue == ret);
|
||||
HIP_CHECK_ERROR(hipGraphUpload(nullptr, stream), hipErrorInvalidValue);
|
||||
}
|
||||
SECTION("Pass graphExec node as uninitialize object") {
|
||||
ret = hipGraphUpload(graphExec, stream);
|
||||
REQUIRE(hipErrorInvalidValue == ret);
|
||||
HIP_CHECK_ERROR(hipGraphUpload(graphExec, stream), hipErrorInvalidValue);
|
||||
}
|
||||
SECTION("Pass stream as uninitialize object") {
|
||||
hipStream_t stream1{};
|
||||
hipGraph_t graph;
|
||||
HIP_CHECK(hipGraphCreate(&graph, 0));
|
||||
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
|
||||
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
|
||||
|
||||
ret = hipGraphUpload(graphExec, stream1);
|
||||
REQUIRE(hipSuccess == ret);
|
||||
}
|
||||
SECTION("graphExec is destroyed"){
|
||||
hipGraphExec_t graph_exec;
|
||||
hipGraph_t graph;
|
||||
|
||||
HIP_CHECK(hipGraphCreate(&graph, 0));
|
||||
HIP_CHECK(hipGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0));
|
||||
|
||||
HIP_CHECK(hipGraphUpload(graph_exec, hipStreamPerThread));
|
||||
HIP_CHECK(hipGraphExecDestroy(graph_exec));
|
||||
HIP_CHECK_ERROR(hipGraphUpload(graph_exec, hipStreamPerThread), hipErrorInvalidValue);
|
||||
}
|
||||
HIP_CHECK(hipStreamDestroy(stream));
|
||||
}
|
||||
|
||||
@@ -4,9 +4,23 @@ set(TEST_SRC
|
||||
hipOccupancyMaxActiveBlocksPerMultiprocessor_old.cc
|
||||
hipOccupancyMaxPotentialBlockSize.cc
|
||||
hipOccupancyMaxPotentialBlockSize_old.cc
|
||||
hipModuleOccupancyMaxPotentialBlockSize.cc
|
||||
hipModuleOccupancyMaxPotentialBlockSizeWithFlags.cc
|
||||
hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.cc
|
||||
hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.cc
|
||||
hipOccupancyMaxPotentialBlockSizeVariableSMemWithFlags.cc
|
||||
)
|
||||
|
||||
# Compile simple_kernel.cc into the code object that the module-occupancy tests
# load at run time with hipModuleLoad("simple_kernel.code"). The -o path is
# spelled out in full so it always matches OUTPUT, independent of the working
# directory the command runs in.
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/simple_kernel.code
                   COMMAND ${CMAKE_CXX_COMPILER} --genco --std=c++17
                           ${CMAKE_CURRENT_SOURCE_DIR}/simple_kernel.cc
                           -o ${CMAKE_CURRENT_BINARY_DIR}/simple_kernel.code
                           --rocm-path=${ROCM_PATH}
                   DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/simple_kernel.cc
                   COMMENT "Generating simple_kernel.code")

# Always-built target that drives the custom command above.
add_custom_target(simple_kernel ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/simple_kernel.code)

hip_add_exe_to_target(NAME OccupancyTest
                      TEST_SRC ${TEST_SRC}
                      TEST_TARGET_NAME build_tests)

# Make sure the code object exists before the test executable is built.
add_dependencies(OccupancyTest simple_kernel)
|
||||
|
||||
+92
@@ -0,0 +1,92 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
/*
|
||||
Testcase Scenarios :
|
||||
Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_Positive_RangeValidation - Test correct
|
||||
execution of hipModuleOccupancyMaxActiveBlocksPerMultiprocessor for different parameter values
|
||||
Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_Negative_Parameters - Test unsuccessful
|
||||
execution of hipModuleOccupancyMaxActiveBlocksPerMultiprocessor api when parameters are invalid
|
||||
*/
|
||||
#include "occupancy_common.hh"
|
||||
|
||||
/*
 * Negative-parameter test for hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.
 *
 * Loads SimpleKernel from simple_kernel.code, queries a potential block size
 * for it and hands a wrapper around the API to the shared
 * MaxActiveBlocksPerMultiprocessorNegative checks.
 */
TEST_CASE("Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_Negative_Parameters") {
  hipModule_t module;
  hipFunction_t function;
  int blockSize = 0;
  int gridSize = 0;

  // NOTE(review): presumably issued to initialize the runtime before the
  // module API is used - confirm.
  HIP_CHECK(hipFree(nullptr));

  HIP_CHECK(hipModuleLoad(&module, "simple_kernel.code"));
  HIP_CHECK(hipModuleGetFunction(&function, module, "SimpleKernel"));

  // Get potential blocksize
  HIP_CHECK(hipModuleOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, function, 0, 0));

  // Common negative tests. The lambda parameters are named differently from
  // the locals above so they do not shadow them.
  MaxActiveBlocksPerMultiprocessorNegative(
      [&function](int* numBlocks, int blkSize, size_t dynSharedMemPerBlk) {
        return hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, function, blkSize,
                                                                  dynSharedMemPerBlk);
      },
      blockSize);

  HIP_CHECK(hipModuleUnload(module));
}
|
||||
|
||||
/*
 * Range validation for hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.
 *
 * Loads SimpleKernel from simple_kernel.code and runs the shared
 * MaxActiveBlocksPerMultiprocessor check with no dynamic shared memory and
 * with dynamic shared memory equal to the device's sharedMemPerBlock.
 */
TEST_CASE("Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_Positive_RangeValidation") {
  hipDeviceProp_t devProp;
  hipModule_t module;
  hipFunction_t function;
  int blockSize = 0;
  int gridSize = 0;

  // NOTE(review): presumably issued to initialize the runtime before the
  // module API is used - confirm.
  HIP_CHECK(hipFree(nullptr));

  HIP_CHECK(hipModuleLoad(&module, "simple_kernel.code"));
  HIP_CHECK(hipModuleGetFunction(&function, module, "SimpleKernel"));

  HIP_CHECK(hipGetDeviceProperties(&devProp, 0));

  SECTION("dynSharedMemPerBlk = 0") {
    // Get potential blocksize
    HIP_CHECK(hipModuleOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, function, 0, 0));

    MaxActiveBlocksPerMultiprocessor(
        [blockSize, &function](int* numBlocks) {
          return hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, function, blockSize,
                                                                    0);
        },
        blockSize, devProp.maxThreadsPerMultiProcessor);
  }
  SECTION("dynSharedMemPerBlk = sharedMemPerBlock") {
    // Capture only the value that is needed instead of copying all of devProp
    // into the closure.
    const size_t sharedMem = devProp.sharedMemPerBlock;

    // Get potential blocksize
    HIP_CHECK(
        hipModuleOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, function, sharedMem, 0));

    MaxActiveBlocksPerMultiprocessor(
        [blockSize, sharedMem, &function](int* numBlocks) {
          return hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, function, blockSize,
                                                                    sharedMem);
        },
        blockSize, devProp.maxThreadsPerMultiProcessor);
  }

  HIP_CHECK(hipModuleUnload(module));
}
|
||||
+103
@@ -0,0 +1,103 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
/*
|
||||
Testcase Scenarios :
|
||||
Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_Positive_RangeValidation - Test
|
||||
correct execution of hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags for different
|
||||
parameter values
|
||||
Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_Negative_Parameters - Test
|
||||
unsuccessful execution of hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags api when
|
||||
parameters are invalid
|
||||
*/
|
||||
#include "occupancy_common.hh"
|
||||
|
||||
/*
 * Negative-parameter test for
 * hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.
 *
 * Loads SimpleKernel from simple_kernel.code, runs the shared
 * MaxActiveBlocksPerMultiprocessorNegative checks through a wrapper that passes
 * hipOccupancyDefault, and additionally expects hipErrorInvalidValue for a
 * flag value of 2.
 */
TEST_CASE("Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_Negative_Parameters") {
  hipModule_t module;
  hipFunction_t function;
  int numBlocks = 0;
  int blockSize = 0;
  int gridSize = 0;

  // NOTE(review): presumably issued to initialize the runtime before the
  // module API is used - confirm.
  HIP_CHECK(hipFree(nullptr));

  HIP_CHECK(hipModuleLoad(&module, "simple_kernel.code"));
  HIP_CHECK(hipModuleGetFunction(&function, module, "SimpleKernel"));

  // Get potential blocksize
  HIP_CHECK(hipModuleOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, function, 0, 0));

  // Common negative tests. The lambda parameters are named differently from
  // the locals above so they do not shadow them.
  MaxActiveBlocksPerMultiprocessorNegative(
      [&function](int* outBlocks, int blkSize, size_t dynSharedMemPerBlk) {
        return hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
            outBlocks, function, blkSize, dynSharedMemPerBlk, hipOccupancyDefault);
      },
      blockSize);

  SECTION("Flag is invalid") {
    // Only default flag is supported
    HIP_CHECK_ERROR(hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
                        &numBlocks, function, blockSize, 0, 2),
                    hipErrorInvalidValue);
  }

  HIP_CHECK(hipModuleUnload(module));
}
|
||||
|
||||
/*
 * Range validation for
 * hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.
 *
 * Loads SimpleKernel from simple_kernel.code and runs the shared
 * MaxActiveBlocksPerMultiprocessor check with hipOccupancyDefault, once with no
 * dynamic shared memory and once with the device's sharedMemPerBlock.
 */
TEST_CASE(
    "Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_Positive_RangeValidation") {
  hipDeviceProp_t devProp;
  hipModule_t module;
  hipFunction_t function;
  int blockSize = 0;
  int gridSize = 0;

  // NOTE(review): presumably issued to initialize the runtime before the
  // module API is used - confirm.
  HIP_CHECK(hipFree(nullptr));

  HIP_CHECK(hipModuleLoad(&module, "simple_kernel.code"));
  HIP_CHECK(hipModuleGetFunction(&function, module, "SimpleKernel"));

  HIP_CHECK(hipGetDeviceProperties(&devProp, 0));

  SECTION("dynSharedMemPerBlk = 0") {
    // Get potential blocksize
    HIP_CHECK(hipModuleOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, function, 0, 0));

    MaxActiveBlocksPerMultiprocessor(
        [blockSize, &function](int* numBlocks) {
          return hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
              numBlocks, function, blockSize, 0, hipOccupancyDefault);
        },
        blockSize, devProp.maxThreadsPerMultiProcessor);
  }
  SECTION("dynSharedMemPerBlk = sharedMemPerBlock") {
    // Capture only the value that is needed instead of copying all of devProp
    // into the closure.
    const size_t sharedMem = devProp.sharedMemPerBlock;

    // Get potential blocksize
    HIP_CHECK(
        hipModuleOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, function, sharedMem, 0));

    MaxActiveBlocksPerMultiprocessor(
        [blockSize, sharedMem, &function](int* numBlocks) {
          return hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
              numBlocks, function, blockSize, sharedMem, hipOccupancyDefault);
        },
        blockSize, devProp.maxThreadsPerMultiProcessor);
  }

  HIP_CHECK(hipModuleUnload(module));
}
|
||||
@@ -0,0 +1,75 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
/*
|
||||
Testcase Scenarios :
|
||||
Unit_hipModuleOccupancyMaxPotentialBlockSize_Positive_RangeValidation - Test correct execution of
|
||||
hipModuleOccupancyMaxPotentialBlockSize for different parameter values
|
||||
Unit_hipModuleOccupancyMaxPotentialBlockSize_Negative_Parameters - Test unsuccessful execution of
|
||||
hipModuleOccupancyMaxPotentialBlockSize api when parameters are invalid
|
||||
*/
|
||||
#include "occupancy_common.hh"
|
||||
|
||||
/*
 * Negative-parameter test for hipModuleOccupancyMaxPotentialBlockSize.
 *
 * Loads SimpleKernel from simple_kernel.code and hands a wrapper around the
 * API to the shared MaxPotentialBlockSizeNegative checks.
 */
TEST_CASE("Unit_hipModuleOccupancyMaxPotentialBlockSize_Negative_Parameters") {
  hipModule_t module;
  hipFunction_t function;

  // NOTE(review): presumably issued to initialize the runtime before the
  // module API is used - confirm.
  HIP_CHECK(hipFree(nullptr));

  HIP_CHECK(hipModuleLoad(&module, "simple_kernel.code"));
  HIP_CHECK(hipModuleGetFunction(&function, module, "SimpleKernel"));

  // Common negative tests
  MaxPotentialBlockSizeNegative([&function](int* gridSize, int* blockSize) {
    return hipModuleOccupancyMaxPotentialBlockSize(gridSize, blockSize, function, 0, 0);
  });

  HIP_CHECK(hipModuleUnload(module));
}
|
||||
|
||||
/*
 * Range validation for hipModuleOccupancyMaxPotentialBlockSize.
 *
 * Loads SimpleKernel from simple_kernel.code and runs the shared
 * MaxPotentialBlockSize check, once with zero shared memory and no block size
 * limit, and once with the device's sharedMemPerBlock and maxThreadsPerBlock.
 */
TEST_CASE("Unit_hipModuleOccupancyMaxPotentialBlockSize_Positive_RangeValidation") {
  hipDeviceProp_t devProp;
  hipModule_t module;
  hipFunction_t function;

  // NOTE(review): presumably issued to initialize the runtime before the
  // module API is used - confirm.
  HIP_CHECK(hipFree(nullptr));

  HIP_CHECK(hipModuleLoad(&module, "simple_kernel.code"));
  HIP_CHECK(hipModuleGetFunction(&function, module, "SimpleKernel"));

  HIP_CHECK(hipGetDeviceProperties(&devProp, 0));

  SECTION("dynSharedMemPerBlk = 0, blockSizeLimit = 0") {
    MaxPotentialBlockSize(
        [&function](int* gridSize, int* blockSize) {
          return hipModuleOccupancyMaxPotentialBlockSize(gridSize, blockSize, function, 0, 0);
        },
        devProp.maxThreadsPerBlock);
  }

  SECTION("dynSharedMemPerBlk = sharedMemPerBlock, blockSizeLimit = maxThreadsPerBlock") {
    // Capture only the two values that are needed instead of copying all of
    // devProp into the closure.
    const size_t sharedMem = devProp.sharedMemPerBlock;
    const int blockSizeLimit = devProp.maxThreadsPerBlock;

    MaxPotentialBlockSize(
        [&function, sharedMem, blockSizeLimit](int* gridSize, int* blockSize) {
          return hipModuleOccupancyMaxPotentialBlockSize(gridSize, blockSize, function, sharedMem,
                                                         blockSizeLimit);
        },
        devProp.maxThreadsPerBlock);
  }

  HIP_CHECK(hipModuleUnload(module));
}
|
||||
+87
@@ -0,0 +1,87 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
/*
|
||||
Testcase Scenarios :
|
||||
Unit_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_Positive_RangeValidation - Test correct
|
||||
execution of hipModuleOccupancyMaxPotentialBlockSizeWithFlags for different parameter values
|
||||
Unit_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_Negative_Parameters - Test unsuccessful
|
||||
execution of hipModuleOccupancyMaxPotentialBlockSizeWithFlags api when parameters are invalid
|
||||
*/
|
||||
#include "occupancy_common.hh"
|
||||
|
||||
/*
 * Negative-parameter test for hipModuleOccupancyMaxPotentialBlockSizeWithFlags.
 *
 * Loads SimpleKernel from simple_kernel.code, runs the shared
 * MaxPotentialBlockSizeNegative checks through a wrapper that passes
 * hipOccupancyDefault, and additionally expects hipErrorInvalidValue for a
 * flag value of 2.
 */
TEST_CASE("Unit_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_Negative_Parameters") {
  hipModule_t module;
  hipFunction_t function;
  int blockSize = 0;
  int gridSize = 0;

  // NOTE(review): presumably issued to initialize the runtime before the
  // module API is used - confirm.
  HIP_CHECK(hipFree(nullptr));

  HIP_CHECK(hipModuleLoad(&module, "simple_kernel.code"));
  HIP_CHECK(hipModuleGetFunction(&function, module, "SimpleKernel"));

  // Common negative tests. The lambda parameters are named differently from
  // the locals above so they do not shadow them.
  MaxPotentialBlockSizeNegative([&function](int* outGridSize, int* outBlockSize) {
    return hipModuleOccupancyMaxPotentialBlockSizeWithFlags(outGridSize, outBlockSize, function, 0,
                                                            0, hipOccupancyDefault);
  });

  SECTION("Flag is invalid") {
    // Only default flag is supported
    HIP_CHECK_ERROR(
        hipModuleOccupancyMaxPotentialBlockSizeWithFlags(&gridSize, &blockSize, function, 0, 0, 2),
        hipErrorInvalidValue);
  }

  HIP_CHECK(hipModuleUnload(module));
}
|
||||
|
||||
/*
 * Range validation for hipModuleOccupancyMaxPotentialBlockSizeWithFlags.
 *
 * Loads SimpleKernel from simple_kernel.code and runs the shared
 * MaxPotentialBlockSize check with hipOccupancyDefault, once with zero shared
 * memory and no block size limit, and once with the device's
 * sharedMemPerBlock and maxThreadsPerBlock.
 */
TEST_CASE("Unit_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_Positive_RangeValidation") {
  hipDeviceProp_t devProp;
  hipModule_t module;
  hipFunction_t function;

  // NOTE(review): presumably issued to initialize the runtime before the
  // module API is used - confirm.
  HIP_CHECK(hipFree(nullptr));

  HIP_CHECK(hipModuleLoad(&module, "simple_kernel.code"));
  HIP_CHECK(hipModuleGetFunction(&function, module, "SimpleKernel"));

  HIP_CHECK(hipGetDeviceProperties(&devProp, 0));

  SECTION("dynSharedMemPerBlk = 0, blockSizeLimit = 0") {
    MaxPotentialBlockSize(
        [&function](int* gridSize, int* blockSize) {
          return hipModuleOccupancyMaxPotentialBlockSizeWithFlags(gridSize, blockSize, function, 0,
                                                                  0, hipOccupancyDefault);
        },
        devProp.maxThreadsPerBlock);
  }

  SECTION("dynSharedMemPerBlk = sharedMemPerBlock, blockSizeLimit = maxThreadsPerBlock") {
    // Capture only the two values that are needed instead of copying all of
    // devProp into the closure.
    const size_t sharedMem = devProp.sharedMemPerBlock;
    const int blockSizeLimit = devProp.maxThreadsPerBlock;

    MaxPotentialBlockSize(
        [&function, sharedMem, blockSizeLimit](int* gridSize, int* blockSize) {
          return hipModuleOccupancyMaxPotentialBlockSizeWithFlags(
              gridSize, blockSize, function, sharedMem, blockSizeLimit, hipOccupancyDefault);
        },
        devProp.maxThreadsPerBlock);
  }

  HIP_CHECK(hipModuleUnload(module));
}
|
||||
@@ -66,7 +66,5 @@ template <typename F> void MaxActiveBlocksPerMultiprocessorNegative(F func, int
|
||||
SECTION("numBlocks is nullptr") {
|
||||
HIP_CHECK_ERROR(func(nullptr, blockSize, 0), hipErrorInvalidValue);
|
||||
}
|
||||
SECTION("Block size is 0") {
|
||||
HIP_CHECK_ERROR(func(&numBlocks, 0, 0), hipErrorInvalidValue);
|
||||
}
|
||||
SECTION("Block size is 0") { HIP_CHECK_ERROR(func(&numBlocks, 0, 0), hipErrorInvalidValue); }
|
||||
}
|
||||
|
||||
@@ -0,0 +1,25 @@
|
||||
/*
|
||||
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "hip/hip_runtime.h"
|
||||
|
||||
// Element-wise copy kernel: the thread with x-index i stores a[i] into b[i].
extern "C" __global__ void SimpleKernel(int* a, int* b) {
  const int lane = threadIdx.x;
  b[lane] = a[lane];
}
|
||||
@@ -0,0 +1,9 @@
|
||||
# Common Tests - Test independent of all platforms
# Sources that make up the WarpTest executable.
set(TEST_SRC warp_shfl_xor.cc warp_shfl.cc)

# Register the executable and attach it to the build_tests target.
hip_add_exe_to_target(
  NAME WarpTest
  TEST_SRC ${TEST_SRC}
  TEST_TARGET_NAME build_tests
)
|
||||
@@ -0,0 +1,84 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
|
||||
/**
 * Tells the calling thread whether it should sit out of the test.
 *
 * active_masks holds one 64-bit mask per warp, laid out block by block. The
 * entry for the calling thread's warp is looked up and the bit at the thread's
 * rank within its warp-sized tile is examined.
 *
 * @param active_masks device pointer to the per-warp masks
 * @return true when the thread's bit is cleared, false when it is set
 */
static __device__ bool deactivate_thread(const uint64_t* const active_masks) {
  const auto block = cooperative_groups::this_thread_block();
  const auto warp =
      cooperative_groups::tiled_partition(cooperative_groups::this_thread_block(), warpSize);

  // Linear index of this block within the grid (x fastest, then y, then z).
  const auto block_rank = (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
  // Number of warps needed to cover the block, rounding up a partial last warp.
  const auto warps_per_block = (block.size() + warpSize - 1) / warpSize;
  const auto mask_index = block_rank * warps_per_block + block.thread_rank() / warpSize;

  const auto lane_bit = static_cast<uint64_t>(1) << warp.thread_rank();
  return (active_masks[mask_index] & lane_bit) == 0;
}
|
||||
|
||||
/**
 * Gives access to the Mersenne Twister engine shared by the random helpers.
 * The engine is a function-local static seeded once from std::random_device.
 *
 * @return reference to the shared engine
 */
static inline std::mt19937& GetRandomGenerator() {
  static std::mt19937 engine = [] {
    std::random_device seed_source;
    return std::mt19937(seed_source());
  }();
  return engine;
}
|
||||
|
||||
template <typename T> static inline T GenerateRandomInteger(const T min, const T max) {
|
||||
std::uniform_int_distribution<T> dist(min, max);
|
||||
return dist(GetRandomGenerator());
|
||||
}
|
||||
|
||||
template <typename T> static inline T GenerateRandomReal(const T min, const T max) {
|
||||
std::uniform_real_distribution<T> dist(min, max);
|
||||
return dist(GetRandomGenerator());
|
||||
}
|
||||
|
||||
inline int generate_width(int warp_size) {
|
||||
int exponent = 0;
|
||||
while (warp_size >>= 1) {
|
||||
++exponent;
|
||||
}
|
||||
|
||||
return GENERATE_COPY(map([](int e) { return 1 << e; }, range(1, exponent + 1)));
|
||||
}
|
||||
|
||||
/**
 * Returns a deterministic 64-bit active-lane mask chosen by warp_id % 5:
 *  - 0: odd lanes (bits 1, 3, 5, ...)
 *  - 1: even lanes (bits 0, 2, 4, ...)
 *  - 2: first half of the warp, lanes [0, warp_size / 2)
 *  - 3: second half of the warp, lanes [warp_size / 2, warp_size)
 *  - 4: all lanes
 *
 * @param warp_id   index of the warp; only its remainder modulo 5 is used
 * @param warp_size number of lanes in a warp; values above 64 are treated as 64
 *                  because the mask only has 64 bits
 * @return the mask for the selected pattern
 */
inline uint64_t get_active_mask(unsigned int warp_id, unsigned int warp_size) {
  // Mask with the lowest `count` bits set. Saturates at 64 so the shift is never
  // performed with a count equal to or larger than the width of uint64_t.
  const auto low_bits = [](unsigned int count) -> uint64_t {
    return count >= 64 ? ~static_cast<uint64_t>(0) : (static_cast<uint64_t>(1) << count) - 1;
  };

  switch (warp_id % 5) {
    case 0:  // odd threads in the warp (0xA == 0b1010)
      return 0xAAAAAAAAAAAAAAAA;
    case 1:  // even threads in the warp (0x5 == 0b0101)
      return 0x5555555555555555;
    case 2:  // first half of the warp
      return low_bits(warp_size / 2);
    case 3:  // second half of the warp
      return low_bits(warp_size) & ~low_bits(warp_size / 2);
    case 4:  // all threads
      return 0xFFFFFFFFFFFFFFFF;
  }
  return 0;  // unreachable: warp_id % 5 is always in [0, 4]
}
|
||||
@@ -0,0 +1,121 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "warp_shfl_common.hh"
|
||||
|
||||
#include <bitset>
|
||||
|
||||
/**
|
||||
* @addtogroup shfl shfl
|
||||
* @{
|
||||
* @ingroup DeviceLanguageTest
|
||||
* `T __shfl(T var, int src_lane, int width = warpSize)` -
|
||||
* Contains unit test for warp shfl function
|
||||
*/
|
||||
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
template <typename T>
|
||||
__global__ void shfl(T* const out, const T* const in, const uint64_t* const active_masks,
|
||||
const uint8_t* const src_lanes, const int width) {
|
||||
if (deactivate_thread(active_masks)) {
|
||||
return;
|
||||
}
|
||||
const auto grid = cg::this_grid();
|
||||
const auto block = cg::this_thread_block();
|
||||
T var = in[grid.thread_rank()];
|
||||
out[grid.thread_rank()] = __shfl(var, src_lanes[block.thread_rank() % width], width);
|
||||
}
|
||||
|
||||
/**
 * __shfl flavour of the shuffle test. WarpShflTest calls launch_kernel() to
 * run the kernel and validate() to compare the device output with the values
 * computed on the host.
 */
template <typename T> class WarpShfl : public WarpShflTest<WarpShfl<T>, T> {
 public:
  /**
   * Picks a width, fills a table of `width` random source lanes in
   * [0, 2 * width], uploads it and launches the shfl kernel.
   */
  void launch_kernel(T* const arr_dev, T* const input_dev, const uint64_t* const active_masks) {
    width_ = generate_width(this->warp_size_);
    INFO("Width: " << width_);

    const auto lanes_bytes = width_ * sizeof(uint8_t);
    LinearAllocGuard<uint8_t> lanes_dev(LinearAllocs::hipMalloc, lanes_bytes);

    src_lanes_.resize(width_);
    for (auto& lane : src_lanes_) {
      lane = GenerateRandomInteger(0, static_cast<int>(2 * width_));
    }

    HIP_CHECK(hipMemcpy(lanes_dev.ptr(), src_lanes_.data(), lanes_bytes, hipMemcpyHostToDevice));
    shfl<<<this->grid_.grid_dim_, this->grid_.block_dim_>>>(arr_dev, input_dev, active_masks,
                                                            lanes_dev.ptr(), width_);
  }

  /**
   * Checks every output element. For thread i the expected value is the input
   * of the thread it shuffled from; no expectation (std::nullopt) is produced
   * when the thread itself or its source is inactive, or when the source lies
   * past the end of the block.
   */
  void validate(const T* const arr, const T* const input) {
    ArrayAllOf(arr, this->grid_.thread_count_, [this, &input](unsigned int i) -> std::optional<T> {
      const auto rank_in_block = this->grid_.thread_rank_in_block(i).value();
      const auto rank_in_warp = rank_in_block % this->warp_size_;
      const auto rank_in_partition = rank_in_block % width_;

      // Source lanes are taken modulo the width; express the source as an
      // offset relative to the reading thread.
      const int src_lane = src_lanes_[rank_in_partition] % width_;
      const int src_offset = src_lane - rank_in_partition;

      const auto block_index = i / this->grid_.threads_in_block_count_;
      const auto mask_idx =
          this->warps_in_block_ * block_index + rank_in_block / this->warp_size_;
      const std::bitset<sizeof(uint64_t) * 8> active_mask(this->active_masks_[mask_idx]);

      if (!active_mask.test(rank_in_warp)) {
        return std::nullopt;
      }
      if (!active_mask.test((rank_in_warp + src_offset))) {
        return std::nullopt;
      }
      if (rank_in_block + src_offset >= this->grid_.threads_in_block_count_) {
        return std::nullopt;
      }

      return input[i + src_offset];
    });
  }

 private:
  std::vector<uint8_t> src_lanes_;
  int width_;
};
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Validates the warp shuffle behavior for all valid width sizes {2, 4, 8, 16, 32,
|
||||
* 64(if supported)} for generated shuffle target lanes. The threads are deactivated based on the
|
||||
* passed active mask. The test is run for all overloads of shfl.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - unit/warp/warp_shfl.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
* - Device supports warp shuffle
|
||||
*/
|
||||
TEMPLATE_TEST_CASE("Unit_Warp_Shfl_Positive_Basic", "", int, unsigned int, long, unsigned long,
                   long long, unsigned long long, float, double) {
  // Look up the properties of the device that is current for this thread.
  int current_device;
  HIP_CHECK(hipGetDevice(&current_device));
  hipDeviceProp_t props;
  HIP_CHECK(hipGetDeviceProperties(&props, current_device));

  // Nothing to test on devices that do not report warp shuffle support.
  const bool shuffle_supported = props.arch.hasWarpShuffle;
  if (!shuffle_supported) {
    HipTest::HIP_SKIP_TEST("Device doesn't support Warp Shuffle!");
    return;
  }

  // Deterministic active masks and input values.
  SECTION("Shfl with specified active mask and input values") {
    WarpShfl<TestType>().run(false);
  }

  // Randomly generated active masks and input values.
  SECTION("Shfl with random active mask and input values") {
    WarpShfl<TestType>().run(true);
  }
}
|
||||
@@ -0,0 +1,114 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "warp_common.hh"
|
||||
|
||||
#include <cpu_grid.h>
|
||||
#include <resource_guards.hh>
|
||||
#include <utils.hh>
|
||||
|
||||
template <typename Derived, typename T> class WarpShflTest {
|
||||
public:
|
||||
WarpShflTest() : warp_size_{get_warp_size()} {}
|
||||
|
||||
void run(bool random = false) {
|
||||
const auto blocks = GenerateBlockDimensionsForShuffle();
|
||||
INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
|
||||
const auto threads = GenerateThreadDimensionsForShuffle();
|
||||
INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
|
||||
grid_ = CPUGrid(blocks, threads);
|
||||
|
||||
const auto alloc_size = grid_.thread_count_ * sizeof(T);
|
||||
LinearAllocGuard<T> input_dev(LinearAllocs::hipMalloc, alloc_size);
|
||||
LinearAllocGuard<T> input(LinearAllocs::hipHostMalloc, alloc_size);
|
||||
LinearAllocGuard<T> arr_dev(LinearAllocs::hipMalloc, alloc_size);
|
||||
LinearAllocGuard<T> arr(LinearAllocs::hipHostMalloc, alloc_size);
|
||||
HIP_CHECK(hipMemset(arr_dev.ptr(), 0, alloc_size));
|
||||
|
||||
warps_in_block_ = (grid_.threads_in_block_count_ + warp_size_ - 1) / warp_size_;
|
||||
const auto warps_in_grid = warps_in_block_ * grid_.block_count_;
|
||||
LinearAllocGuard<uint64_t> active_masks_dev(LinearAllocs::hipMalloc,
|
||||
warps_in_grid * sizeof(uint64_t));
|
||||
active_masks_.resize(warps_in_grid);
|
||||
|
||||
generate_input(input.ptr(), random);
|
||||
|
||||
HIP_CHECK(hipMemcpy(active_masks_dev.ptr(), active_masks_.data(),
|
||||
warps_in_grid * sizeof(uint64_t), hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipMemcpy(input_dev.ptr(), input.ptr(), alloc_size, hipMemcpyHostToDevice));
|
||||
cast_to_derived().launch_kernel(arr_dev.ptr(), input_dev.ptr(), active_masks_dev.ptr());
|
||||
HIP_CHECK(hipGetLastError());
|
||||
HIP_CHECK(hipMemcpy(arr.ptr(), arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
|
||||
cast_to_derived().validate(arr.ptr(), input.ptr());
|
||||
}
|
||||
|
||||
private:
|
||||
int get_warp_size() const {
|
||||
int current_dev = -1;
|
||||
HIP_CHECK(hipGetDevice(¤t_dev));
|
||||
int warp_size = 0u;
|
||||
HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
|
||||
return warp_size;
|
||||
}
|
||||
|
||||
void generate_input(T* input, bool random) {
|
||||
if (random) {
|
||||
std::generate(active_masks_.begin(), active_masks_.end(), [] {
|
||||
return GenerateRandomInteger<unsigned long long>(0ul, std::numeric_limits<uint64_t>().max());
|
||||
});
|
||||
|
||||
if constexpr (std::is_same_v<float, T> || std::is_same_v<double, T>) {
|
||||
std::generate_n(input, grid_.thread_count_, [] {
|
||||
return static_cast<T>(
|
||||
GenerateRandomReal(std::numeric_limits<T>().min(), std::numeric_limits<T>().max()));
|
||||
});
|
||||
} else {
|
||||
std::generate_n(input, grid_.thread_count_, [] {
|
||||
return static_cast<T>(GenerateRandomInteger(std::numeric_limits<T>().min(),
|
||||
std::numeric_limits<T>().max()));
|
||||
});
|
||||
}
|
||||
} else {
|
||||
unsigned long long int i = 0;
|
||||
std::generate(active_masks_.begin(), active_masks_.end(),
|
||||
[this, &i]() { return get_active_mask(i++, warp_size_); });
|
||||
|
||||
i = 0;
|
||||
std::generate_n(input, grid_.thread_count_, [&i]() {
|
||||
if (static_cast<T>(i) > std::numeric_limits<T>().max())
|
||||
i = 0;
|
||||
else
|
||||
i++;
|
||||
return static_cast<T>(i);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
Derived& cast_to_derived() { return reinterpret_cast<Derived&>(*this); }
|
||||
|
||||
protected:
|
||||
const int warp_size_;
|
||||
CPUGrid grid_;
|
||||
unsigned int warps_in_block_;
|
||||
std::vector<uint64_t> active_masks_;
|
||||
};
|
||||
@@ -0,0 +1,118 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "warp_shfl_common.hh"
|
||||
|
||||
#include <bitset>
|
||||
|
||||
/**
|
||||
* @addtogroup shfl_xor shfl_xor
|
||||
* @{
|
||||
* @ingroup DeviceLanguageTest
|
||||
* `T __shfl_xor(T var, int lane_mask, int width = warpSize)` -
|
||||
* Contains unit test for warp shfl_xor function
|
||||
*/
|
||||
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
template <typename T>
|
||||
__global__ void shfl_xor(T* const out, const T* const in, const uint64_t* const active_masks,
|
||||
const int lane_mask, const int width) {
|
||||
if (deactivate_thread(active_masks)) {
|
||||
return;
|
||||
}
|
||||
|
||||
const auto grid = cg::this_grid();
|
||||
T var = in[grid.thread_rank()];
|
||||
out[grid.thread_rank()] = __shfl_xor(var, lane_mask, width);
|
||||
}
|
||||
|
||||
/**
 * __shfl_xor flavour of the shuffle test. WarpShflTest calls launch_kernel()
 * to run the kernel and validate() to compare the device output with the
 * values computed on the host.
 */
template <typename T> class WarpShflXOR : public WarpShflTest<WarpShflXOR<T>, T> {
 public:
  /**
   * Picks a width and a lane mask in [0, warp_size_) through Catch2 generators
   * and launches the shfl_xor kernel with them.
   */
  void launch_kernel(T* const arr_dev, T* const input_dev, const uint64_t* const active_masks) {
    width_ = generate_width(this->warp_size_);
    INFO("Width: " << width_);
    lane_mask_ = GENERATE_COPY(range(0, this->warp_size_));
    INFO("Lane mask: " << lane_mask_);
    shfl_xor<<<this->grid_.grid_dim_, this->grid_.block_dim_>>>(arr_dev, input_dev, active_masks,
                                                                lane_mask_, width_);
  }

  /**
   * Checks every output element. A thread whose XOR target falls into a later
   * width-sized partition is expected to keep its own input; otherwise it is
   * expected to hold the target's input. No expectation (std::nullopt) is
   * produced when the thread is inactive, or when a target that is actually
   * read is inactive or lies past the end of the block.
   */
  void validate(const T* const arr, const T* const input) {
    ArrayAllOf(arr, this->grid_.thread_count_, [this, &input](unsigned int i) -> std::optional<T> {
      const auto rank_in_block = this->grid_.thread_rank_in_block(i).value();
      const auto rank_in_warp = rank_in_block % this->warp_size_;
      const int warp_target = rank_in_warp ^ this->lane_mask_;
      const int target_offset = warp_target - rank_in_warp;

      const auto block_index = i / this->grid_.threads_in_block_count_;
      const auto mask_idx =
          this->warps_in_block_ * block_index + rank_in_block / this->warp_size_;
      const std::bitset<sizeof(uint64_t) * 8> active_mask(this->active_masks_[mask_idx]);

      if (!active_mask.test(rank_in_warp)) {
        return std::nullopt;
      }

      const auto target_partition = warp_target / width_;
      const auto partition_rank = rank_in_warp / width_;
      if (target_partition > partition_rank) {
        // Target is in a later partition: the thread's own value is expected.
        return input[i];
      }

      if (!active_mask.test(rank_in_warp + target_offset) ||
          rank_in_block + target_offset >= this->grid_.threads_in_block_count_) {
        return std::nullopt;
      }

      return input[i + target_offset];
    });
  }

 private:
  int lane_mask_;
  int width_;
};
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Validates the warp shuffle xor behavior for all valid width sizes {2, 4, 8, 16, 32,
|
||||
 * 64(if supported)} for lane mask values of [0, warpSize). The threads are deactivated based on the
|
||||
* passed active mask. The test is run for all overloads of shfl_xor.
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - unit/warp/warp_shfl_xor.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.2
|
||||
* - Device supports warp shuffle
|
||||
*/
|
||||
TEMPLATE_TEST_CASE("Unit_Warp_Shfl_XOR_Positive_Basic", "", int, unsigned int, long, unsigned long,
                   long long, unsigned long long, float, double) {
  // Look up the properties of the device that is current for this thread.
  int current_device;
  HIP_CHECK(hipGetDevice(&current_device));
  hipDeviceProp_t props;
  HIP_CHECK(hipGetDeviceProperties(&props, current_device));

  // Nothing to test on devices that do not report warp shuffle support.
  const bool shuffle_supported = props.arch.hasWarpShuffle;
  if (!shuffle_supported) {
    HipTest::HIP_SKIP_TEST("Device doesn't support Warp Shuffle!");
    return;
  }

  // Deterministic active masks and input values.
  SECTION("Shfl Xor with specified active mask and input values") {
    WarpShflXOR<TestType>().run(false);
  }

  // Randomly generated active masks and input values.
  SECTION("Shfl Xor with random active mask and input values") {
    WarpShflXOR<TestType>().run(true);
  }
}
|
||||
새 이슈에서 참조
사용자 차단