Maneesh Gupta
2023-12-20 10:24:27 +00:00
99개의 변경된 파일9888개의 추가작업 그리고 2605개의 파일을 삭제
+58
파일 보기
@@ -54,6 +54,8 @@
"Unit_hipFuncSetAttribute_Positive_MaxDynamicSharedMemorySize_Not_Supported",
"Unit_hipFuncSetAttribute_Positive_PreferredSharedMemoryCarveout_Not_Supported",
"Unit_hipOccupancyMaxActiveBlocksPerMultiprocessor_Negative_Parameters",
"Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_Negative_Parameters",
"Unit_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_Negative_Parameters",
"Unit_hipGraphMemcpyNodeSetParamsToSymbol_Positive_Basic",
"Unit_hipGraphExecMemcpyNodeSetParamsToSymbol_Positive_Basic",
"Unit_hipGraphExecMemcpyNodeSetParamsFromSymbol_Positive_Basic",
@@ -183,6 +185,62 @@
"Unit_hipMemUnmap_negative",
"=== SWDEV-432556,SWDEV-434211:Below test randomly failing in stress test ===",
"Unit_hipDeviceGetUuid_From_RocmInfo",
"=== SWDEV-434171: Below tests took long time to complete in stress test on 17/11/23 ===",
"Unit_Warp_Shfl_Positive_Basic - int",
"Unit_Warp_Shfl_Positive_Basic - unsigned int",
"Unit_Warp_Shfl_Positive_Basic - long",
"Unit_Warp_Shfl_Positive_Basic - unsigned long",
"Unit_Warp_Shfl_Positive_Basic - long long",
"Unit_Warp_Shfl_Positive_Basic - unsigned long long",
"Unit_Warp_Shfl_Positive_Basic - float",
"Unit_Warp_Shfl_Positive_Basic - double",
"Unit_Warp_Shfl_XOR_Positive_Basic - int",
"Unit_Warp_Shfl_XOR_Positive_Basic - unsigned int",
"Unit_Warp_Shfl_XOR_Positive_Basic - long",
"Unit_Warp_Shfl_XOR_Positive_Basic - unsigned long",
"Unit_Warp_Shfl_XOR_Positive_Basic - long long",
"Unit_Warp_Shfl_XOR_Positive_Basic - unsigned long long",
"Unit_Warp_Shfl_XOR_Positive_Basic - float",
"Unit_Warp_Shfl_XOR_Positive_Basic - double",
"=== SWDEV-434878: Below tests failed in stress test on 24/11/23 ===",
"Unit_hipGraphUpload_Negative_Parameters",
"Unit_hipModuleOccupancyMaxPotentialBlockSize_Negative_Parameters",
"Unit_hipModuleOccupancyMaxPotentialBlockSize_Positive_RangeValidation",
"Unit_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_Positive_RangeValidation",
"Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_Negative_Parameters",
"Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_Positive_RangeValidation",
"Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_Positive_RangeValidation",
"=== SWDEV-435667: Below tests failing randomly in stress test on 01/12/23 ===",
"Unit_atomicExch_Positive_Same_Address_Compile_Time - int",
"Unit_atomicExch_Positive_Same_Address_Compile_Time - unsigned int",
"Unit_atomicExch_Positive_Same_Address_Compile_Time - unsigned long",
"Unit_atomicExch_Positive_Same_Address_Compile_Time - unsigned long long",
"Unit_atomicExch_Positive_Same_Address_Compile_Time - float",
"Unit_atomicExch_Positive_Same_Address_Compile_Time - double",
"Unit_atomicExch_Positive_Multi_Kernel - int",
"Unit_atomicExch_Positive_Multi_Kernel - unsigned int",
"Unit_atomicExch_Positive_Multi_Kernel - unsigned long",
"Unit_atomicExch_Positive_Multi_Kernel - unsigned long long",
"Unit_atomicExch_Positive_Multi_Kernel - float",
"Unit_atomicExch_Positive_Multi_Kernel - double",
"Unit_atomicExch_system_Positive_Peer_GPUs - int",
"Unit_atomicExch_system_Positive_Peer_GPUs - unsigned int",
"Unit_atomicExch_system_Positive_Peer_GPUs - unsigned long",
"Unit_atomicExch_system_Positive_Peer_GPUs - unsigned long long",
"Unit_atomicExch_system_Positive_Peer_GPUs - float",
"Unit_atomicExch_system_Positive_Peer_GPUs - double",
"Unit_atomicExch_system_Positive_Host_And_GPU - int",
"Unit_atomicExch_system_Positive_Host_And_GPU - unsigned int",
"Unit_atomicExch_system_Positive_Host_And_GPU - unsigned long",
"Unit_atomicExch_system_Positive_Host_And_GPU - unsigned long long",
"Unit_atomicExch_system_Positive_Host_And_GPU - float",
"Unit_atomicExch_system_Positive_Host_And_GPU - double",
"Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - int",
"Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - unsigned int",
"Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - unsigned long",
"Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - unsigned long long",
"Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - float",
"Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - double",
#endif
#if defined VEGA20
"=== SWDEV-419112 Below tests fail in stress test on 29/08/23 ===",
+58
파일 보기
@@ -119,6 +119,8 @@
"Unit_hipFuncSetAttribute_Positive_MaxDynamicSharedMemorySize_Not_Supported",
"Unit_hipFuncSetAttribute_Positive_PreferredSharedMemoryCarveout_Not_Supported",
"Unit_hipOccupancyMaxActiveBlocksPerMultiprocessor_Negative_Parameters",
"Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_Negative_Parameters",
"Unit_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_Negative_Parameters",
"Unit_hipGraphMemcpyNodeSetParamsToSymbol_Positive_Basic",
"Unit_hipGraphExecMemcpyNodeSetParamsToSymbol_Positive_Basic",
"Unit_hipGraphMemcpyNodeSetParamsFromSymbol_Positive_Basic",
@@ -282,6 +284,62 @@
"Unit_hipMemSetAccess_MultiProc",
"Unit_hipMemSetAccess_negative",
"Unit_hipMemUnmap_negative",
"=== SWDEV-434171: Below tests took long time to complete in stress test on 17/11/23 ===",
"Unit_Warp_Shfl_Positive_Basic - int",
"Unit_Warp_Shfl_Positive_Basic - unsigned int",
"Unit_Warp_Shfl_Positive_Basic - long",
"Unit_Warp_Shfl_Positive_Basic - unsigned long",
"Unit_Warp_Shfl_Positive_Basic - long long",
"Unit_Warp_Shfl_Positive_Basic - unsigned long long",
"Unit_Warp_Shfl_Positive_Basic - float",
"Unit_Warp_Shfl_Positive_Basic - double",
"Unit_Warp_Shfl_XOR_Positive_Basic - int",
"Unit_Warp_Shfl_XOR_Positive_Basic - unsigned int",
"Unit_Warp_Shfl_XOR_Positive_Basic - long",
"Unit_Warp_Shfl_XOR_Positive_Basic - unsigned long",
"Unit_Warp_Shfl_XOR_Positive_Basic - long long",
"Unit_Warp_Shfl_XOR_Positive_Basic - unsigned long long",
"Unit_Warp_Shfl_XOR_Positive_Basic - float",
"Unit_Warp_Shfl_XOR_Positive_Basic - double",
"=== SWDEV-434878: Below tests failed in stress test on 24/11/23 ===",
"Unit_hipGraphUpload_Negative_Parameters",
"Unit_hipModuleOccupancyMaxPotentialBlockSize_Negative_Parameters",
"Unit_hipModuleOccupancyMaxPotentialBlockSize_Positive_RangeValidation",
"Unit_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_Positive_RangeValidation",
"Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_Negative_Parameters",
"Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_Positive_RangeValidation",
"Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_Positive_RangeValidation",
"=== SWDEV-435667: Below tests failing randomly in stress test on 01/12/23 ===",
"Unit_atomicExch_Positive_Same_Address_Compile_Time - int",
"Unit_atomicExch_Positive_Same_Address_Compile_Time - unsigned int",
"Unit_atomicExch_Positive_Same_Address_Compile_Time - unsigned long",
"Unit_atomicExch_Positive_Same_Address_Compile_Time - unsigned long long",
"Unit_atomicExch_Positive_Same_Address_Compile_Time - float",
"Unit_atomicExch_Positive_Same_Address_Compile_Time - double",
"Unit_atomicExch_Positive_Multi_Kernel - int",
"Unit_atomicExch_Positive_Multi_Kernel - unsigned int",
"Unit_atomicExch_Positive_Multi_Kernel - unsigned long",
"Unit_atomicExch_Positive_Multi_Kernel - unsigned long long",
"Unit_atomicExch_Positive_Multi_Kernel - float",
"Unit_atomicExch_Positive_Multi_Kernel - double",
"Unit_atomicExch_system_Positive_Peer_GPUs - int",
"Unit_atomicExch_system_Positive_Peer_GPUs - unsigned int",
"Unit_atomicExch_system_Positive_Peer_GPUs - unsigned long",
"Unit_atomicExch_system_Positive_Peer_GPUs - unsigned long long",
"Unit_atomicExch_system_Positive_Peer_GPUs - float",
"Unit_atomicExch_system_Positive_Peer_GPUs - double",
"Unit_atomicExch_system_Positive_Host_And_GPU - int",
"Unit_atomicExch_system_Positive_Host_And_GPU - unsigned int",
"Unit_atomicExch_system_Positive_Host_And_GPU - unsigned long",
"Unit_atomicExch_system_Positive_Host_And_GPU - unsigned long long",
"Unit_atomicExch_system_Positive_Host_And_GPU - float",
"Unit_atomicExch_system_Positive_Host_And_GPU - double",
"Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - int",
"Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - unsigned int",
"Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - unsigned long",
"Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - unsigned long long",
"Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - float",
"Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - double",
#endif
"End of json"
]
+9 -1
파일 보기
@@ -44,6 +44,14 @@
"Grid_Group_Getters_Via_Non_Member_Functions_Positive_Basic",
"Grid_Group_Sync_Positive_Basic",
"dynamic_loading_device_kernels_from_library",
"Unit_tiled_partition"
"Unit_tiled_partition",
"Unit_atomicExch_Positive_Same_Address_Compile_Time - int",
"Unit_atomicExch_Positive_Same_Address_Compile_Time - unsigned int",
"Unit_atomicExch_Positive_Same_Address_Compile_Time - unsigned long long",
"Unit_atomicExch_Positive_Same_Address_Compile_Time - float",
"Unit_atomicExch_system_Positive_Host_And_GPU - int",
"Unit_atomicExch_system_Positive_Host_And_GPU - unsigned int",
"Unit_atomicExch_system_Positive_Host_And_GPU - unsigned long long",
"Unit_atomicExch_system_Positive_Host_And_GPU - float"
]
}
+3 -3
파일 보기
@@ -30,9 +30,9 @@ int main(int argc, char** argv) {
| Opt(cmd_options.progress)
["-P"]["--progress"]
("Show progress bar when running performance tests")
| Opt(cmd_options.extended_run)
["-E"]["--extended-run"]
("TODO: Description goes here")
| Opt(cmd_options.cg_extended_run, "cg_extened_run")
["-E"]["--cg-extended-run"]
("Number of iterations used for cooperative groups sync tests (default: 5)")
;
// clang-format on
+2 -2
파일 보기
@@ -23,11 +23,11 @@ THE SOFTWARE.
#pragma once
struct CmdOptions {
int iterations = 1000;
int iterations = 10;
int warmups = 100;
int cg_extended_run = 5;
bool no_display = false;
bool progress = false;
bool extended_run = false;
};
extern CmdOptions cmd_options;
+4
파일 보기
@@ -78,6 +78,7 @@ struct CPUGrid {
unsigned int thread_count_;
};
/* Generate dimensions for 1D, 2D and 3D blocks of threads */
inline dim3 GenerateThreadDimensions() {
hipDeviceProp_t props;
HIP_CHECK(hipGetDeviceProperties(&props, 0));
@@ -99,6 +100,7 @@ inline dim3 GenerateThreadDimensions() {
dim3(props.warpSize + 1, 3, 3));
}
/* Generate dimensions for 1D, 2D and 3D grids of blocks */
inline dim3 GenerateBlockDimensions() {
hipDeviceProp_t props;
HIP_CHECK(hipGetDeviceProperties(&props, 0));
@@ -116,6 +118,7 @@ inline dim3 GenerateBlockDimensions() {
dim3(5, 5, 5));
}
/* Generate dimensions for 1D, 2D and 3D blocks of threads - reduced set */
inline dim3 GenerateThreadDimensionsForShuffle() {
hipDeviceProp_t props;
HIP_CHECK(hipGetDeviceProperties(&props, 0));
@@ -136,6 +139,7 @@ inline dim3 GenerateThreadDimensionsForShuffle() {
dim3(props.warpSize + 1, 3, 3));
}
/* Generate dimensions for 1D, 2D and 3D grids of blocks - reduced set */
inline dim3 GenerateBlockDimensionsForShuffle() {
hipDeviceProp_t props;
HIP_CHECK(hipGetDeviceProperties(&props, 0));
+13
파일 보기
@@ -102,6 +102,19 @@ THE SOFTWARE.
} \
}
// Checks that errorExpr yields the HIPRTC result expectedError.
// errorExpr is evaluated exactly once and stored in localError; expectedError is expanded several
// times (for the message and for the comparison), so it should be a side-effect-free value.
// The expected/actual error strings and codes, the stringified expression, and the file/line are
// attached via INFO before the REQUIRE on equality.
// NOTE(review): the macro expands to a bare { ... } block, so `HIPRTC_CHECK_ERROR(...);` directly
// before an `else` in an unbraced if/else will not parse — confirm call sites always use braces.
#define HIPRTC_CHECK_ERROR(errorExpr, expectedError) \
{ \
auto localError = errorExpr; \
INFO("Matching Errors: " \
<< "\n Expected Error: " << hiprtcGetErrorString(expectedError) \
<< "\n Expected Code: " << expectedError << '\n' \
<< " Actual Error: " << hiprtcGetErrorString(localError) \
<< "\n Actual Code: " << localError << "\nStr: " << #errorExpr \
<< "\n In File: " << __FILE__ << "\n At line: " << __LINE__); \
REQUIRE(localError == expectedError); \
}
// Although it is named "assert", the condition is evaluated at runtime: the macro simply wraps
// the expression in REQUIRE.
#define HIP_ASSERT(x) \
{ REQUIRE((x)); }
+49 -2
파일 보기
@@ -20,6 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip_test_common.hh>
// Test groups are named after the group names from hip_runtime_api.h, with a "Test" suffix added
/**
@@ -95,8 +97,46 @@ THE SOFTWARE.
/**
* @defgroup KernelTest Kernel Functions Management
* @{
* This section describes the invocation of the various kernel functions.
* @}
*/
/**
* @defgroup AtomicsTest Device Atomics
* @{
* This section describes the various kernel functions invocation.
* This section describes tests for the Device Atomic APIs.
* @}
*/
/**
* @addtogroup atomicExch atomicExch
* @{
* @ingroup AtomicsTest
*/
/**
* Test Description
* ------------------------
* - Compiles atomicExch with invalid parameters.
* - Compiles the source with specialized Python tool.
* -# Utilizes sub-process to invoke compilation of faulty source.
* -# Performs post-processing of compiler output and counts errors.
* Test source
* ------------------------
* - unit/atomics/CMakeLists.txt
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Unit_atomicExch_Negative_Parameters") {}
/**
* End doxygen group atomicExch.
* @}
*/
/**
* End doxygen group AtomicsTest.
* @}
*/
@@ -115,7 +155,14 @@ THE SOFTWARE.
* @}
*/
/**
/**
* @defgroup PerformanceTest Performance tests
* @{
* This section describes performance tests for the target API groups and use-cases.
* @}
*/
/**
* @defgroup ShflTest warp shuffle function Management
* @{
* This section describes the warp shuffle types & functions of HIP runtime API.
+1
파일 보기
@@ -34,6 +34,7 @@ THE SOFTWARE.
#include <resource_guards.hh>
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
#pragma clang diagnostic ignored "-Wunused-parameter"
#pragma clang diagnostic ignored "-Wunused-function"
#if defined(_WIN32)
+81 -16
파일 보기
@@ -29,10 +29,30 @@ enum class LinearAllocs {
hipHostMalloc,
hipMalloc,
hipMallocManaged,
noAlloc
};
/**
 * Maps a LinearAllocs value to the short human-readable label used for it.
 * Any value without a dedicated label yields "unknown alloc type".
 */
inline std::string to_string(const LinearAllocs allocation_type) {
  if (allocation_type == LinearAllocs::malloc) {
    return "host pageable";
  }
  if (allocation_type == LinearAllocs::mallocAndRegister) {
    return "registered";
  }
  if (allocation_type == LinearAllocs::hipHostMalloc) {
    return "host pinned";
  }
  if (allocation_type == LinearAllocs::hipMalloc) {
    return "device malloc";
  }
  if (allocation_type == LinearAllocs::hipMallocManaged) {
    return "managed";
  }
  // Everything else (no dedicated label above) falls through to the generic text.
  return "unknown alloc type";
}
template <typename T> class LinearAllocGuard {
public:
LinearAllocGuard() = default;
LinearAllocGuard(const LinearAllocs allocation_type, const size_t size,
const unsigned int flags = 0u)
: allocation_type_{allocation_type} {
@@ -55,15 +75,36 @@ template <typename T> class LinearAllocGuard {
case LinearAllocs::hipMallocManaged:
HIP_CHECK(hipMallocManaged(reinterpret_cast<void**>(&ptr_), size, flags ? flags : 1u));
host_ptr_ = ptr_;
break;
case LinearAllocs::noAlloc:
break;
}
}
LinearAllocGuard(const LinearAllocGuard&) = delete;
LinearAllocGuard(LinearAllocGuard&&) = delete;
/**
 * Move constructor: takes over the allocation type and both pointers tracked by `o`, then resets
 * `o` to LinearAllocs::noAlloc with null pointers so that `o` no longer refers to the allocation.
 * Only an enum and two raw pointers are copied, so the operation cannot throw and is marked
 * noexcept.
 */
LinearAllocGuard(LinearAllocGuard&& o) noexcept
    : allocation_type_{o.allocation_type_}, ptr_{o.ptr_}, host_ptr_{o.host_ptr_} {
  o.allocation_type_ = LinearAllocs::noAlloc;
  o.ptr_ = nullptr;
  o.host_ptr_ = nullptr;
}
/**
 * Move assignment: takes over the allocation type and both pointers tracked by `o`, then resets
 * `o` to LinearAllocs::noAlloc with null pointers.
 *
 * @param o guard to move from; left in the empty noAlloc state.
 * @return reference to this guard.
 *
 * NOTE(review): whatever this guard tracked before the assignment is overwritten without being
 * released here — confirm callers only assign into empty (default-constructed or moved-from)
 * guards.
 */
LinearAllocGuard& operator=(LinearAllocGuard&& o) noexcept {
  // Self-move must be a no-op: otherwise the reset below would set this guard to
  // noAlloc/nullptr and lose track of its own allocation.
  if (this != &o) {
    allocation_type_ = o.allocation_type_;
    ptr_ = o.ptr_;
    host_ptr_ = o.host_ptr_;

    o.allocation_type_ = LinearAllocs::noAlloc;
    o.ptr_ = nullptr;
    o.host_ptr_ = nullptr;
  }
  // Previously missing: flowing off the end of a non-void function is undefined behavior.
  return *this;
}
~LinearAllocGuard() {
// No Catch macros, don't want to possibly throw in the destructor
switch (allocation_type_) {
case LinearAllocs::noAlloc:
break;
case LinearAllocs::malloc:
free(ptr_);
break;
@@ -85,7 +126,7 @@ template <typename T> class LinearAllocGuard {
T* host_ptr() const { return host_ptr_; }
private:
const LinearAllocs allocation_type_;
LinearAllocs allocation_type_ = LinearAllocs::noAlloc;
T* ptr_ = nullptr;
T* host_ptr_ = nullptr;
};
@@ -200,7 +241,10 @@ enum class Streams { nullstream, perThread, created, withFlags, withPriority };
class StreamGuard {
public:
StreamGuard(const Streams stream_type, unsigned int flags = hipStreamDefault, int priority = 0) : stream_type_{stream_type}, flags_{flags}, priority_{priority} {
StreamGuard() = default;
StreamGuard(const Streams stream_type, unsigned int flags = hipStreamDefault, int priority = 0)
: stream_type_{stream_type}, flags_{flags}, priority_{priority} {
switch (stream_type_) {
case Streams::nullstream:
stream_ = nullptr;
@@ -219,7 +263,28 @@ class StreamGuard {
}
StreamGuard(const StreamGuard&) = delete;
StreamGuard(StreamGuard&&) = delete;
/**
 * Move constructor: adopts the stream type, flags, priority and stream handle of `o`, then resets
 * `o` to Streams::nullstream with zeroed flags/priority and a null handle, so `o` no longer
 * refers to the moved stream.
 * Only plain values and a handle are copied, so the operation cannot throw and is marked noexcept.
 */
StreamGuard(StreamGuard&& o) noexcept
    : stream_type_{o.stream_type_}, flags_{o.flags_}, priority_{o.priority_}, stream_{o.stream_} {
  o.stream_type_ = Streams::nullstream;
  o.flags_ = 0u;
  o.priority_ = 0;
  o.stream_ = nullptr;
}
/**
 * Move assignment: adopts the stream type, flags, priority and stream handle of `o`, then resets
 * `o` to Streams::nullstream with zeroed flags/priority and a null handle.
 *
 * @param o guard to move from; left in the nullstream/nullptr state.
 * @return reference to this guard.
 *
 * NOTE(review): a stream already held by this guard is overwritten without being destroyed
 * here — confirm callers only assign into guards that do not own a stream.
 */
StreamGuard& operator=(StreamGuard&& o) noexcept {
  // Self-move must be a no-op: otherwise the reset below would null out this guard's own
  // stream handle and type.
  if (this != &o) {
    stream_type_ = o.stream_type_;
    flags_ = o.flags_;
    priority_ = o.priority_;
    stream_ = o.stream_;

    o.stream_type_ = Streams::nullstream;
    o.flags_ = 0u;
    o.priority_ = 0;
    o.stream_ = nullptr;
  }
  return *this;
}
~StreamGuard() {
if (stream_type_ == Streams::created) {
@@ -230,23 +295,23 @@ class StreamGuard {
hipStream_t stream() const { return stream_; }
private:
const Streams stream_type_;
unsigned int flags_;
int priority_;
hipStream_t stream_;
Streams stream_type_ = Streams::nullstream;
unsigned int flags_ = 0u;
int priority_ = 0;
hipStream_t stream_ = nullptr;
};
class EventsGuard {
public:
public:
EventsGuard(size_t N) : events_(N) {
for (auto &e : events_) HIP_CHECK(hipEventCreate(&e));
for (auto& e : events_) HIP_CHECK(hipEventCreate(&e));
}
EventsGuard(const EventsGuard&) = delete;
EventsGuard(EventsGuard&&) = delete;
~EventsGuard() {
for (auto &e : events_) static_cast<void>(hipEventDestroy(e));
for (auto& e : events_) static_cast<void>(hipEventDestroy(e));
}
hipEvent_t& operator[](int index) { return events_[index]; }
@@ -255,21 +320,21 @@ public:
std::vector<hipEvent_t>& event_list() { return events_; }
private:
private:
std::vector<hipEvent_t> events_;
};
class StreamsGuard {
public:
public:
StreamsGuard(size_t N) : streams_(N) {
for (auto &s : streams_) HIP_CHECK(hipStreamCreate(&s));
for (auto& s : streams_) HIP_CHECK(hipStreamCreate(&s));
}
StreamsGuard(const StreamsGuard&) = delete;
StreamsGuard(StreamsGuard&&) = delete;
~StreamsGuard() {
for (auto &s : streams_) static_cast<void>(hipStreamDestroy(s));
for (auto& s : streams_) static_cast<void>(hipStreamDestroy(s));
}
hipStream_t& operator[](int index) { return streams_[index]; }
@@ -278,6 +343,6 @@ public:
std::vector<hipStream_t>& stream_list() { return streams_; }
private:
private:
std::vector<hipStream_t> streams_;
};
+1
파일 보기
@@ -18,5 +18,6 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
add_subdirectory(stream)
add_subdirectory(event)
add_subdirectory(example)
+63
파일 보기
@@ -0,0 +1,63 @@
# Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
if(HIP_PLATFORM MATCHES "amd")
set(TEST_SRC
hipStreamWaitEvent.cc
hipStreamGetFlags.cc
hipStreamGetPriority.cc
hipExtStreamCreateWithCUMask.cc
hipExtStreamGetCUMask.cc
hipStreamAddCallback.cc
hipStreamWaitValue.cc
hipStreamWriteValue.cc
hipMallocAsync.cc
hipFreeAsync.cc
hipMemPoolCreate.cc
hipMemPoolDestroy.cc
hipMemPoolTrimTo.cc
hipMemPoolSetAttribute.cc
hipMemPoolGetAttribute.cc
hipMemPoolSetAccess.cc
hipMallocFromPoolAsync.cc
hipMemPoolExportToShareableHandle.cc
hipMemPoolImportFromShareableHandle.cc
hipMemPoolExportPointer.cc
hipMemPoolImportPointer.cc
hipStreamBasic.cc
)
else()
set(TEST_SRC
hipStreamWaitEvent.cc
hipStreamGetFlags.cc
hipStreamGetPriority.cc
hipStreamAddCallback.cc
hipStreamWaitValue.cc
hipStreamWriteValue.cc
hipMallocAsync.cc
hipFreeAsync.cc
hipStreamBasic.cc
)
endif()
hip_add_exe_to_target(NAME StreamPerformance
TEST_SRC ${TEST_SRC}
TEST_TARGET_NAME build_tests
COMPILE_OPTIONS -std=c++17)
+65
파일 보기
@@ -0,0 +1,65 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip_test_common.hh>
#include <performance_common.hh>
/**
* @addtogroup stream stream
* @{
* @ingroup PerformanceTest
* Contains performance tests for all stream management HIP APIs.
*/
// Benchmark functor for hipExtStreamCreateWithCUMask.
// Each invocation queries the properties of device 0, builds a zero-filled CU mask, creates a
// stream with that mask inside the timed section and destroys the stream afterwards.
class ExtStreamCreateWithCUMaskBenchmark : public Benchmark<ExtStreamCreateWithCUMaskBenchmark> {
public:
void operator()() {
hipDeviceProp_t props;
HIP_CHECK(hipGetDeviceProperties(&props, 0));
// NOTE(review): the mask holds one uint32_t per multiprocessor with all bits cleared — confirm
// this is the intended mask size/content for hipExtStreamCreateWithCUMask.
std::vector<uint32_t> cu_mask(props.multiProcessorCount, 0);
hipStream_t stream{};
// Only the stream creation call sits inside the TIMED_SECTION block.
TIMED_SECTION(kTimerTypeCpu) {
HIP_CHECK(hipExtStreamCreateWithCUMask(&stream, cu_mask.size(), cu_mask.data()));
}
// Cleanup is kept outside the TIMED_SECTION block.
HIP_CHECK(hipStreamDestroy(stream));
}
};
// File-local helper: instantiates the benchmark and starts it through Run()
// (presumably provided by the Benchmark base — see performance_common.hh).
static void RunBenchmark() {
ExtStreamCreateWithCUMaskBenchmark benchmark;
benchmark.Run();
}
/**
* Test Description
* ------------------------
* - Executes `hipExtStreamCreateWithCUMask`.
* Test source
* ------------------------
* - performance/stream/hipExtStreamCreateWithCUMask.cc
* Test requirements
* ------------------------
* - Platform specific (AMD)
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Performance_hipExtStreamCreateWithCUMask") {
// The benchmark takes no parameters, so a single run is started.
RunBenchmark();
}
+67
파일 보기
@@ -0,0 +1,67 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip_test_common.hh>
#include <performance_common.hh>
/**
* @addtogroup stream stream
* @{
* @ingroup PerformanceTest
*/
// Benchmark functor for hipExtStreamGetCUMask.
// Each invocation creates a stream with a zero-filled CU mask (sized from the multiprocessor
// count of device 0), reads the mask back into a second buffer of the same size inside the timed
// section, and destroys the stream afterwards.
class ExtStreamGetCUMaskBenchmark : public Benchmark<ExtStreamGetCUMaskBenchmark> {
public:
void operator()() {
hipDeviceProp_t props;
HIP_CHECK(hipGetDeviceProperties(&props, 0));
std::vector<uint32_t> cu_mask(props.multiProcessorCount, 0);
hipStream_t stream{};
// Stream creation is setup only and is kept outside the TIMED_SECTION block.
HIP_CHECK(hipExtStreamCreateWithCUMask(&stream, cu_mask.size(), cu_mask.data()));
// Destination buffer for the queried mask; same number of entries as the input mask.
std::vector<uint32_t> new_cu_mask(cu_mask.size(), 0);
TIMED_SECTION(kTimerTypeCpu) {
HIP_CHECK(hipExtStreamGetCUMask(stream, new_cu_mask.size(), new_cu_mask.data()));
}
HIP_CHECK(hipStreamDestroy(stream));
}
};
// File-local helper: instantiates the benchmark and starts it through Run()
// (presumably provided by the Benchmark base — see performance_common.hh).
static void RunBenchmark() {
ExtStreamGetCUMaskBenchmark benchmark;
benchmark.Run();
}
/**
* Test Description
* ------------------------
* - Executes `hipExtStreamGetCUMask`.
* - Creates basic mask and gets it into the new one.
* Test source
* ------------------------
* - performance/stream/hipExtStreamGetCUMask.cc
* Test requirements
* ------------------------
* - Platform specific (AMD)
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Performance_hipExtStreamGetCUMask") {
// The benchmark takes no parameters, so a single run is started.
RunBenchmark();
}
+69
파일 보기
@@ -0,0 +1,69 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip_test_common.hh>
#include <performance_common.hh>
/**
* @addtogroup stream stream
* @{
* @ingroup PerformanceTest
*/
// Benchmark functor for hipFreeAsync.
// Each invocation obtains a stream from a StreamGuard (Streams::created), allocates
// array_size floats on it with hipMallocAsync, frees them with hipFreeAsync inside the
// stream-timed section and finally synchronizes the stream.
// NOTE(review): array_size is used as an element count (the allocation is
// array_size * sizeof(float) bytes), while the test description below lists the generated values
// as allocation sizes — confirm which is intended.
class FreeAsyncBenchmark : public Benchmark<FreeAsyncBenchmark> {
public:
void operator()(const size_t array_size) {
const StreamGuard stream_guard{Streams::created};
const hipStream_t stream = stream_guard.stream();
float* dev_ptr{nullptr};
// The allocation is setup only and is kept outside the timed section.
HIP_CHECK(hipMallocAsync(reinterpret_cast<void**>(&dev_ptr), array_size * sizeof(float), stream));
TIMED_SECTION_STREAM(kTimerTypeEvent, stream) {
HIP_CHECK(hipFreeAsync(dev_ptr, stream));
}
// Wait for the queued work before stream_guard goes out of scope.
HIP_CHECK(hipStreamSynchronize(stream));
}
};
// File-local helper: instantiates the benchmark, registers the decimal value of array_size as
// its section name and starts the run with array_size as the functor argument.
static void RunBenchmark(const size_t array_size) {
FreeAsyncBenchmark benchmark;
benchmark.AddSectionName(std::to_string(array_size));
benchmark.Run(array_size);
}
/**
* Test Description
* ------------------------
* - Executes `hipFreeAsync` with created stream:
* -# Allocation size:
* - 4 KB
* - 4 MB
* - 16 MB
* Test source
* ------------------------
* - performance/stream/hipFreeAsync.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Performance_hipFreeAsync") {
// GENERATE supplies the three size values; the benchmark is run with each of them.
size_t array_size = GENERATE(4_KB, 4_MB, 16_MB);
RunBenchmark(array_size);
}
+68
파일 보기
@@ -0,0 +1,68 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip_test_common.hh>
#include <performance_common.hh>
/**
* @addtogroup stream stream
* @{
* @ingroup PerformanceTest
*/
// Benchmark functor for hipMallocAsync.
// Each invocation obtains a stream from a StreamGuard (Streams::created), allocates
// array_size floats on it with hipMallocAsync inside the stream-timed section, synchronizes the
// stream and then releases the memory with hipFree.
// NOTE(review): array_size is used as an element count (the allocation is
// array_size * sizeof(float) bytes), while the test description below lists the generated values
// as allocation sizes — confirm which is intended.
class MallocAsyncBenchmark : public Benchmark<MallocAsyncBenchmark> {
public:
void operator()(const size_t array_size) {
const StreamGuard stream_guard{Streams::created};
const hipStream_t stream = stream_guard.stream();
float* dev_ptr{nullptr};
TIMED_SECTION_STREAM(kTimerTypeEvent, stream) {
HIP_CHECK(hipMallocAsync(reinterpret_cast<void**>(&dev_ptr), array_size * sizeof(float), stream));
}
// Synchronize first so the asynchronous allocation has completed before it is freed.
HIP_CHECK(hipStreamSynchronize(stream));
HIP_CHECK(hipFree(dev_ptr));
}
};
// File-local helper: instantiates the benchmark, registers the decimal value of array_size as
// its section name and starts the run with array_size as the functor argument.
static void RunBenchmark(const size_t array_size) {
MallocAsyncBenchmark benchmark;
benchmark.AddSectionName(std::to_string(array_size));
benchmark.Run(array_size);
}
/**
* Test Description
* ------------------------
* - Executes `hipMallocAsync` with created stream:
* -# Allocation size:
* - 4 KB
* - 4 MB
* - 16 MB
* Test source
* ------------------------
* - performance/stream/hipMallocAsync.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Performance_hipMallocAsync") {
// GENERATE supplies the three size values; the benchmark is run with each of them.
size_t array_size = GENERATE(4_KB, 4_MB, 16_MB);
RunBenchmark(array_size);
}
+82
파일 보기
@@ -0,0 +1,82 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "mem_pools_performance_common.hh"
/**
* @addtogroup stream stream
* @{
* @ingroup PerformanceTest
*/
// Benchmark functor for hipMallocFromPoolAsync.
// Each invocation obtains a stream from a StreamGuard (Streams::created), creates a memory pool,
// allocates array_size floats from that pool inside the stream-timed section, and then frees the
// allocation, synchronizes the stream and destroys the pool.
// NOTE(review): array_size is used as an element count (the allocation is
// array_size * sizeof(float) bytes), while the test description below lists the generated values
// as allocation sizes — confirm which is intended.
class MallocFromPoolAsyncBenchmark : public Benchmark<MallocFromPoolAsyncBenchmark> {
public:
void operator()(const size_t array_size) {
const StreamGuard stream_guard{Streams::created};
const hipStream_t stream = stream_guard.stream();
hipMemPool_t mem_pool{nullptr};
// Pool properties come from CreateMemPoolProps (presumably declared in
// mem_pools_performance_common.hh); pool creation is kept outside the timed section.
hipMemPoolProps pool_props = CreateMemPoolProps(0, hipMemHandleTypeNone);
HIP_CHECK(hipMemPoolCreate(&mem_pool, &pool_props));
float* array_ptr{nullptr};
TIMED_SECTION_STREAM(kTimerTypeEvent, stream) {
HIP_CHECK(hipMallocFromPoolAsync(&array_ptr, array_size * sizeof(float), mem_pool, stream));
}
REQUIRE(array_ptr != nullptr);
// Teardown order: free the allocation on the stream, wait for it, then destroy the pool.
HIP_CHECK(hipFreeAsync(array_ptr, stream));
HIP_CHECK(hipStreamSynchronize(stream));
HIP_CHECK(hipMemPoolDestroy(mem_pool));
}
};
// File-local helper: instantiates the benchmark, registers the decimal value of array_size as
// its section name and starts the run with array_size as the functor argument.
static void RunBenchmark(const size_t array_size) {
MallocFromPoolAsyncBenchmark benchmark;
benchmark.AddSectionName(std::to_string(array_size));
benchmark.Run(array_size);
}
/**
* Test Description
* ------------------------
* - Executes `hipMallocFromPoolAsync`:
* -# Allocation size:
* - 4 KB
* - 4 MB
* - 16 MB
* Test source
* ------------------------
* - performance/stream/hipMallocFromPoolAsync.cc
* Test requirements
* ------------------------
* - Device supports memory pools
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Performance_hipMallocFromPoolAsync") {
// Skip the test early when AreMemPoolsSupported reports no memory pool support for device 0.
if (!AreMemPoolsSupported(0)) {
HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
"attribute. Hence skipping the testing with Pass result.\n");
return;
}
// GENERATE supplies the three size values; the benchmark is run with each of them.
size_t array_size = GENERATE(4_KB, 4_MB, 16_MB);
RunBenchmark(array_size);
}
+71
파일 보기
@@ -0,0 +1,71 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "mem_pools_performance_common.hh"
/**
* @addtogroup stream stream
* @{
* @ingroup PerformanceTest
*/
/**
 * Benchmark whose timed section contains a single `hipMemPoolCreate` call.
 *
 * The created pool is checked to be non-null and destroyed outside of the
 * timed section.
 */
class MemPoolCreateBenchmark : public Benchmark<MemPoolCreateBenchmark> {
 public:
  void operator()() {
    hipMemPoolProps props = CreateMemPoolProps(0, hipMemHandleTypeNone);
    hipMemPool_t pool{nullptr};

    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipMemPoolCreate(&pool, &props));
    }

    REQUIRE(pool != nullptr);
    HIP_CHECK(hipMemPoolDestroy(pool));
  }
};
static void RunBenchmark() {
MemPoolCreateBenchmark benchmark;
benchmark.Run();
}
/**
* @warning **MemPool APIs are not fully implemented within the current version
* of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
* Therefore, all tests related to MemPool APIs are implemented without formal
* verification and will be verified once HIP fully supports MemPool APIs.**
* Test Description
* ------------------------
* - Executes `hipMemPoolCreate`.
* Test source
* ------------------------
* - performance/stream/hipMemPoolCreate.cc
* Test requirements
* ------------------------
* - Device supports memory pools
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Performance_hipMemPoolCreate") {
  // The benchmark only runs when device 0 reports memory pool support;
  // otherwise the test is reported as skipped.
  if (AreMemPoolsSupported(0)) {
    RunBenchmark();
  } else {
    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
                           "attribute. Hence skipping the testing with Pass result.\n");
  }
}
+70
파일 보기
@@ -0,0 +1,70 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "mem_pools_performance_common.hh"
/**
* @addtogroup stream stream
* @{
* @ingroup PerformanceTest
*/
/**
 * Benchmark whose timed section contains a single `hipMemPoolDestroy` call.
 *
 * The pool that gets destroyed is created outside of the timed section.
 */
class MemPoolDestroyBenchmark : public Benchmark<MemPoolDestroyBenchmark> {
 public:
  void operator()() {
    hipMemPoolProps props = CreateMemPoolProps(0, hipMemHandleTypeNone);
    hipMemPool_t pool{nullptr};
    HIP_CHECK(hipMemPoolCreate(&pool, &props));

    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipMemPoolDestroy(pool));
    }
  }
};
static void RunBenchmark() {
MemPoolDestroyBenchmark benchmark;
benchmark.Run();
}
/**
* @warning **MemPool APIs are not fully implemented within the current version
* of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
* Therefore, all tests related to MemPool APIs are implemented without formal
* verification and will be verified once HIP fully supports MemPool APIs.**
* Test Description
* ------------------------
* - Creates new mem pool.
* - Executes `hipMemPoolDestroy`.
* Test source
* ------------------------
* - performance/stream/hipMemPoolDestroy.cc
* Test requirements
* ------------------------
* - Device supports memory pools
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Performance_hipMemPoolDestroy") {
  // The benchmark only runs when device 0 reports memory pool support;
  // otherwise the test is reported as skipped.
  if (AreMemPoolsSupported(0)) {
    RunBenchmark();
  } else {
    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
                           "attribute. Hence skipping the testing with Pass result.\n");
  }
}
+84
파일 보기
@@ -0,0 +1,84 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "mem_pools_performance_common.hh"
/**
* @addtogroup stream stream
* @{
* @ingroup PerformanceTest
*/
/**
 * Benchmark whose timed section contains a single `hipMemPoolExportPointer` call.
 *
 * Pool creation, the allocation from the pool on the null stream and all
 * cleanup happen outside of the timed section.
 */
class MemPoolExportPointerBenchmark : public Benchmark<MemPoolExportPointerBenchmark> {
 public:
  /**
   * @param array_size Number of `float` elements allocated from the pool.
   */
  void operator()(const size_t array_size) {
    hipMemPoolProps pool_props = CreateMemPoolProps(0, kHandleType);
    hipMemPool_t pool{nullptr};
    HIP_CHECK(hipMemPoolCreate(&pool, &pool_props));

    float* allocation{nullptr};
    const size_t byte_count = array_size * sizeof(float);
    HIP_CHECK(hipMallocFromPoolAsync(&allocation, byte_count, pool, nullptr));
    // Wait on the null stream so the allocation request has finished before timing.
    HIP_CHECK(hipStreamSynchronize(nullptr));

    hipMemPoolPtrExportData export_data;
    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipMemPoolExportPointer(&export_data, allocation));
    }

    HIP_CHECK(hipFreeAsync(allocation, nullptr));
    HIP_CHECK(hipMemPoolDestroy(pool));
  }
};
static void RunBenchmark(const size_t array_size) {
MemPoolExportPointerBenchmark benchmark;
benchmark.AddSectionName(std::to_string(array_size));
benchmark.Run(array_size);
}
/**
* @warning **MemPool APIs are not fully implemented within the current version
* of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
* Therefore, all tests related to MemPool APIs are implemented without formal
* verification and will be verified once HIP fully supports MemPool APIs.**
* Test Description
* ------------------------
* - Executes `hipMemPoolExportPointer`:
* -# Allocation size:
* - 4 KB
* - 4 MB
* - 16 MB
* - Uses the same process for import and export operations.
* Test source
* ------------------------
* - performance/stream/hipMemPoolExportPointer.cc
* Test requirements
* ------------------------
* - Device supports memory pools
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Performance_hipMemPoolExportPointer") {
  // The benchmark only runs when device 0 reports memory pool support;
  // otherwise the test is reported as skipped.
  if (AreMemPoolsSupported(0)) {
    const size_t array_size = GENERATE(4_KB, 4_MB, 16_MB);
    RunBenchmark(array_size);
  } else {
    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
                           "attribute. Hence skipping the testing with Pass result.\n");
  }
}
@@ -0,0 +1,74 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "mem_pools_performance_common.hh"
/**
* @addtogroup stream stream
* @{
* @ingroup PerformanceTest
*/
/**
 * Benchmark whose timed section contains a single
 * `hipMemPoolExportToShareableHandle` call.
 *
 * Pool creation and destruction happen outside of the timed section.
 */
class MemPoolExportToShareableHandleBenchmark
    : public Benchmark<MemPoolExportToShareableHandleBenchmark> {
 public:
  void operator()() {
    hipMemPoolProps pool_props = CreateMemPoolProps(0, kHandleType);
    hipMemPool_t pool{nullptr};
    HIP_CHECK(hipMemPoolCreate(&pool, &pool_props));

    // NOTE(review): the exported handle is not released anywhere in this
    // function - confirm whether it needs to be closed after the export.
    int exported_handle;
    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipMemPoolExportToShareableHandle(&exported_handle, pool, kHandleType, 0));
    }

    HIP_CHECK(hipMemPoolDestroy(pool));
  }
};
static void RunBenchmark() {
MemPoolExportToShareableHandleBenchmark benchmark;
benchmark.Run();
}
/**
* @warning **MemPool APIs are not fully implemented within the current version
* of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
* Therefore, all tests related to MemPool APIs are implemented without formal
* verification and will be verified once HIP fully supports MemPool APIs.**
* Test Description
* ------------------------
* - Executes `hipMemPoolExportToShareableHandle`.
* - Uses the same process for import and export operations.
* Test source
* ------------------------
* - performance/stream/hipMemPoolExportToShareableHandle.cc
* Test requirements
* ------------------------
* - Device supports memory pools
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Performance_hipMemPoolExportToShareableHandle") {
  // The benchmark only runs when device 0 reports memory pool support;
  // otherwise the test is reported as skipped.
  if (AreMemPoolsSupported(0)) {
    RunBenchmark();
  } else {
    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
                           "attribute. Hence skipping the testing with Pass result.\n");
  }
}
+76
파일 보기
@@ -0,0 +1,76 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "mem_pools_performance_common.hh"
/**
* @addtogroup stream stream
* @{
* @ingroup PerformanceTest
*/
/**
 * Benchmark whose timed section contains a single `hipMemPoolGetAccess` call
 * that queries the access flags of a freshly created pool for device 0.
 *
 * Pool creation and destruction happen outside of the timed section.
 */
class MemPoolGetAccessBenchmark : public Benchmark<MemPoolGetAccessBenchmark> {
 public:
  void operator()() {
    hipMemPoolProps props = CreateMemPoolProps(0, hipMemHandleTypeNone);
    hipMemPool_t pool{nullptr};
    HIP_CHECK(hipMemPoolCreate(&pool, &props));

    // Location being queried: device with ordinal 0.
    hipMemLocation queried_location{};
    queried_location.type = hipMemLocationTypeDevice;
    queried_location.id = 0;

    hipMemAccessFlags access_flags = hipMemAccessFlagsProtNone;
    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipMemPoolGetAccess(&access_flags, pool, queried_location));
    }

    HIP_CHECK(hipMemPoolDestroy(pool));
  }
};
static void RunBenchmark() {
MemPoolGetAccessBenchmark benchmark;
benchmark.Run();
}
/**
* @warning **MemPool APIs are not fully implemented within the current version
* of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
* Therefore, all tests related to MemPool APIs are implemented without formal
* verification and will be verified once HIP fully supports MemPool APIs.**
* Test Description
* ------------------------
* - Executes `hipMemPoolGetAccess`.
* Test source
* ------------------------
* - performance/stream/hipMemPoolGetAccess.cc
* Test requirements
* ------------------------
* - Device supports memory pools
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Performance_hipMemPoolGetAccess") {
  // The benchmark only runs when device 0 reports memory pool support;
  // otherwise the test is reported as skipped.
  if (AreMemPoolsSupported(0)) {
    RunBenchmark();
  } else {
    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
                           "attribute. Hence skipping the testing with Pass result.\n");
  }
}
+83
파일 보기
@@ -0,0 +1,83 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "mem_pools_performance_common.hh"
/**
* @addtogroup stream stream
* @{
* @ingroup PerformanceTest
*/
/**
 * Benchmark whose timed section contains a single `hipMemPoolGetAttribute` call.
 *
 * Pool creation and destruction happen outside of the timed section.
 */
class MemPoolGetAttributeBenchmark : public Benchmark<MemPoolGetAttributeBenchmark> {
 public:
  /**
   * @param attribute Memory pool attribute to query.
   */
  void operator()(const hipMemPoolAttr attribute) {
    hipMemPoolProps props = CreateMemPoolProps(0, hipMemHandleTypeNone);
    hipMemPool_t pool{nullptr};
    HIP_CHECK(hipMemPoolCreate(&pool, &props));

    // A 64-bit, zero-initialized buffer receives the attribute value.
    uint64_t attribute_value{0};
    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipMemPoolGetAttribute(pool, attribute, &attribute_value));
    }

    HIP_CHECK(hipMemPoolDestroy(pool));
  }
};
static void RunBenchmark(const hipMemPoolAttr attribute) {
MemPoolGetAttributeBenchmark benchmark;
benchmark.AddSectionName(GetMemPoolAttrSectionName(attribute));
benchmark.Run(attribute);
}
/**
* @warning **MemPool APIs are not fully implemented within the current version
* of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
* Therefore, all tests related to MemPool APIs are implemented without formal
* verification and will be verified once HIP fully supports MemPool APIs.**
* Test Description
* ------------------------
* - Executes `hipMemPoolGetAttribute`:
* -# Supported attributes:
* - `hipMemPoolAttrReleaseThreshold`
* - `hipMemPoolReuseFollowEventDependencies`
* - `hipMemPoolReuseAllowOpportunistic`
* - `hipMemPoolReuseAllowInternalDependencies`
* Test source
* ------------------------
* - performance/stream/hipMemPoolGetAttribute.cc
* Test requirements
* ------------------------
* - Device supports memory pools
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Performance_hipMemPoolGetAttribute") {
  // The benchmark only runs when device 0 reports memory pool support;
  // otherwise the test is reported as skipped.
  if (AreMemPoolsSupported(0)) {
    const hipMemPoolAttr attribute = GENERATE(hipMemPoolAttrReleaseThreshold,
                                              hipMemPoolReuseFollowEventDependencies,
                                              hipMemPoolReuseAllowOpportunistic,
                                              hipMemPoolReuseAllowInternalDependencies);
    RunBenchmark(attribute);
  } else {
    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
                           "attribute. Hence skipping the testing with Pass result.\n");
  }
}
@@ -0,0 +1,75 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "mem_pools_performance_common.hh"
/**
* @addtogroup stream stream
* @{
* @ingroup PerformanceTest
*/
/**
 * Benchmark whose timed section contains a single
 * `hipMemPoolImportFromShareableHandle` call.
 *
 * A pool is created and exported to a shareable handle outside of the timed
 * section; the handle is then imported within the same process.
 */
class MemPoolImportFromShareableHandleBenchmark
    : public Benchmark<MemPoolImportFromShareableHandleBenchmark> {
 public:
  void operator()() {
    hipMemPool_t mem_pool{nullptr};
    // The imported pool gets its own handle. Writing the import result into
    // `mem_pool` would overwrite the only reference to the pool created below,
    // so that pool could never be destroyed.
    hipMemPool_t imported_pool{nullptr};
    int share_handle{};
    hipMemPoolProps props = CreateMemPoolProps(0, kHandleType);
    HIP_CHECK(hipMemPoolCreate(&mem_pool, &props));
    HIP_CHECK(hipMemPoolExportToShareableHandle(&share_handle, mem_pool, kHandleType, 0));

    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(
          hipMemPoolImportFromShareableHandle(&imported_pool, &share_handle, kHandleType, 0));
    }

    // Release both the imported pool and the originally created pool.
    HIP_CHECK(hipMemPoolDestroy(imported_pool));
    HIP_CHECK(hipMemPoolDestroy(mem_pool));
  }
};
static void RunBenchmark() {
MemPoolImportFromShareableHandleBenchmark benchmark;
benchmark.Run();
}
/**
* @warning **MemPool APIs are not fully implemented within the current version
* of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
* Therefore, all tests related to MemPool APIs are implemented without formal
* verification and will be verified once HIP fully supports MemPool APIs.**
* Test Description
* ------------------------
* - Executes `hipMemPoolImportFromShareableHandle`.
* - Uses the same process for import and export operations.
* Test source
* ------------------------
* - performance/stream/hipMemPoolImportFromShareableHandle.cc
* Test requirements
* ------------------------
* - Device supports memory pools
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Performance_hipMemPoolImportFromShareableHandle") {
  // The benchmark only runs when device 0 reports memory pool support;
  // otherwise the test is reported as skipped.
  if (AreMemPoolsSupported(0)) {
    RunBenchmark();
  } else {
    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
                           "attribute. Hence skipping the testing with Pass result.\n");
  }
}
+87
파일 보기
@@ -0,0 +1,87 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "mem_pools_performance_common.hh"
/**
* @addtogroup stream stream
* @{
* @ingroup PerformanceTest
*/
/**
 * Benchmark whose timed section contains a single `hipMemPoolImportPointer` call.
 *
 * Pool creation, the allocation from the pool, the export of the pointer and
 * all cleanup happen outside of the timed section.
 */
class MemPoolImportPointerBenchmark : public Benchmark<MemPoolImportPointerBenchmark> {
 public:
  /**
   * @param array_size Number of `float` elements allocated from the pool.
   */
  void operator()(const size_t array_size) {
    float* device_ptr{nullptr};
    float* device_ptr_import{nullptr};
    hipMemPool_t mem_pool{nullptr};
    hipMemPoolPtrExportData exp_data;
    hipMemPoolProps props = CreateMemPoolProps(0, kHandleType);
    HIP_CHECK(hipMemPoolCreate(&mem_pool, &props));
    HIP_CHECK(hipMallocFromPoolAsync(&device_ptr, array_size * sizeof(float), mem_pool, nullptr));
    // Wait on the null stream so the allocation request has finished before exporting.
    HIP_CHECK(hipStreamSynchronize(nullptr));
    HIP_CHECK(hipMemPoolExportPointer(&exp_data, device_ptr));

    TIMED_SECTION(kTimerTypeCpu) {
      // Pass the ADDRESS of the output pointer. Casting the (null) pointer
      // value itself to void** would hand the API a null output parameter.
      HIP_CHECK(hipMemPoolImportPointer(reinterpret_cast<void**>(&device_ptr_import), mem_pool,
                                        &exp_data));
    }

    HIP_CHECK(hipFree(device_ptr));
    // NOTE(review): export and import use the same pool in the same process -
    // confirm that the imported pointer is distinct from `device_ptr`.
    HIP_CHECK(hipFree(device_ptr_import));
    HIP_CHECK(hipMemPoolDestroy(mem_pool));
  }
};
static void RunBenchmark(const size_t array_size) {
MemPoolImportPointerBenchmark benchmark;
benchmark.AddSectionName(std::to_string(array_size));
benchmark.Run(array_size);
}
/**
* @warning **MemPool APIs are not fully implemented within the current version
* of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
* Therefore, all tests related to MemPool APIs are implemented without formal
* verification and will be verified once HIP fully supports MemPool APIs.**
* Test Description
* ------------------------
* - Executes `hipMemPoolImportPointer`:
* -# Allocation size:
* - 4 KB
* - 4 MB
* - 16 MB
* - Uses the same process for import and export operations.
* Test source
* ------------------------
* - performance/stream/hipMemPoolImportPointer.cc
* Test requirements
* ------------------------
* - Device supports memory pools
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Performance_hipMemPoolImportPointer") {
  // The benchmark only runs when device 0 reports memory pool support;
  // otherwise the test is reported as skipped.
  if (AreMemPoolsSupported(0)) {
    const size_t array_size = GENERATE(4_KB, 4_MB, 16_MB);
    RunBenchmark(array_size);
  } else {
    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
                           "attribute. Hence skipping the testing with Pass result.\n");
  }
}
+79
파일 보기
@@ -0,0 +1,79 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "mem_pools_performance_common.hh"
/**
* @addtogroup stream stream
* @{
* @ingroup PerformanceTest
*/
/**
 * Benchmark whose timed section contains a single `hipMemPoolSetAccess` call
 * that applies one read/write access descriptor for device 0.
 *
 * Pool creation and destruction happen outside of the timed section.
 */
class MemPoolSetAccessBenchmark : public Benchmark<MemPoolSetAccessBenchmark> {
 public:
  void operator()() {
    hipMemPoolProps props = CreateMemPoolProps(0, hipMemHandleTypeNone);
    hipMemPool_t pool{nullptr};
    HIP_CHECK(hipMemPoolCreate(&pool, &props));

    // Single descriptor: read/write access for the device with ordinal 0.
    hipMemAccessDesc access_desc{};
    access_desc.location.type = hipMemLocationTypeDevice;
    access_desc.location.id = 0;
    access_desc.flags = hipMemAccessFlagsProtReadWrite;

    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipMemPoolSetAccess(pool, &access_desc, 1));
    }

    HIP_CHECK(hipMemPoolDestroy(pool));
  }
};
static void RunBenchmark() {
MemPoolSetAccessBenchmark benchmark;
benchmark.Run();
}
/**
* @warning **MemPool APIs are not fully implemented within the current version
* of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
* Therefore, all tests related to MemPool APIs are implemented without formal
* verification and will be verified once HIP fully supports MemPool APIs.**
* Test Description
* ------------------------
* - Executes `hipMemPoolSetAccess` with `hipMemAccessFlagsProtReadWrite`.
* Test source
* ------------------------
* - performance/stream/hipMemPoolSetAccess.cc
* Test requirements
* ------------------------
* - Device supports memory pools
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Performance_hipMemPoolSetAccess") {
  // The benchmark only runs when device 0 reports memory pool support;
  // otherwise the test is reported as skipped.
  if (AreMemPoolsSupported(0)) {
    RunBenchmark();
  } else {
    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
                           "attribute. Hence skipping the testing with Pass result.\n");
  }
}
+83
파일 보기
@@ -0,0 +1,83 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "mem_pools_performance_common.hh"
/**
* @addtogroup stream stream
* @{
* @ingroup PerformanceTest
*/
/**
 * Benchmark whose timed section contains a single `hipMemPoolSetAttribute` call.
 *
 * Pool creation and destruction happen outside of the timed section.
 */
class MemPoolSetAttributeBenchmark : public Benchmark<MemPoolSetAttributeBenchmark> {
 public:
  /**
   * @param attribute Memory pool attribute to set (always set to zero).
   */
  void operator()(const hipMemPoolAttr attribute) {
    hipMemPool_t mem_pool{nullptr};
    hipMemPoolProps pool_props = CreateMemPoolProps(0, hipMemHandleTypeNone);
    HIP_CHECK(hipMemPoolCreate(&mem_pool, &pool_props));

    // `hipMemPoolAttrReleaseThreshold` is a 64-bit attribute, so the value
    // buffer must be at least 8 bytes wide; a 4-byte `int` would be read out
    // of bounds. An all-zero 64-bit buffer also reads as 0 when the API
    // interprets it as an `int` for the reuse-policy attributes.
    uint64_t value{0};
    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipMemPoolSetAttribute(mem_pool, attribute, &value));
    }

    HIP_CHECK(hipMemPoolDestroy(mem_pool));
  }
};
static void RunBenchmark(const hipMemPoolAttr attribute) {
MemPoolSetAttributeBenchmark benchmark;
benchmark.AddSectionName(GetMemPoolAttrSectionName(attribute));
benchmark.Run(attribute);
}
/**
* @warning **MemPool APIs are not fully implemented within the current version
* of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
* Therefore, all tests related to MemPool APIs are implemented without formal
* verification and will be verified once HIP fully supports MemPool APIs.**
* Test Description
* ------------------------
* - Executes `hipMemPoolSetAttribute`:
* -# Supported attributes:
* - `hipMemPoolAttrReleaseThreshold`
* - `hipMemPoolReuseFollowEventDependencies`
* - `hipMemPoolReuseAllowOpportunistic`
* - `hipMemPoolReuseAllowInternalDependencies`
* Test source
* ------------------------
* - performance/stream/hipMemPoolSetAttribute.cc
* Test requirements
* ------------------------
* - Device supports memory pools
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Performance_hipMemPoolSetAttribute") {
  // The benchmark only runs when device 0 reports memory pool support;
  // otherwise the test is reported as skipped.
  if (AreMemPoolsSupported(0)) {
    const hipMemPoolAttr attribute = GENERATE(hipMemPoolAttrReleaseThreshold,
                                              hipMemPoolReuseFollowEventDependencies,
                                              hipMemPoolReuseAllowOpportunistic,
                                              hipMemPoolReuseAllowInternalDependencies);
    RunBenchmark(attribute);
  } else {
    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
                           "attribute. Hence skipping the testing with Pass result.\n");
  }
}
+77
파일 보기
@@ -0,0 +1,77 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "mem_pools_performance_common.hh"
/**
* @addtogroup stream stream
* @{
* @ingroup PerformanceTest
*/
/**
 * Benchmark whose timed section contains a single `hipMemPoolTrimTo` call on a
 * freshly created pool.
 *
 * Pool creation and destruction happen outside of the timed section.
 */
class MemPoolTrimToBenchmark : public Benchmark<MemPoolTrimToBenchmark> {
 public:
  /**
   * @param min_bytes_to_hold Value forwarded to `hipMemPoolTrimTo`.
   */
  void operator()(const size_t min_bytes_to_hold) {
    hipMemPoolProps props = CreateMemPoolProps(0, hipMemHandleTypeNone);
    hipMemPool_t pool{nullptr};
    HIP_CHECK(hipMemPoolCreate(&pool, &props));

    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipMemPoolTrimTo(pool, min_bytes_to_hold));
    }

    HIP_CHECK(hipMemPoolDestroy(pool));
  }
};
static void RunBenchmark(const size_t min_bytes_to_hold) {
MemPoolTrimToBenchmark benchmark;
benchmark.AddSectionName(std::to_string(min_bytes_to_hold));
benchmark.Run(min_bytes_to_hold);
}
/**
* @warning **MemPool APIs are not fully implemented within the current version
* of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
* Therefore, all tests related to MemPool APIs are implemented without formal
* verification and will be verified once HIP fully supports MemPool APIs.**
* Test Description
* ------------------------
* - Executes `hipMemPoolTrimTo`:
* -# Minimum bytes to hold:
* - 4 KB
* - 4 MB
* - 16 MB
* Test source
* ------------------------
* - performance/stream/hipMemPoolTrimTo.cc
* Test requirements
* ------------------------
* - Device supports memory pools
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Performance_hipMemPoolTrimTo") {
  // The benchmark only runs when device 0 reports memory pool support;
  // otherwise the test is reported as skipped.
  if (AreMemPoolsSupported(0)) {
    const size_t min_bytes_to_hold = GENERATE(4_KB, 4_MB, 16_MB);
    RunBenchmark(min_bytes_to_hold);
  } else {
    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
                           "attribute. Hence skipping the testing with Pass result.\n");
  }
}
+61
파일 보기
@@ -0,0 +1,61 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip_test_common.hh>
#include <performance_common.hh>
/**
* @addtogroup stream stream
* @{
* @ingroup PerformanceTest
*/
/** No-op stream callback handed to `hipStreamAddCallback`; none of its arguments are used. */
void Callback(hipStream_t stream, hipError_t status, void* user_data) {
  // Intentionally empty.
}
/**
 * Benchmark functor for `hipStreamAddCallback`.
 *
 * Registers the empty `Callback` on a stream obtained from a `StreamGuard` constructed with
 * `Streams::created`; only the registration call is inside the timed section.
 */
class StreamAddCallbackBenchmark : public Benchmark<StreamAddCallbackBenchmark> {
 public:
  void operator()() {
    const StreamGuard stream_guard{Streams::created};
    const hipStream_t stream = stream_guard.stream();

    TIMED_SECTION(kTimerTypeCpu) { HIP_CHECK(hipStreamAddCallback(stream, Callback, nullptr, 0)); }
  }
};
static void RunBenchmark() {
StreamAddCallbackBenchmark benchmark;
benchmark.Run();
}
/**
* Test Description
* ------------------------
* - Executes `hipStreamAddCallback` on the created stream.
* Test source
* ------------------------
* - performance/stream/hipStreamAddCallback.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Performance_hipStreamAddCallback") {
  // Single configuration: the benchmark takes no parameters.
  RunBenchmark();
}
+269
파일 보기
@@ -0,0 +1,269 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip_test_common.hh>
#include <performance_common.hh>
#include <resource_guards.hh>
/**
* @addtogroup stream stream
* @{
* @ingroup PerformanceTest
* Contains performance tests for all hipStream related APIs
*/
/** Benchmark functor that times one `hipDeviceGetStreamPriorityRange` call. */
class HipDeviceGetStreamPriorityRangeBenchmark
    : public Benchmark<HipDeviceGetStreamPriorityRangeBenchmark> {
 public:
  void operator()() {
    // Output slots for the query; their values are not inspected afterwards.
    int priority_min;
    int priority_max;

    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipDeviceGetStreamPriorityRange(&priority_min, &priority_max));
    }
  }
};
/**
 * Benchmark functor that times one `hipStreamQuery` call on a freshly created stream.
 *
 * When `perform_work` is true, an asynchronous allocation is enqueued on the stream before the
 * query and released (followed by a stream synchronize) after it.
 */
class HipStreamQueryBenchmark : public Benchmark<HipStreamQueryBenchmark> {
 public:
  /// @param perform_work Whether to enqueue an async allocation on the stream before querying.
  void operator()(bool perform_work) {
    hipStream_t stream = nullptr;
    HIP_CHECK(hipStreamCreate(&stream));

    void* dptr = nullptr;
    if (perform_work) {
      HIP_CHECK(hipMallocAsync(&dptr, 2048 * 4, stream));
    }

    // The status is captured inside the timed section and validated outside of it so the
    // check itself does not contribute to the measurement.
    hipError_t error = hipSuccess;
    TIMED_SECTION(kTimerTypeCpu) { error = hipStreamQuery(stream); }

    // A query may legitimately report that work is still pending; anything else is a failure
    // that was previously ignored.
    REQUIRE((error == hipSuccess || error == hipErrorNotReady));

    if (perform_work) {
      HIP_CHECK(hipFreeAsync(dptr, stream));
      HIP_CHECK(hipStreamSynchronize(stream));
    }
    HIP_CHECK(hipStreamDestroy(stream));
  }
};
/** Benchmark functor that times one `hipStreamSynchronize` call on a freshly created stream. */
class HipStreamSynchronizeBenchmark : public Benchmark<HipStreamSynchronizeBenchmark> {
 public:
  void operator()() {
    hipStream_t stream = nullptr;
    HIP_CHECK(hipStreamCreate(&stream));

    // Capture the status inside the timed section, verify it afterwards: the original code
    // stored the result and never looked at it, hiding synchronization failures.
    hipError_t error = hipSuccess;
    TIMED_SECTION(kTimerTypeCpu) { error = hipStreamSynchronize(stream); }
    HIP_CHECK(error);

    HIP_CHECK(hipStreamDestroy(stream));
  }
};
/**
 * Benchmark functor for `hipStreamDestroy`.
 *
 * The stream is created outside the timed section; only its destruction is measured.
 */
class HipStreamDestroyBenchmark : public Benchmark<HipStreamDestroyBenchmark> {
 public:
  void operator()() {
    hipStream_t stream;
    HIP_CHECK(hipStreamCreate(&stream));

    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipStreamDestroy(stream));
    }
  }
};
/**
 * Benchmark functor for `hipStreamCreate`.
 *
 * Only the creation is measured; the stream is destroyed after the timed section.
 */
class HipStreamCreateBenchmark : public Benchmark<HipStreamCreateBenchmark> {
 public:
  void operator()() {
    hipStream_t stream;

    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipStreamCreate(&stream));
    }

    HIP_CHECK(hipStreamDestroy(stream));
  }
};
/**
 * Benchmark functor for `hipStreamCreateWithPriority`.
 *
 * The priority passed to the call is the integer midpoint of the range reported by
 * `hipDeviceGetStreamPriorityRange`; the range query and the stream destruction are untimed.
 */
class HipStreamCreateWithPriorityBenchmark
    : public Benchmark<HipStreamCreateWithPriorityBenchmark> {
 public:
  /// @param flag Stream creation flag forwarded to `hipStreamCreateWithPriority`.
  void operator()(unsigned int flag) {
    int priority_min;
    int priority_max;
    HIP_CHECK(hipDeviceGetStreamPriorityRange(&priority_min, &priority_max));
    const int priority_mid = (priority_max + priority_min) / 2;

    hipStream_t stream;
    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipStreamCreateWithPriority(&stream, flag, priority_mid));
    }

    HIP_CHECK(hipStreamDestroy(stream));
  }
};
/**
 * Maps a stream creation flag to the label used for benchmark section names.
 *
 * @param flag Stream creation flag value.
 * @return "hipStreamDefault" or "hipStreamNonBlocking" for those exact values, otherwise
 *         "flag combination".
 */
static std::string GetStreamCreateFlagName(unsigned flag) {
  if (flag == hipStreamDefault) {
    return "hipStreamDefault";
  }
  if (flag == hipStreamNonBlocking) {
    return "hipStreamNonBlocking";
  }
  return "flag combination";
}
/**
 * Benchmark functor for `hipStreamCreateWithFlags`.
 *
 * Only the creation is measured; the stream is destroyed after the timed section.
 */
class HipStreamCreateWithFlagsBenchmark : public Benchmark<HipStreamCreateWithFlagsBenchmark> {
 public:
  /// @param flag Stream creation flag forwarded to `hipStreamCreateWithFlags`.
  void operator()(unsigned int flag) {
    hipStream_t stream;

    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipStreamCreateWithFlags(&stream, flag));
    }

    HIP_CHECK(hipStreamDestroy(stream));
  }
};
/**
* Test Description
* ------------------------
* - Executes `hipStreamCreate`:
* Test source
* ------------------------
* - performance/stream/hipStreamBasic.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Performance_hipStreamCreate") {
  // Single configuration: plain stream creation has no parameters to vary.
  HipStreamCreateBenchmark create_benchmark;
  create_benchmark.Run();
}
static void RunBenchmark(unsigned flag) {
HipStreamCreateWithFlagsBenchmark benchmark;
benchmark.AddSectionName(GetStreamCreateFlagName(flag));
benchmark.Run(flag);
}
static void RunBenchmarkWithPriority(unsigned flag) {
HipStreamCreateWithPriorityBenchmark benchmark;
benchmark.AddSectionName(GetStreamCreateFlagName(flag));
benchmark.Run(flag);
}
/**
* Test Description
* ------------------------
* - Executes `hipStreamCreateWithFlags` with all flags:
* -# Flags
* - hipStreamDefault
* - hipStreamNonBlocking
* Test source
* ------------------------
* - performance/stream/hipStreamBasic.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Performance_hipStreamCreateWithFlags") {
  // The test body is executed once per generated creation flag.
  const auto stream_flag = GENERATE(hipStreamDefault, hipStreamNonBlocking);
  RunBenchmark(stream_flag);
}
/**
* Test Description
* ------------------------
* - Executes `hipStreamCreateWithPriority` with all flags:
* -# Flags
* - hipStreamDefault
* - hipStreamNonBlocking
* Test source
* ------------------------
* - performance/stream/hipStreamBasic.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Performance_hipStreamCreateWithPriority") {
  // The test body is executed once per generated creation flag.
  const auto stream_flag = GENERATE(hipStreamDefault, hipStreamNonBlocking);
  RunBenchmarkWithPriority(stream_flag);
}
/**
* Test Description
* ------------------------
* - Executes `hipStreamDestroy`:
* Test source
* ------------------------
* - performance/stream/hipStreamBasic.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Performance_hipStreamDestroy") {
  // Single configuration: the benchmark takes no parameters.
  HipStreamDestroyBenchmark destroy_benchmark;
  destroy_benchmark.Run();
}
/**
* Test Description
* ------------------------
* - Executes `hipDeviceGetStreamPriorityRange`:
* Test source
* ------------------------
* - performance/stream/hipStreamBasic.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Performance_hipDeviceGetStreamPriorityRange") {
  // Single configuration: the benchmark takes no parameters.
  HipDeviceGetStreamPriorityRangeBenchmark range_benchmark;
  range_benchmark.Run();
}
/**
* Test Description
* ------------------------
* - Executes `hipStreamQuery`:
* Test source
* ------------------------
* - performance/stream/hipStreamBasic.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Performance_hipStreamQuery") {
  // Run once with work enqueued on the stream and once with an idle stream.
  const auto perform_work = GENERATE(true, false);

  HipStreamQueryBenchmark query_benchmark;
  query_benchmark.AddSectionName(perform_work ? "stream with work" : "stream without work");
  query_benchmark.Run(perform_work);
}
/**
* Test Description
* ------------------------
* - Executes `hipStreamSynchronize`:
* Test source
* ------------------------
* - performance/stream/hipStreamBasic.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Performance_hipStreamSynchronize") {
  // Single configuration: the benchmark takes no parameters.
  HipStreamSynchronizeBenchmark sync_benchmark;
  sync_benchmark.Run();
}
+75
파일 보기
@@ -0,0 +1,75 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip_test_common.hh>
#include <performance_common.hh>
/**
* @addtogroup stream stream
* @{
* @ingroup PerformanceTest
*/
/**
 * Benchmark functor for `hipStreamGetFlags`.
 *
 * A stream is created with the requested flag, one `hipStreamGetFlags` call is measured, and
 * the stream is destroyed afterwards. The returned flags are not compared to the requested
 * value.
 */
class StreamGetFlagsBenchmark : public Benchmark<StreamGetFlagsBenchmark> {
 public:
  /// @param expected_flag Flag the stream is created with.
  void operator()(unsigned int expected_flag) {
    unsigned int returned_flags{};
    hipStream_t stream;
    HIP_CHECK(hipStreamCreateWithFlags(&stream, expected_flag));

    TIMED_SECTION(kTimerTypeCpu) {
      // Terminated with ';' like every other HIP_CHECK use, so the statement does not depend
      // on how the macro happens to expand.
      HIP_CHECK(hipStreamGetFlags(stream, &returned_flags));
    }

    HIP_CHECK(hipStreamDestroy(stream));
  }
};
static void RunBenchmark(unsigned int expected_flag) {
StreamGetFlagsBenchmark benchmark;
switch (expected_flag) {
case hipStreamDefault:
benchmark.AddSectionName("hipStreamDefault");
break;
case hipStreamNonBlocking:
benchmark.AddSectionName("hipStreamNonBlocking");
break;
default:
benchmark.AddSectionName("unknown flag type");
}
benchmark.Run(expected_flag);
}
/**
* Test Description
* ------------------------
* - Executes `hipStreamGetFlags`:
* -# Flags:
* - `hipStreamDefault`
* - `hipStreamNonBlocking`
* Test source
* ------------------------
* - performance/stream/hipStreamGetFlags.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Performance_hipStreamGetFlags") {
  // The test body is executed once per generated creation flag.
  const unsigned int expected_flag = GENERATE(hipStreamDefault, hipStreamNonBlocking);
  RunBenchmark(expected_flag);
}
+74
파일 보기
@@ -0,0 +1,74 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip_test_common.hh>
#include <performance_common.hh>
/**
* @addtogroup stream stream
* @{
* @ingroup PerformanceTest
*/
/**
 * Benchmark functor for `hipStreamGetPriority`.
 *
 * The stream comes from a `StreamGuard` built for the requested stream type; only the priority
 * query is inside the timed section.
 */
class StreamGetPriorityBenchmark : public Benchmark<StreamGetPriorityBenchmark> {
 public:
  /// @param stream_type Kind of stream the `StreamGuard` is constructed with.
  void operator()(Streams stream_type) {
    const StreamGuard stream_guard{stream_type};
    const hipStream_t stream = stream_guard.stream();
    int priority{};

    TIMED_SECTION(kTimerTypeCpu) { HIP_CHECK(hipStreamGetPriority(stream, &priority)); }
  }
};
/**
 * Runs the get-priority benchmark for one stream type.
 *
 * Section name: "null stream" for `Streams::nullstream`, "created" for `Streams::created`,
 * and "per thread stream" for every other value.
 */
static void RunBenchmark(Streams stream_type) {
  const char* section_name = "per thread stream";
  if (stream_type == Streams::nullstream) {
    section_name = "null stream";
  } else if (stream_type == Streams::created) {
    section_name = "created";
  }

  StreamGetPriorityBenchmark priority_benchmark;
  priority_benchmark.AddSectionName(section_name);
  priority_benchmark.Run(stream_type);
}
/**
* Test Description
* ------------------------
* - Executes `hipStreamGetPriority`:
* -# Stream types:
* - `null`
* - created
* Test source
* ------------------------
* - performance/stream/hipStreamGetPriority.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Performance_hipStreamGetPriority") {
  // The test body is executed once per generated stream type.
  const Streams stream_type = GENERATE(Streams::nullstream, Streams::created);
  RunBenchmark(stream_type);
}
+80
파일 보기
@@ -0,0 +1,80 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip_test_common.hh>
#include <performance_common.hh>
/**
* @addtogroup stream stream
* @{
* @ingroup PerformanceTest
*/
/**
 * Benchmark functor for `hipStreamWaitEvent`.
 *
 * An event is created and recorded on the stream before timing starts. The timed section
 * covers both the `hipStreamWaitEvent` call and the following `hipStreamSynchronize`.
 */
class StreamWaitEventBenchmark : public Benchmark<StreamWaitEventBenchmark> {
 public:
  /// @param stream_type Kind of stream the `StreamGuard` is constructed with.
  void operator()(Streams stream_type) {
    const StreamGuard stream_guard{stream_type};
    const hipStream_t stream = stream_guard.stream();

    // Untimed setup: create the event and record it on the same stream.
    hipEvent_t wait_event = nullptr;
    HIP_CHECK(hipEventCreate(&wait_event));
    REQUIRE(wait_event != nullptr);
    HIP_CHECK(hipEventRecord(wait_event, stream));

    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipStreamWaitEvent(stream, wait_event, 0));
      HIP_CHECK(hipStreamSynchronize(stream));
    }

    HIP_CHECK(hipEventDestroy(wait_event));
  }
};
/**
 * Runs the wait-event benchmark for one stream type.
 *
 * Section name: "null stream" for `Streams::nullstream`, "created" for `Streams::created`,
 * and "per thread stream" for every other value.
 */
static void RunBenchmark(Streams stream_type) {
  const char* section_name = "per thread stream";
  if (stream_type == Streams::nullstream) {
    section_name = "null stream";
  } else if (stream_type == Streams::created) {
    section_name = "created";
  }

  StreamWaitEventBenchmark wait_benchmark{};
  wait_benchmark.AddSectionName(section_name);
  wait_benchmark.Run(stream_type);
}
/**
* Test Description
* ------------------------
* - Executes `hipStreamWaitEvent`:
* -# Stream types:
* - `null`
* - created
* Test source
* ------------------------
* - performance/stream/hipStreamWaitEvent.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Performance_hipStreamWaitEvent") {
  // The test body is executed once per generated stream type.
  const Streams stream_type = GENERATE(Streams::nullstream, Streams::created);
  RunBenchmark(stream_type);
}
+172
파일 보기
@@ -0,0 +1,172 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip_test_common.hh>
#include <performance_common.hh>
/**
* @addtogroup stream stream
* @{
* @ingroup PerformanceTest
*/
/**
 * Queries whether the device supports stream wait-value operations.
 *
 * On AMD builds the HIP attribute `hipDeviceAttributeCanUseStreamWaitValue` is read; otherwise
 * the CUDA driver attribute `CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS` is read and the
 * driver call's status is not checked.
 *
 * @param device_id Device to query.
 * @return The attribute value written by the query (0 if nothing was written).
 */
static int IsStreamWaitValueSupported(int device_id) {
  int wait_value_supported{0};
#if HT_AMD
  HIP_CHECK(hipDeviceGetAttribute(&wait_value_supported, hipDeviceAttributeCanUseStreamWaitValue,
                                  device_id));
#else
  cuDeviceGetAttribute(&wait_value_supported, CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS,
                       device_id);
#endif
  return wait_value_supported;
}
/**
 * Benchmark functor for `hipStreamWaitValue32`.
 *
 * A device buffer of `array_size` 32-bit elements is allocated and filled via `hipMemset`
 * before timing; only the wait-value call on the first element is measured.
 */
class StreamWaitValue32Benchmark : public Benchmark<StreamWaitValue32Benchmark> {
 public:
  /**
   * @param array_size Number of `uint32_t` elements to allocate.
   * @param flag Wait condition flag forwarded to `hipStreamWaitValue32`.
   */
  void operator()(const size_t array_size, unsigned int flag) {
    const StreamGuard stream_guard{Streams::created};
    const hipStream_t stream = stream_guard.stream();

    // The AND condition uses 1 as both fill value and comparison value; all others use 0.
    const uint32_t value = (flag == hipStreamWaitValueAnd) ? 1 : 0;

    uint32_t* value_ptr;
    HIP_CHECK(hipMalloc(&value_ptr, sizeof(uint32_t) * array_size));
    HIP_CHECK(hipMemset(value_ptr, value, sizeof(uint32_t) * array_size));

    TIMED_SECTION(kTimerTypeCpu) { HIP_CHECK(hipStreamWaitValue32(stream, value_ptr, value, flag)); }

    HIP_CHECK(hipFree(value_ptr));
  }
};
/**
 * Benchmark functor for `hipStreamWaitValue64`.
 *
 * A device buffer of `array_size` 64-bit elements is allocated and filled via `hipMemset`
 * before timing; only the wait-value call on the first element is measured.
 */
class StreamWaitValue64Benchmark : public Benchmark<StreamWaitValue64Benchmark> {
 public:
  /**
   * @param array_size Number of `uint64_t` elements to allocate.
   * @param flag Wait condition flag forwarded to `hipStreamWaitValue64`.
   */
  void operator()(const size_t array_size, unsigned int flag) {
    const StreamGuard stream_guard{Streams::created};
    const hipStream_t stream = stream_guard.stream();

    // The AND condition uses 1 as both fill value and comparison value; all others use 0.
    const uint64_t value = (flag == hipStreamWaitValueAnd) ? 1 : 0;

    uint64_t* value_ptr;
    HIP_CHECK(hipMalloc(&value_ptr, sizeof(uint64_t) * array_size));
    HIP_CHECK(hipMemset(value_ptr, value, sizeof(uint64_t) * array_size));

    TIMED_SECTION(kTimerTypeCpu) { HIP_CHECK(hipStreamWaitValue64(stream, value_ptr, value, flag)); }

    HIP_CHECK(hipFree(value_ptr));
  }
};
/**
 * Runs a wait-value benchmark for one array size and wait condition.
 *
 * Two section names are added in order: the array size as a decimal string, then a label for
 * the flag ("unknown flag" when the value matches none of the four known conditions).
 *
 * @tparam WaitValueBenchmark Benchmark class to instantiate.
 */
template <typename WaitValueBenchmark>
static void RunBenchmark(const size_t array_size, unsigned int flag) {
  const char* condition_name = "unknown flag";
  if (flag == hipStreamWaitValueGte) {
    condition_name = "greater than or equal";
  } else if (flag == hipStreamWaitValueEq) {
    condition_name = "equal";
  } else if (flag == hipStreamWaitValueAnd) {
    condition_name = "logical and";
  } else if (flag == hipStreamWaitValueNor) {
    condition_name = "logical nor";
  }

  WaitValueBenchmark wait_benchmark;
  wait_benchmark.AddSectionName(std::to_string(array_size));
  wait_benchmark.AddSectionName(condition_name);
  wait_benchmark.Run(array_size, flag);
}
/**
* Test Description
* ------------------------
* - Executes `hipStreamWaitValue32` for different array sizes:
* -# 4 KB
* -# 4 MB
* -# 16 MB
* - Uses different flag types for wait criteria:
* -# Greater than or equal
* -# Equal
* -# Logical AND
* -# Logical NOR
* Test source
* ------------------------
* - performance/stream/hipStreamWaitValue.cc
* Test requirements
* ------------------------
* - Device supports Stream Wait Value operations
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Performance_hipStreamWaitValue32") {
// The whole body is compiled only for AMD builds; elsewhere the test case is empty.
#if HT_AMD
  const bool wait_value_supported = IsStreamWaitValueSupported(0) != 0;
  if (!wait_value_supported) {
    HipTest::HIP_SKIP_TEST(
        "GPU 0 doesn't support hipStreamWaitValue32() function. "
        "Hence skipping the testing with Pass result.\n");
    return;
  }

  // Every array size is combined with every wait condition.
  const size_t array_size = GENERATE(4_KB, 4_MB, 16_MB);
  const unsigned int flag = GENERATE(hipStreamWaitValueGte, hipStreamWaitValueEq,
                                     hipStreamWaitValueAnd, hipStreamWaitValueNor);
  RunBenchmark<StreamWaitValue32Benchmark>(array_size, flag);
#endif
}
/**
* Test Description
* ------------------------
* - Executes `hipStreamWaitValue64`:
* -# Allocation size:
* - 4 KB
* - 4 MB
* - 16 MB
* -# Wait type:
* - Greater than or equal
* - Equal
* - Logical AND
* - Logical NOR
* Test source
* ------------------------
* - performance/stream/hipStreamWaitValue.cc
* Test requirements
* ------------------------
* - Device supports Stream Wait Value operations
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Performance_hipStreamWaitValue64") {
  const bool wait_value_supported = IsStreamWaitValueSupported(0) != 0;
  if (!wait_value_supported) {
    HipTest::HIP_SKIP_TEST(
        "GPU 0 doesn't support hipStreamWaitValue64() function. "
        "Hence skipping the testing with Pass result.\n");
    return;
  }

  // Every array size is combined with every wait condition.
  const size_t array_size = GENERATE(4_KB, 4_MB, 16_MB);
  const unsigned int flag = GENERATE(hipStreamWaitValueGte, hipStreamWaitValueEq,
                                     hipStreamWaitValueAnd, hipStreamWaitValueNor);
  RunBenchmark<StreamWaitValue64Benchmark>(array_size, flag);
}
+123
파일 보기
@@ -0,0 +1,123 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip_test_common.hh>
#include <performance_common.hh>
/**
* @addtogroup stream stream
* @{
* @ingroup PerformanceTest
*/
#if HT_NVIDIA
/**
 * Reads the CUDA driver attribute `CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS` for the
 * given device. Only compiled for NVIDIA builds; the driver call's status is not checked.
 *
 * @param device_id Device to query.
 * @return The attribute value written by the query (0 if nothing was written).
 */
static int IsStreamWriteValueSupported(int device_id) {
  int write_value_supported{0};
  cuDeviceGetAttribute(&write_value_supported, CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS,
                       device_id);
  return write_value_supported;
}
#endif
/**
 * Benchmark functor for `hipStreamWriteValue32`.
 *
 * A device buffer of `array_size` 32-bit elements is allocated and zero-filled before timing;
 * only the write-value call on the first element is measured.
 */
class StreamWriteValue32Benchmark : public Benchmark<StreamWriteValue32Benchmark> {
 public:
  /// @param array_size Number of `uint32_t` elements to allocate.
  void operator()(const size_t array_size) {
    const StreamGuard stream_guard{Streams::created};
    const hipStream_t stream = stream_guard.stream();

    uint32_t value{0};
    uint32_t* value_ptr;
    HIP_CHECK(hipMalloc(&value_ptr, sizeof(uint32_t) * array_size));
    HIP_CHECK(hipMemset(value_ptr, value, sizeof(uint32_t) * array_size));

    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipStreamWriteValue32(stream, value_ptr, value, 0));
    }

    HIP_CHECK(hipFree(value_ptr));
  }
};
/**
 * Benchmark functor for `hipStreamWriteValue64`.
 *
 * A device buffer of `array_size` 64-bit elements is allocated and zero-filled before timing;
 * only the write-value call on the first element is measured.
 */
class StreamWriteValue64Benchmark : public Benchmark<StreamWriteValue64Benchmark> {
 public:
  /// @param array_size Number of `uint64_t` elements to allocate.
  void operator()(const size_t array_size) {
    const StreamGuard stream_guard{Streams::created};
    const hipStream_t stream = stream_guard.stream();

    uint64_t value{0};
    uint64_t* value_ptr;
    HIP_CHECK(hipMalloc(&value_ptr, sizeof(uint64_t) * array_size));
    HIP_CHECK(hipMemset(value_ptr, value, sizeof(uint64_t) * array_size));

    TIMED_SECTION(kTimerTypeCpu) {
      HIP_CHECK(hipStreamWriteValue64(stream, value_ptr, value, 0));
    }

    HIP_CHECK(hipFree(value_ptr));
  }
};
/**
 * Runs a write-value benchmark for one array size; the section is named after the size as a
 * decimal string.
 *
 * @tparam WriteValueBenchmark Benchmark class to instantiate.
 */
template <typename WriteValueBenchmark>
static void RunBenchmark(const size_t array_size) {
  WriteValueBenchmark write_benchmark;
  write_benchmark.AddSectionName(std::to_string(array_size));
  write_benchmark.Run(array_size);
}
/**
* Test Description
* ------------------------
* - Executes `hipStreamWriteValue32`:
* -# Allocation size:
* - 4 KB
* - 4 MB
* - 16 MB
* Test source
* ------------------------
* - performance/stream/hipStreamWriteValue.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Performance_hipStreamWriteValue32") {
// The whole body is compiled only for AMD builds; elsewhere the test case is empty.
#if HT_AMD
  const size_t array_size = GENERATE(4_KB, 4_MB, 16_MB);
  RunBenchmark<StreamWriteValue32Benchmark>(array_size);
#endif
}
/**
* Test Description
* ------------------------
* - Executes `hipStreamWriteValue64`:
* -# Allocation size:
* - 4 KB
* - 4 MB
* - 16 MB
* Test source
* ------------------------
* - performance/stream/hipStreamWriteValue.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Performance_hipStreamWriteValue64") {
// The capability check exists only for NVIDIA builds; other builds run unconditionally.
#if HT_NVIDIA
  const bool write_value_supported = IsStreamWriteValueSupported(0) != 0;
  if (!write_value_supported) {
    HipTest::HIP_SKIP_TEST(
        "GPU 0 doesn't support hipStreamWriteValue64() function. "
        "Hence skipping the testing with Pass result.\n");
    return;
  }
#endif

  const size_t array_size = GENERATE(4_KB, 4_MB, 16_MB);
  RunBenchmark<StreamWriteValue64Benchmark>(array_size);
}
+74
파일 보기
@@ -0,0 +1,74 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include <hip_test_common.hh>
#include <performance_common.hh>
// Shareable handle type for the build platform: POSIX file descriptor when `__linux__` is
// set, Win32 handle otherwise.
static const hipMemAllocationHandleType kHandleType =
#if __linux__
    hipMemHandleTypePosixFileDescriptor;
#else
    hipMemHandleTypeWin32;
#endif
/**
 * Queries whether a device reports support for memory pools.
 *
 * @param device_id Device whose `hipDeviceAttributeMemoryPoolsSupported` attribute is read.
 * @return The attribute value as written by `hipDeviceGetAttribute`; non-zero means supported.
 */
static int AreMemPoolsSupported(int device_id) {
  int mem_pools_supported = 0;
  // Query the requested device; the previous code always queried device 0 and ignored the
  // parameter, which gave wrong answers for any other device id.
  HIP_CHECK(hipDeviceGetAttribute(&mem_pools_supported, hipDeviceAttributeMemoryPoolsSupported,
                                  device_id));
  return mem_pools_supported;
}
/**
 * Builds memory pool properties for a pinned allocation located on a device.
 *
 * The structure is filled positionally: allocation type `hipMemAllocationTypePinned`, the
 * given handle type, a location of type `hipMemLocationTypeDevice` with the given id, a null
 * pointer for the following member, and `{0}` for the final member.
 *
 * @param device_id Device id stored in the location.
 * @param handle_type Handle type stored in the properties.
 * @return The populated properties by value.
 */
static hipMemPoolProps CreateMemPoolProps(const int device_id,
                                          const hipMemAllocationHandleType handle_type) {
  hipMemPoolProps props = {hipMemAllocationTypePinned,
                           handle_type,
                           {hipMemLocationTypeDevice, device_id},
                           nullptr,
                           {0}};
  return props;
}
static std::string GetMemPoolAttrSectionName(const hipMemPoolAttr attribute) {
switch (attribute) {
case hipMemPoolReuseFollowEventDependencies:
return "ReuseFollowEventDependencies";
case hipMemPoolReuseAllowOpportunistic:
return "ReuseAllowOpportunistic";
case hipMemPoolReuseAllowInternalDependencies:
return "ReuseAllowInternalDependencies";
case hipMemPoolAttrReleaseThreshold:
return "AttrReleaseThreshold";
case hipMemPoolAttrReservedMemCurrent:
return "AttrReservedMemCurrent";
case hipMemPoolAttrReservedMemHigh:
return "AttrReservedMemHigh";
case hipMemPoolAttrUsedMemCurrent:
return "AttrUsedMemCurrent";
case hipMemPoolAttrUsedMemHigh:
return "AttrUsedMemHigh";
default:
return "unknown attribute";
}
}
+4 -1
파일 보기
@@ -36,11 +36,14 @@ add_subdirectory(compiler)
add_subdirectory(errorHandling)
add_subdirectory(cooperativeGrps)
add_subdirectory(context)
add_subdirectory(warp)
add_subdirectory(dynamicLoading)
add_subdirectory(g++)
add_subdirectory(module)
add_subdirectory(channelDescriptor)
add_subdirectory(executionControl)
add_subdirectory(vector_types)
add_subdirectory(atomics)
add_subdirectory(p2p)
add_subdirectory(gcc)
@@ -49,5 +52,5 @@ add_subdirectory(callback)
add_subdirectory(clock)
# Vulkan interop APIs currently undefined for Nvidia
add_subdirectory(vulkan_interop)
add_subdirectory(gl_interop) # Disabled on NVIDIA due to defect - EXSWHTEC-246
endif()
add_subdirectory(vector_types)
+48
파일 보기
@@ -0,0 +1,48 @@
# Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# Sources compiled into the AtomicsTest executable.
set(TEST_SRC
atomicExch.cc
atomicExch_system.cc
)
# The AtomicsTest executable is registered under the build_tests target on both platforms;
# only the compile/link options differ.
if(HIP_PLATFORM MATCHES "nvidia")
# NVIDIA: atomicExch_system.cc gets -rdc=true plus explicit -gencode entries for
# compute_60/sm_60, compute_70/sm_70 and compute_80/sm_80. The same options are repeated in
# LINKER_LIBS, together with nvrtc.
set_source_files_properties(atomicExch_system.cc PROPERTIES COMPILE_FLAGS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
hip_add_exe_to_target(NAME AtomicsTest
TEST_SRC ${TEST_SRC}
TEST_TARGET_NAME build_tests
LINKER_LIBS "nvrtc -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
elseif(HIP_PLATFORM MATCHES "amd")
# AMD: no extra compile flags; hiprtc is the only entry in LINKER_LIBS.
hip_add_exe_to_target(NAME AtomicsTest
TEST_SRC ${TEST_SRC}
TEST_TARGET_NAME build_tests
LINKER_LIBS hiprtc)
endif()
# SWDEV-435667: Below 2 tests failed in stress test on 01/12/23
#add_test(NAME Unit_atomicExch_Negative_Parameters
# COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../compileAndCaptureOutput.py
# ${CMAKE_CURRENT_SOURCE_DIR} ${HIP_PLATFORM} ${HIP_PATH}
# atomicExch_negative_kernels.cc 40)
#
#add_test(NAME Unit_atomicExch_system_Negative_Parameters
# COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../compileAndCaptureOutput.py
# ${CMAKE_CURRENT_SOURCE_DIR} ${HIP_PLATFORM} ${HIP_PATH}
# atomicExch_system_negative_kernels.cc 40)
+213
파일 보기
@@ -0,0 +1,213 @@
/*
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "atomicExch_common.hh"
#include "atomicExch_negative_kernels_rtc.hh"
/**
* @addtogroup atomicExch atomicExch
* @{
* @ingroup AtomicsTest
*/
/**
* Test Description
* ------------------------
* - Executes a kernel wherein all threads perform an atomic exchange on the same (compile-time
* deducible) memory location. Each thread exchanges its own grid-wide linear index + 1
* into the memory location, storing the returned value in a separate output array slot
* corresponding to it. Once complete, the union of the output array and the exchange memory is
* validated to contain all values in the range [0, number_of_threads].
*
* - The test is run for:
* - All overloads of atomicExch
* - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated exchange memory
* - Exchange memory located in shared memory
* - Several grid and block dimension combinations(only one block is used for shared memory)
* Test source
* ------------------------
* - unit/atomics/atomicExch.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
// The instantiated type list depends on the platform: the HT_NVIDIA build leaves out the
// unsigned long and double instantiations.
#if HT_NVIDIA
TEMPLATE_TEST_CASE("Unit_atomicExch_Positive_Same_Address_Compile_Time", "", int, unsigned int,
                   unsigned long long, float) {
#else
TEMPLATE_TEST_CASE("Unit_atomicExch_Positive_Same_Address_Compile_Time", "", int, unsigned int,
                   unsigned long, unsigned long long, float, double) {
#endif  // HT_NVIDIA
  // Repeat the device-scope, single-address scenario cmd_options.iterations times.
  for (auto pass = 0; pass < cmd_options.iterations; ++pass) {
    AtomicExchSameAddressTest<TestType, AtomicScopes::device>();
  }
}
/**
* Test Description
* ------------------------
* - Executes a single kernel on a single device wherein all threads will perform an atomic
* exchange into a runtime determined memory location. Each thread will exchange its own grid wide
* linear index + offset into the memory location, storing the return value into a separate output
* array slot corresponding to it. Once complete, the union of output array and exchange memory is
* validated to contain all values in the range [0, number_of_threads +
* number_of_exchange_memory_slots). Several memory access patterns are tested:
* -# All threads exchange to a single memory location
* -# Each thread exchanges into an array containing warp_size elements, using tid % warp_size
* for indexing
* -# Same as the above, but the exchange elements are spread out by L1 cache line size bytes.
*
* - The test is run for:
* - All overloads of atomicExch
* - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated exchange memory
* - Exchange memory located in shared memory
* - Several grid and block dimension combinations(only one block is used for shared memory)
* Test source
* ------------------------
* - unit/atomics/atomicExch.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
// The instantiated type list depends on the platform: the HT_NVIDIA build leaves out the
// unsigned long and double instantiations.
#if HT_NVIDIA
TEMPLATE_TEST_CASE("Unit_atomicExch_Positive", "", int, unsigned int,
                   unsigned long long, float) {
#else
TEMPLATE_TEST_CASE("Unit_atomicExch_Positive", "", int, unsigned int, unsigned long,
                   unsigned long long, float, double) {
#endif  // HT_NVIDIA
  // The warp size of device 0 determines how many exchange slots the multi-slot sections use.
  int lanes_per_warp = 0;
  HIP_CHECK(hipDeviceGetAttribute(&lanes_per_warp, hipDeviceAttributeWarpSize, 0));

  // Byte distance between slots in the "scattered" layout (hard-coded, not queried).
  const auto scattered_pitch = 128u;

  for (auto pass = 0; pass < cmd_options.iterations; ++pass) {
    // One slot: every thread hits the same location.
    DYNAMIC_SECTION("Same address " << pass) {
      AtomicExchSingleDeviceSingleKernelTest<TestType, AtomicScopes::device>(1, sizeof(TestType));
    }

    // warp-size slots packed back to back.
    DYNAMIC_SECTION("Adjacent addresses " << pass) {
      AtomicExchSingleDeviceSingleKernelTest<TestType, AtomicScopes::device>(lanes_per_warp,
                                                                             sizeof(TestType));
    }

    // warp-size slots, each scattered_pitch bytes apart.
    DYNAMIC_SECTION("Scattered addresses " << pass) {
      AtomicExchSingleDeviceSingleKernelTest<TestType, AtomicScopes::device>(lanes_per_warp,
                                                                             scattered_pitch);
    }
  }
}
/**
* Test Description
* ------------------------
* - Executes a kernel two times concurrently on a single device wherein all threads will perform
* an atomic exchange into a runtime determined memory location. Each thread will exchange its own
* grid wide linear index + offset into the memory location, storing the return value into a
* separate output array slot corresponding to it. Once complete, the union of output array and
* exchange memory is validated to contain all values in the range [0, number_of_threads +
* number_of_exchange_memory_slots). Several memory access patterns are tested:
* -# All threads exchange to a single memory location
* -# Each thread exchanges into an array containing warp_size elements, using tid % warp_size
* for indexing
* -# Same as the above, but the exchange elements are spread out by L1 cache line size bytes.
*
* - The test is run for:
* - All overloads of atomicExch
* - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated exchange memory
* - Several grid and block dimension combinations
* Test source
* ------------------------
* - unit/atomics/atomicExch.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
// The instantiated type list depends on the platform: the HT_NVIDIA build leaves out the
// unsigned long and double instantiations.
#if HT_NVIDIA
TEMPLATE_TEST_CASE("Unit_atomicExch_Positive_Multi_Kernel", "", int, unsigned int,
                   unsigned long long, float) {
#else
TEMPLATE_TEST_CASE("Unit_atomicExch_Positive_Multi_Kernel", "", int, unsigned int, unsigned long,
                   unsigned long long, float, double) {
#endif  // HT_NVIDIA
  // The warp size of device 0 determines how many exchange slots the multi-slot sections use.
  int lanes_per_warp = 0;
  HIP_CHECK(hipDeviceGetAttribute(&lanes_per_warp, hipDeviceAttributeWarpSize, 0));

  // Byte distance between slots in the "scattered" layout (hard-coded, not queried).
  const auto scattered_pitch = 128u;

  // Every section launches two kernels (first argument) against the same exchange memory.
  for (auto pass = 0; pass < cmd_options.iterations; ++pass) {
    DYNAMIC_SECTION("Same address " << pass) {
      AtomicExchSingleDeviceMultipleKernelTest<TestType, AtomicScopes::device>(2, 1,
                                                                               sizeof(TestType));
    }

    DYNAMIC_SECTION("Adjacent addresses " << pass) {
      AtomicExchSingleDeviceMultipleKernelTest<TestType, AtomicScopes::device>(2, lanes_per_warp,
                                                                               sizeof(TestType));
    }

    DYNAMIC_SECTION("Scattered addresses " << pass) {
      AtomicExchSingleDeviceMultipleKernelTest<TestType, AtomicScopes::device>(2, lanes_per_warp,
                                                                               scattered_pitch);
    }
  }
}
/**
* Test Description
* ------------------------
* - Runtime-compiles (via hiprtc) kernels that pass combinations of arguments of invalid types
* to every overload of atomicExch, and verifies that compilation fails with the expected
* number of errors
* Test source
* ------------------------
* - unit/atomics/atomicExch.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Unit_atomicExch_Negative_Parameters_RTC") {
  // One run per source string; each string holds the ill-formed kernels of one overload.
  const auto program_source = GENERATE(kAtomicExchInt, kAtomicExchUnsignedInt, kAtomicExchULL,
                                       kAtomicExchFloat, kAtomicExchDouble);

  hiprtcProgram program{};
  HIPRTC_CHECK(
      hiprtcCreateProgram(&program, program_source, "atomicExch_negative.cc", 0, nullptr, nullptr));

  // Compilation is expected to fail, so keep the status instead of checking it right away.
  hiprtcResult compile_status{hiprtcCompileProgram(program, 0, nullptr)};

  // Fetch the compiler log.
  size_t log_bytes{};
  HIPRTC_CHECK(hiprtcGetProgramLogSize(program, &log_bytes));
  std::string compile_log(log_bytes, ' ');
  HIPRTC_CHECK(hiprtcGetProgramLog(program, compile_log.data()));

  // Count every occurrence of the "error:" marker in the log.
  const std::string marker{"error:"};
  int errors_seen{0};
  for (size_t pos = compile_log.find(marker, 0); pos != std::string::npos;
       pos = compile_log.find(marker, pos + 1)) {
    ++errors_seen;
  }

  HIPRTC_CHECK(hiprtcDestroyProgram(&program));

  // The program must have been rejected, with exactly eight reported errors.
  HIPRTC_CHECK_ERROR(compile_status, HIPRTC_ERROR_COMPILATION);
  int errors_expected{8};
  REQUIRE(errors_seen == errors_expected);
}
+381
파일 보기
@@ -0,0 +1,381 @@
/*
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include <numeric>
#include <hip_test_common.hh>
#include <resource_guards.hh>
#include <hip/hip_cooperative_groups.h>
#include <cmd_options.hh>
// Selects which exchange builtin the tests exercise: `device` maps to atomicExch and `system`
// to atomicExch_system (see perform_atomic_exch).
enum class AtomicScopes {
  device,
  system
};
/**
 * Performs an atomic exchange using the builtin that matches the requested scope.
 *
 * @tparam T     value type of the exchange.
 * @tparam scope AtomicScopes::device selects atomicExch, AtomicScopes::system selects
 *               atomicExch_system.
 * @param address location to exchange into.
 * @param val     value to store at `address`.
 * @return the value returned by the selected builtin.
 */
template <typename T, AtomicScopes scope> __device__ T perform_atomic_exch(T* address, T val) {
  if constexpr (scope == AtomicScopes::device) {
    return atomicExch(address, val);
  } else {
    // A plain `else` (instead of a run-time `else if`) guarantees every instantiation returns
    // on all paths; the static_assert rejects any scope this function does not handle.
    static_assert(scope == AtomicScopes::system, "Unhandled AtomicScopes value");
    return atomicExch_system(address, val);
  }
}
/**
 * Kernel in which every thread exchanges (grid-wide rank + 1) into one single location and
 * records the value it received in old_vals[rank].
 *
 * When use_shared_mem is set, the exchange target is a __shared__ variable that thread 0 seeds
 * from global_mem[0] before the exchanges and writes back to global_mem[0] afterwards.
 */
template <typename T, bool use_shared_mem, AtomicScopes scope>
__global__ void atomic_exch_kernel_compile_time(T* const global_mem, T* const old_vals) {
  __shared__ T shared_mem;

  const auto rank = cooperative_groups::this_grid().thread_rank();
  T* const target = use_shared_mem ? &shared_mem : global_mem;

  if constexpr (use_shared_mem) {
    // Seed the shared slot with the initial value held in global memory.
    if (rank == 0) {
      target[0] = global_mem[0];
    }
    __syncthreads();
  }

  const T incoming = static_cast<T>(rank + 1);
  old_vals[rank] = perform_atomic_exch<T, scope>(target, incoming);

  if constexpr (use_shared_mem) {
    __syncthreads();
    // Publish the last exchanged value so the host can read it from global memory.
    if (rank == 0) {
      global_mem[0] = target[0];
    }
  }
}
/**
 * Returns the address of element `idx` in a buffer whose elements are `pitch` bytes apart.
 *
 * @param ptr   start of the buffer.
 * @param pitch distance between consecutive elements, in bytes.
 * @param idx   element index.
 * @return `ptr` advanced by `idx * pitch` bytes, reinterpreted as T*.
 */
template <typename T>
__host__ __device__ T* pitched_offset(T* const ptr, const unsigned int pitch,
                                      const unsigned int idx) {
  const unsigned int byte_offset = idx * pitch;
  uint8_t* const raw = reinterpret_cast<uint8_t*>(ptr);
  return reinterpret_cast<T*>(raw + byte_offset);
}
/**
 * Kernel in which every thread exchanges (base_val + rank + width) into slot (rank % width) of
 * a pitched buffer and records the value it received in old_vals[rank].
 *
 * When use_shared_mem is set, the slots live in dynamic shared memory: the first `width`
 * threads copy them in from global_mem before the exchanges and copy them back afterwards.
 *
 * @param global_mem exchange buffer holding `width` slots, `pitch` bytes apart.
 * @param old_vals   output array, one entry per thread.
 * @param width      number of exchange slots.
 * @param pitch      byte distance between slots.
 * @param base_val   offset added to every exchanged value.
 */
template <typename T, bool use_shared_mem, AtomicScopes scope>
__global__ void atomic_exch_kernel(T* const global_mem, T* const old_vals, const unsigned int width,
                                   const unsigned pitch, const T base_val = 0) {
  extern __shared__ uint8_t shared_mem[];

  const auto rank = cooperative_groups::this_grid().thread_rank();
  T* const slots = use_shared_mem ? reinterpret_cast<T*>(shared_mem) : global_mem;

  if constexpr (use_shared_mem) {
    // Stage the initial slot values into shared memory.
    if (rank < width) {
      T* const staged = pitched_offset(slots, pitch, rank);
      *staged = *pitched_offset(global_mem, pitch, rank);
    }
    __syncthreads();
  }

  T* const my_slot = pitched_offset(slots, pitch, rank % width);
  old_vals[rank] =
      perform_atomic_exch<T, scope>(my_slot, base_val + static_cast<T>(rank + width));

  if constexpr (use_shared_mem) {
    __syncthreads();
    // Write the final slot values back to global memory.
    if (rank < width) {
      T* const published = pitched_offset(global_mem, pitch, rank);
      *published = *pitched_offset(slots, pitch, rank);
    }
  }
}
template <typename TestType, bool use_shared_mem, AtomicScopes scope>
void AtomicExchSameAddress(const dim3 blocks, const dim3 threads, const LinearAllocs alloc_type) {
LinearAllocGuard<TestType> mem_dev(alloc_type, sizeof(TestType));
const auto thread_count = blocks.x * blocks.y * blocks.z * threads.x * threads.y * threads.z;
const auto old_vals_alloc_size = thread_count * sizeof(TestType);
LinearAllocGuard<TestType> old_vals_dev(LinearAllocs::hipMalloc, old_vals_alloc_size);
std::vector<TestType> old_vals(thread_count + 1);
HIP_CHECK(hipMemset(mem_dev.ptr(), 0, sizeof(TestType)));
atomic_exch_kernel_compile_time<TestType, use_shared_mem, scope>
<<<blocks, threads>>>(mem_dev.ptr(), old_vals_dev.ptr());
HIP_CHECK(
hipMemcpy(old_vals.data(), old_vals_dev.ptr(), old_vals_alloc_size, hipMemcpyDeviceToHost));
HIP_CHECK(hipMemcpy(old_vals.data() + thread_count, mem_dev.ptr(), sizeof(TestType),
hipMemcpyDeviceToHost));
HIP_CHECK(hipDeviceSynchronize());
// Every thread will exchange its grid-wide linear id into a target location within mem_dev,
// receiving back the value previously present therein. This previous value is written to
// old_vals_dev.
// old_vals_dev will not contain values that the final scheduled warp exchanged into mem_dev, but
// mem_dev obviously will.
// Given that mem_dev initially contains values in the range [0, width) and that the maximum value
// the final thread shall write is thread_count + width - 1, presuming correct operation of
// atomicExch, the union of mem_dev and old_vals_dev shall contain values in the range
//[0, thread_count + width)
std::sort(old_vals.begin(), old_vals.end());
for (auto i = 0u; i < old_vals.size(); ++i) {
REQUIRE(i == old_vals[i]);
}
}
/**
 * Drives AtomicExchSameAddress over several block sizes, for both global and shared exchange
 * memory.
 *
 * @tparam TestType value type of the exchange.
 * @tparam scope    atomic scope under test; it is forwarded to the kernel so that instantiating
 *                  this helper with AtomicScopes::system really exercises the system-scope
 *                  exchange.
 */
template <typename TestType, AtomicScopes scope> void AtomicExchSameAddressTest() {
  const auto threads = GENERATE(dim3(1024), dim3(1023), dim3(511), dim3(17), dim3(31));

  SECTION("Global memory") {
    const auto blocks = GENERATE(dim3(20));
    using LA = LinearAllocs;
    const auto allocation_type =
        GENERATE(LA::hipMalloc, LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister);
    AtomicExchSameAddress<TestType, false, scope>(blocks, threads, allocation_type);
  }

  SECTION("Shared memory") {
    // Shared memory is only visible within a block, so a single block is launched.
    const auto blocks = dim3(1);
    AtomicExchSameAddress<TestType, true, scope>(blocks, threads, LinearAllocs::hipMalloc);
  }
}
/**
 * Configuration for one run of the run-time-address atomic exchange tests.
 * Every member has a default so that a default-constructed instance never holds an
 * indeterminate value.
 */
struct AtomicExchParams {
  dim3 blocks;                          // grid dimensions of each kernel launch
  dim3 threads;                         // block dimensions of each kernel launch
  unsigned int num_devices = 1u;        // number of devices kernels are launched on
  unsigned int kernel_count = 1u;       // kernels launched per device
  unsigned int width = 1u;              // number of exchange slots
  unsigned int pitch = 0u;              // byte distance between consecutive slots
  unsigned int host_thread_count = 0u;  // host threads exchanging alongside the kernels
  LinearAllocs alloc_type = LinearAllocs::hipMalloc;  // allocation kind of the exchange memory
};
/**
 * CRTP base that orchestrates one atomic exchange test run.
 *
 * Derived must provide:
 *  - LaunchKernel(shared_mem_size, stream, mem, old_vals, base_val, params)
 *  - ValidateResults(std::vector<T>& old_vals)
 *
 * run() allocates the exchange memory and the per-device output buffers, launches
 * params.kernel_count kernels on each of params.num_devices devices, optionally performs host
 * side exchanges on the same memory, gathers every returned value plus the final content of the
 * exchange memory into one host vector and hands that vector to Derived::ValidateResults.
 */
template <typename Derived, typename T, bool use_shared_mem, AtomicScopes scope>
class AtomicExchCRTP {
 public:
  /**
   * Executes the test described by `p`.
   * Leaves the last device used (p.num_devices - 1) as the current device.
   */
  void run(const AtomicExchParams& p) const {
    const auto thread_count =
        p.blocks.x * p.blocks.y * p.blocks.z * p.threads.x * p.threads.y * p.threads.z;
    const auto old_vals_alloc_size = p.kernel_count * thread_count * sizeof(T);

    // One output buffer per device and one stream per (device, kernel) pair.
    std::vector<LinearAllocGuard<T>> old_vals_devs;
    std::vector<StreamGuard> streams;
    old_vals_devs.reserve(p.num_devices);
    streams.reserve(p.num_devices * p.kernel_count);
    for (auto i = 0u; i < p.num_devices; ++i) {
      HIP_CHECK(hipSetDevice(i));
      old_vals_devs.emplace_back(LinearAllocs::hipMalloc, old_vals_alloc_size);
      for (auto j = 0u; j < p.kernel_count; ++j) {
        streams.emplace_back(Streams::created);
      }
    }

    const auto mem_alloc_size = p.width * p.pitch;
    LinearAllocGuard<T> mem_dev(p.alloc_type, mem_alloc_size);

    const auto host_iters_per_thread =
        std::max(p.num_devices * p.kernel_count * thread_count / 20, p.width);

    // Host side layout: [device results | final exchange memory (width) | host thread results].
    std::vector<T> old_vals(p.num_devices * p.kernel_count * thread_count + p.width +
                            p.host_thread_count * host_iters_per_thread);

    // Seed the exchange slots with 0..width-1, using the head of old_vals as staging space.
    std::iota(old_vals.begin(), old_vals.begin() + p.width, 0);
    HIP_CHECK(hipMemcpy2D(mem_dev.ptr(), p.pitch, old_vals.data(), sizeof(T), sizeof(T), p.width,
                          hipMemcpyHostToDevice));

    const auto shared_mem_size = use_shared_mem ? mem_alloc_size : 0u;
    for (auto i = 0u; i < p.num_devices; ++i) {
      // Launch on the device that owns the streams; otherwise every launch would be issued
      // while the last device of the set-up loop is still current.
      HIP_CHECK(hipSetDevice(i));
      const auto device_offset = i * p.kernel_count * thread_count;
      for (auto j = 0u; j < p.kernel_count; ++j) {
        const auto& stream = streams[i * p.kernel_count + j].stream();
        const auto kern_offset = j * thread_count;
        const auto kernel_old_vals = old_vals_devs[i].ptr() + kern_offset;
        CastToDerived().LaunchKernel(shared_mem_size, stream, mem_dev.ptr(), kernel_old_vals,
                                     device_offset + kern_offset, p);
        // Report launch failures at the launch site.
        HIP_CHECK(hipGetLastError());
      }
    }

    // Host threads (if any) exchange on the same memory while the kernels are in flight.
    PerformHostAtomicExchange(p.host_thread_count, host_iters_per_thread, mem_dev.host_ptr(),
                              old_vals.data(), p);

    // Wait for every kernel explicitly before reading results back, instead of relying on
    // implicit ordering between the copies below and streams that may belong to another device.
    for (auto& stream_guard : streams) {
      HIP_CHECK(hipStreamSynchronize(stream_guard.stream()));
    }

    for (auto i = 0u; i < p.num_devices; ++i) {
      const auto device_offset = i * p.kernel_count * thread_count;
      HIP_CHECK(hipMemcpy(old_vals.data() + device_offset, old_vals_devs[i].ptr(),
                          old_vals_alloc_size, hipMemcpyDeviceToHost));
    }

    // Append the values left in the exchange slots.
    HIP_CHECK(hipMemcpy2D(old_vals.data() + p.num_devices * p.kernel_count * thread_count,
                          sizeof(T), mem_dev.ptr(), p.pitch, sizeof(T), p.width,
                          hipMemcpyDeviceToHost));

    CastToDerived().ValidateResults(old_vals);
  }

 private:
  const Derived& CastToDerived() const { return static_cast<const Derived&>(*this); }

  /**
   * Body of one host thread: performs `iterations` relaxed atomic exchanges, writing
   * base_val + i into slot (i % width) and storing the received value in old_vals[i].
   */
  static void HostAtomicExchange(const unsigned int iterations, T* mem, T* const old_vals,
                                 const unsigned int width, const unsigned pitch, T base_val) {
    for (auto i = 0u; i < iterations; ++i) {
      T new_val = base_val + static_cast<T>(i);
      T old_val;
      __atomic_exchange(pitched_offset(mem, pitch, i % width), &new_val, &old_val,
                        __ATOMIC_RELAXED);
      old_vals[i] = old_val;
    }
  }

  /**
   * Spawns `thread_count` host threads running HostAtomicExchange and joins them.
   * Thread i exchanges values starting at host_base_val + i * iterations and records its
   * results in old_vals at that same offset, i.e. past the device results and the
   * final-memory section. Does nothing when thread_count is 0.
   */
  void PerformHostAtomicExchange(const unsigned int thread_count, const unsigned int iterations,
                                 T* mem, T* const old_vals, const AtomicExchParams& p) const {
    if (thread_count == 0) {
      return;
    }

    const auto dev_threads =
        p.blocks.x * p.blocks.y * p.blocks.z * p.threads.x * p.threads.y * p.threads.z;
    const auto host_base_val = p.num_devices * p.kernel_count * dev_threads + p.width;

    std::vector<std::thread> threads;
    threads.reserve(thread_count);
    for (auto i = 0u; i < thread_count; ++i) {
      const auto thread_base_val = host_base_val + i * iterations;
      threads.emplace_back(&HostAtomicExchange, iterations, mem, old_vals + thread_base_val,
                           p.width, p.pitch, thread_base_val);
    }

    for (auto& th : threads) {
      th.join();
    }
  }
};
/**
 * Concrete test built on AtomicExchCRTP: launches atomic_exch_kernel and expects the gathered
 * values, once sorted, to be exactly 0, 1, 2, ... with no gaps or duplicates.
 */
template <typename T, bool use_shared_mem, AtomicScopes scope>
class AtomicExch
    : public AtomicExchCRTP<AtomicExch<T, use_shared_mem, scope>, T, use_shared_mem, scope> {
 public:
  // Launches one kernel on `stream` with the grid/block shape taken from `p`.
  void LaunchKernel(const unsigned int shared_mem_size, const hipStream_t stream, T* const mem,
                    T* const old_vals, const T base_val, const AtomicExchParams& p) const {
    atomic_exch_kernel<T, use_shared_mem, scope>
        <<<p.blocks, p.threads, shared_mem_size, stream>>>(mem, old_vals, p.width, p.pitch,
                                                           base_val);
  }

  // Sorts `old_vals` in place and requires element k to equal k.
  void ValidateResults(std::vector<T>& old_vals) const {
    std::sort(old_vals.begin(), old_vals.end());

    auto expected = 0u;
    for (const auto& observed : old_vals) {
      REQUIRE(expected == observed);
      ++expected;
    }
  }
};
// Block dimensions the run-time-address tests are generated over: 16 and 1024 threads.
inline dim3 GenerateAtomicExchThreadDimensions() {
  return GENERATE(dim3(16), dim3(1024));
}
// Grid dimensions the run-time-address tests are generated over: one block per multiprocessor
// of device 0, and one and a half times that many.
inline dim3 GenerateAtomicExchBlockDimensions() {
  int multiprocessors = 0;
  HIP_CHECK(hipDeviceGetAttribute(&multiprocessors, hipDeviceAttributeMultiprocessorCount, 0));

  const int oversubscribed = multiprocessors + multiprocessors / 2;
  return GENERATE_COPY(dim3(multiprocessors), dim3(oversubscribed));
}
template <typename TestType, AtomicScopes scope>
void AtomicExchSingleDeviceSingleKernelTest(const unsigned int width, const unsigned int pitch) {
AtomicExchParams params;
params.num_devices = 1;
params.kernel_count = 1;
params.threads = GenerateAtomicExchThreadDimensions();
params.width = width;
params.pitch = pitch;
SECTION("Global memory") {
params.blocks = GenerateAtomicExchBlockDimensions();
using LA = LinearAllocs;
for (const auto alloc_type :
{LA::hipMalloc, LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister}) {
params.alloc_type = alloc_type;
DYNAMIC_SECTION("Allocation type: " << to_string(alloc_type)) {
AtomicExch<TestType, false, scope>().run(params);
}
}
}
SECTION("Shared memory") {
params.blocks = dim3(1);
params.alloc_type = LinearAllocs::hipMalloc;
AtomicExch<TestType, true, scope>().run(params);
}
}
template <typename TestType, AtomicScopes scope>
void AtomicExchSingleDeviceMultipleKernelTest(const unsigned int kernel_count,
const unsigned int width, const unsigned int pitch) {
int concurrent_kernels = 0;
HIP_CHECK(hipDeviceGetAttribute(&concurrent_kernels, hipDeviceAttributeConcurrentKernels, 0));
if (!concurrent_kernels) {
HipTest::HIP_SKIP_TEST("Test requires support for concurrent kernel execution");
return;
}
AtomicExchParams params;
params.num_devices = 1;
params.kernel_count = kernel_count;
params.blocks = GenerateAtomicExchBlockDimensions();
params.threads = GenerateAtomicExchThreadDimensions();
params.width = width;
params.pitch = pitch;
using LA = LinearAllocs;
for (const auto alloc_type :
{LA::hipMalloc, LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister}) {
params.alloc_type = alloc_type;
DYNAMIC_SECTION("Allocation type: " << to_string(alloc_type)) {
AtomicExch<TestType, false, scope>().run(params);
}
}
}
template <typename TestType>
void AtomicExchMultipleDeviceMultipleKernelAndHostTest(const unsigned int num_devices,
const unsigned int kernel_count,
const unsigned int width,
const unsigned int pitch,
const unsigned int host_thread_count = 0u) {
if (num_devices > 1) {
if (HipTest::getDeviceCount() < num_devices) {
std::string msg = std::to_string(num_devices) + " devices are required";
HipTest::HIP_SKIP_TEST(msg.c_str());
return;
}
}
if (kernel_count > 1) {
for (auto i = 0u; i < num_devices; ++i) {
int concurrent_kernels = 0;
HIP_CHECK(hipDeviceGetAttribute(&concurrent_kernels, hipDeviceAttributeConcurrentKernels, i));
if (!concurrent_kernels) {
HipTest::HIP_SKIP_TEST("Test requires support for concurrent kernel execution");
return;
}
}
}
AtomicExchParams params;
params.num_devices = num_devices;
params.kernel_count = kernel_count;
params.blocks = GenerateAtomicExchBlockDimensions();
params.threads = GenerateAtomicExchThreadDimensions();
params.width = width;
params.pitch = pitch;
params.host_thread_count = host_thread_count;
using LA = LinearAllocs;
for (const auto alloc_type : {LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister}) {
params.alloc_type = alloc_type;
DYNAMIC_SECTION("Allocation type: " << to_string(alloc_type)) {
AtomicExch<TestType, false, AtomicScopes::system>().run(params);
}
}
}
+94
파일 보기
@@ -0,0 +1,94 @@
/*
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip_test_common.hh>
// Empty user-defined type with a device-side constructor and destructor. The kernels below
// pass it (or a pointer to it) to atomicExch as an argument of an unrelated class type.
struct Dummy {
__device__ Dummy() {}
__device__ ~Dummy() {}
};
/* int atomicExch(int*, int)
 *
 * Eight deliberately invalid calls (this is a negative test source):
 *  n1 - a pointer is passed as the value
 *  n2 - the address of the pointer parameter is passed as the address
 *  n3..n6 - the address is a char*, short*, long* or long long*
 *  n7 - the address is a Dummy*
 *  n8 - the value is a Dummy
 * NOTE(review): each kernel is presumably expected to produce exactly one compile error -
 * confirm against the expected error count used by the build script.
 */
__global__ void atomicExch_int_n1(int* p, int v) { atomicExch(p, p); }
__global__ void atomicExch_int_n2(int* p, int v) { atomicExch(&p, v); }
__global__ void atomicExch_int_n3(char* p, int v) { atomicExch(p, v); }
__global__ void atomicExch_int_n4(short* p, int v) { atomicExch(p, v); }
__global__ void atomicExch_int_n5(long* p, int v) { atomicExch(p, v); }
__global__ void atomicExch_int_n6(long long* p, int v) { atomicExch(p, v); }
__global__ void atomicExch_int_n7(Dummy* p, int v) { atomicExch(p, v); }
__global__ void atomicExch_int_n8(int* p, Dummy v) { atomicExch(p, v); }
/* unsigned int atomicExch(unsigned int*, unsigned int)
 *
 * Eight deliberately invalid calls:
 *  n1 - a pointer is passed as the value
 *  n2 - the address of the pointer parameter is passed as the address
 *  n3..n6 - the address is a char*, short*, long* or long long*
 *  n7 - the address is a Dummy*
 *  n8 - the value is a Dummy
 */
__global__ void atomicExch_unsigned_int_n1(unsigned int* p, unsigned int v) { atomicExch(p, p); }
__global__ void atomicExch_unsigned_int_n2(unsigned int* p, unsigned int v) { atomicExch(&p, v); }
__global__ void atomicExch_unsigned_int_n3(char* p, unsigned int v) { atomicExch(p, v); }
__global__ void atomicExch_unsigned_int_n4(short* p, unsigned int v) { atomicExch(p, v); }
__global__ void atomicExch_unsigned_int_n5(long* p, unsigned int v) { atomicExch(p, v); }
__global__ void atomicExch_unsigned_int_n6(long long* p, unsigned int v) { atomicExch(p, v); }
__global__ void atomicExch_unsigned_int_n7(Dummy* p, unsigned int v) { atomicExch(p, v); }
__global__ void atomicExch_unsigned_int_n8(unsigned int* p, Dummy v) { atomicExch(p, v); }
/* unsigned long long atomicExch(unsigned long long*, unsigned long long)
 *
 * Eight deliberately invalid calls:
 *  n1 - a pointer is passed as the value
 *  n2 - the address of the pointer parameter is passed as the address
 *  n3..n6 - the address is a char*, short*, long* or long long*
 *  n7 - the address is a Dummy*
 *  n8 - the value is a Dummy
 */
__global__ void atomicExch_unsigned_long_long_n1(unsigned long long* p, unsigned long long v) {
atomicExch(p, p);
}
__global__ void atomicExch_unsigned_long_long_n2(unsigned long long* p, unsigned long long v) {
atomicExch(&p, v);
}
__global__ void atomicExch_unsigned_long_long_n3(char* p, unsigned long long v) {
atomicExch(p, v);
}
__global__ void atomicExch_unsigned_long_long_n4(short* p, unsigned long long v) {
atomicExch(p, v);
}
__global__ void atomicExch_unsigned_long_long_n5(long* p, unsigned long long v) {
atomicExch(p, v);
}
__global__ void atomicExch_unsigned_long_long_n6(long long* p, unsigned long long v) {
atomicExch(p, v);
}
__global__ void atomicExch_unsigned_long_long_n7(Dummy* p, unsigned long long v) {
atomicExch(p, v);
}
__global__ void atomicExch_unsigned_long_long_n8(unsigned long long* p, Dummy v) {
atomicExch(p, v);
}
/* float atomicExch(float*, float)
 *
 * Eight deliberately invalid calls:
 *  n1 - a pointer is passed as the value
 *  n2 - the address of the pointer parameter is passed as the address
 *  n3..n6 - the address is a char*, short*, long* or long long*
 *  n7 - the address is a Dummy*
 *  n8 - the value is a Dummy
 */
__global__ void atomicExch_float_n1(float* p, float v) { atomicExch(p, p); }
__global__ void atomicExch_float_n2(float* p, float v) { atomicExch(&p, v); }
__global__ void atomicExch_float_n3(char* p, float v) { atomicExch(p, v); }
__global__ void atomicExch_float_n4(short* p, float v) { atomicExch(p, v); }
__global__ void atomicExch_float_n5(long* p, float v) { atomicExch(p, v); }
__global__ void atomicExch_float_n6(long long* p, float v) { atomicExch(p, v); }
__global__ void atomicExch_float_n7(Dummy* p, float v) { atomicExch(p, v); }
__global__ void atomicExch_float_n8(float* p, Dummy v) { atomicExch(p, v); }
/* double atomicExch(double*, double)
 *
 * Eight deliberately invalid calls:
 *  n1 - a pointer is passed as the value
 *  n2 - the address of the pointer parameter is passed as the address
 *  n3..n6 - the address is a char*, short*, long* or long long*
 *  n7 - the address is a Dummy*
 *  n8 - the value is a Dummy
 */
__global__ void atomicExch_double_n1(double* p, double v) { atomicExch(p, p); }
__global__ void atomicExch_double_n2(double* p, double v) { atomicExch(&p, v); }
__global__ void atomicExch_double_n3(char* p, double v) { atomicExch(p, v); }
__global__ void atomicExch_double_n4(short* p, double v) { atomicExch(p, v); }
__global__ void atomicExch_double_n5(long* p, double v) { atomicExch(p, v); }
__global__ void atomicExch_double_n6(long long* p, double v) { atomicExch(p, v); }
__global__ void atomicExch_double_n7(Dummy* p, double v) { atomicExch(p, v); }
__global__ void atomicExch_double_n8(double* p, Dummy v) { atomicExch(p, v); }
+124
파일 보기
@@ -0,0 +1,124 @@
/*
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
// Source text handed to the runtime compiler: eight kernels that each call the int overload of
// atomicExch with invalid argument types.
static constexpr const char* kAtomicExchInt =
R"(
struct Dummy {
__device__ Dummy() {}
__device__ ~Dummy() {}
};
__global__ void atomicExch_int_n1(int* p, int v) { atomicExch(p, p); }
__global__ void atomicExch_int_n2(int* p, int v) { atomicExch(&p, v); }
__global__ void atomicExch_int_n3(char* p, int v) { atomicExch(p, v); }
__global__ void atomicExch_int_n4(short* p, int v) { atomicExch(p, v); }
__global__ void atomicExch_int_n5(long* p, int v) { atomicExch(p, v); }
__global__ void atomicExch_int_n6(long long* p, int v) { atomicExch(p, v); }
__global__ void atomicExch_int_n7(Dummy* p, int v) { atomicExch(p, v); }
__global__ void atomicExch_int_n8(int* p, Dummy v) { atomicExch(p, v); }
)";
// Source text handed to the runtime compiler: eight kernels that each call the unsigned int
// overload of atomicExch with invalid argument types.
static constexpr const char* kAtomicExchUnsignedInt =
R"(
struct Dummy {
__device__ Dummy() {}
__device__ ~Dummy() {}
};
__global__ void atomicExch_unsigned_int_n1(unsigned int* p, unsigned int v) { atomicExch(p, p); }
__global__ void atomicExch_unsigned_int_n2(unsigned int* p, unsigned int v) { atomicExch(&p, v); }
__global__ void atomicExch_unsigned_int_n3(char* p, unsigned int v) { atomicExch(p, v); }
__global__ void atomicExch_unsigned_int_n4(short* p, unsigned int v) { atomicExch(p, v); }
__global__ void atomicExch_unsigned_int_n5(long* p, unsigned int v) { atomicExch(p, v); }
__global__ void atomicExch_unsigned_int_n6(long long* p, unsigned int v) { atomicExch(p, v); }
__global__ void atomicExch_unsigned_int_n7(Dummy* p, unsigned int v) { atomicExch(p, v); }
__global__ void atomicExch_unsigned_int_n8(unsigned int* p, Dummy v) { atomicExch(p, v); }
)";
// Source text handed to the runtime compiler: eight kernels that each call the
// unsigned long long overload of atomicExch with invalid argument types.
static constexpr const char* kAtomicExchULL =
R"(
struct Dummy {
__device__ Dummy() {}
__device__ ~Dummy() {}
};
__global__ void atomicExch_unsigned_long_long_n1(unsigned long long* p, unsigned long long v) {
atomicExch(p, p);
}
__global__ void atomicExch_unsigned_long_long_n2(unsigned long long* p, unsigned long long v) {
atomicExch(&p, v);
}
__global__ void atomicExch_unsigned_long_long_n3(char* p, unsigned long long v) {
atomicExch(p, v);
}
__global__ void atomicExch_unsigned_long_long_n4(short* p, unsigned long long v) {
atomicExch(p, v);
}
__global__ void atomicExch_unsigned_long_long_n5(long* p, unsigned long long v) {
atomicExch(p, v);
}
__global__ void atomicExch_unsigned_long_long_n6(long long* p, unsigned long long v) {
atomicExch(p, v);
}
__global__ void atomicExch_unsigned_long_long_n7(Dummy* p, unsigned long long v) {
atomicExch(p, v);
}
__global__ void atomicExch_unsigned_long_long_n8(unsigned long long* p, Dummy v) {
atomicExch(p, v);
}
)";
// Source text handed to the runtime compiler: eight kernels that each call the float overload
// of atomicExch with invalid argument types.
static constexpr const char* kAtomicExchFloat =
R"(
struct Dummy {
__device__ Dummy() {}
__device__ ~Dummy() {}
};
__global__ void atomicExch_float_n1(float* p, float v) { atomicExch(p, p); }
__global__ void atomicExch_float_n2(float* p, float v) { atomicExch(&p, v); }
__global__ void atomicExch_float_n3(char* p, float v) { atomicExch(p, v); }
__global__ void atomicExch_float_n4(short* p, float v) { atomicExch(p, v); }
__global__ void atomicExch_float_n5(long* p, float v) { atomicExch(p, v); }
__global__ void atomicExch_float_n6(long long* p, float v) { atomicExch(p, v); }
__global__ void atomicExch_float_n7(Dummy* p, float v) { atomicExch(p, v); }
__global__ void atomicExch_float_n8(float* p, Dummy v) { atomicExch(p, v); }
)";
// Source text handed to the runtime compiler: eight kernels that each call the double overload
// of atomicExch with invalid argument types.
static constexpr const char* kAtomicExchDouble =
R"(
struct Dummy {
__device__ Dummy() {}
__device__ ~Dummy() {}
};
__global__ void atomicExch_double_n1(double* p, double v) { atomicExch(p, p); }
__global__ void atomicExch_double_n2(double* p, double v) { atomicExch(&p, v); }
__global__ void atomicExch_double_n3(char* p, double v) { atomicExch(p, v); }
__global__ void atomicExch_double_n4(short* p, double v) { atomicExch(p, v); }
__global__ void atomicExch_double_n5(long* p, double v) { atomicExch(p, v); }
__global__ void atomicExch_double_n6(long long* p, double v) { atomicExch(p, v); }
__global__ void atomicExch_double_n7(Dummy* p, double v) { atomicExch(p, v); }
__global__ void atomicExch_double_n8(double* p, Dummy v) { atomicExch(p, v); }
)";
+235
파일 보기
@@ -0,0 +1,235 @@
/*
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "atomicExch_common.hh"
#include "atomicExch_system_negative_kernels_rtc.hh"
/**
* @addtogroup atomicExch_system atomicExch_system
* @{
* @ingroup AtomicsTest
*/
/**
* Test Description
* ------------------------
* - Executes a kernel two times concurrently on two devices wherein all threads will perform
* an atomic exchange into a runtime determined memory location. Each thread will exchange its own
* grid wide linear index + offset into the memory location, storing the return value into a
* separate output array slot corresponding to it. Once complete, the union of output array and
* exchange memory is validated to contain all values in the range [0, number_of_threads +
* number_of_exchange_memory_slots). Several memory access patterns are tested:
* -# All threads exchange to a single memory location
* -# Each thread exchanges into an array containing warp_size elements, using tid % warp_size
* for indexing
* -# Same as the above, but the exchange elements are spread out by L1 cache line size bytes.
*
* - The test is run for:
* - All overloads of atomicExch_system
* - hipMallocManaged, hipHostMalloc and hipHostRegister allocated exchange memory
* - Several grid and block dimension combinations
* Test source
* ------------------------
* - unit/atomics/atomicExch_system.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
#if HT_NVIDIA
TEMPLATE_TEST_CASE("Unit_atomicExch_system_Positive_Peer_GPUs", "", int, unsigned int,
                   unsigned long long, float) {
#else
TEMPLATE_TEST_CASE("Unit_atomicExch_system_Positive_Peer_GPUs", "", int, unsigned int,
                   unsigned long, unsigned long long, float, double) {
#endif  // HT_NVIDIA
  // The warp size of device 0 is used as the slot count in the multi-address sections.
  int warp_size = 0;
  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));

  // Byte distance between slots in the "scattered" section.
  const auto scattered_pitch = 128u;

  // Every pass re-runs the three access patterns as separately named dynamic sections.
  // NOTE(review): the leading (2, 2) arguments presumably select two devices and two kernel
  // launches, as the test description suggests -- confirm against atomicExch_common.hh.
  auto iteration = 0;
  while (iteration < cmd_options.iterations) {
    DYNAMIC_SECTION("Same address " << iteration) {
      AtomicExchMultipleDeviceMultipleKernelAndHostTest<TestType>(2, 2, 1, sizeof(TestType));
    }

    DYNAMIC_SECTION("Adjacent addresses " << iteration) {
      AtomicExchMultipleDeviceMultipleKernelAndHostTest<TestType>(2, 2, warp_size,
                                                                  sizeof(TestType));
    }

    DYNAMIC_SECTION("Scattered addresses " << iteration) {
      AtomicExchMultipleDeviceMultipleKernelAndHostTest<TestType>(2, 2, warp_size,
                                                                  scattered_pitch);
    }

    ++iteration;
  }
}
/**
* Test Description
* ------------------------
* - Executes a kernel on a single device wherein all threads will perform an atomic exchange
* into a runtime determined memory location. Each thread will exchange its own grid wide linear
* index + offset into the memory location, storing the return value into a separate output array
* slot corresponding to it. While the kernel is running, the host performs atomic exchanges, in 4
* threads, into the same memory location(s). Once complete, the union of output array, exchange
* memory, and host output is validated to contain all values in the range [0, number_of_threads +
* number_of_exchange_memory_slots + number_of_host_iterations). Several memory access patterns are
* tested:
* -# All threads exchange to a single memory location
* -# Each thread exchanges into an array containing warp_size elements, using tid % warp_size
* for indexing
* -# Same as the above, but the exchange elements are spread out by L1 cache line size bytes.
*
* - The test is run for:
* - All overloads of atomicExch_system
* - hipMallocManaged, hipHostMalloc and hipHostRegister allocated exchange memory
* - Several grid and block dimension combinations
* Test source
* ------------------------
* - unit/atomics/atomicExch_system.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
#if HT_NVIDIA
TEMPLATE_TEST_CASE("Unit_atomicExch_system_Positive_Host_And_GPU", "", int, unsigned int,
                   unsigned long long, float) {
#else
TEMPLATE_TEST_CASE("Unit_atomicExch_system_Positive_Host_And_GPU", "", int, unsigned int,
                   unsigned long, unsigned long long, float, double) {
#endif  // HT_NVIDIA
  // The warp size of device 0 is used as the slot count in the multi-address sections.
  int warp_size = 0;
  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));

  // Byte distance between slots in the "scattered" section.
  const auto scattered_pitch = 128u;

  // Every pass re-runs the three access patterns as separately named dynamic sections.
  // NOTE(review): the arguments (1, 1, ..., 4) presumably mean one device, one kernel launch and
  // four host threads, as the test description suggests -- confirm against atomicExch_common.hh.
  auto iteration = 0;
  while (iteration < cmd_options.iterations) {
    DYNAMIC_SECTION("Same address " << iteration) {
      AtomicExchMultipleDeviceMultipleKernelAndHostTest<TestType>(1, 1, 1, sizeof(TestType), 4);
    }

    DYNAMIC_SECTION("Adjacent addresses " << iteration) {
      AtomicExchMultipleDeviceMultipleKernelAndHostTest<TestType>(1, 1, warp_size,
                                                                  sizeof(TestType), 4);
    }

    DYNAMIC_SECTION("Scattered addresses " << iteration) {
      AtomicExchMultipleDeviceMultipleKernelAndHostTest<TestType>(1, 1, warp_size,
                                                                  scattered_pitch, 4);
    }

    ++iteration;
  }
}
/**
* Test Description
* ------------------------
* - Executes a kernel two times concurrently on two devices wherein all threads will perform
* an atomic exchange into a runtime determined memory location. Each thread will exchange its own
* grid wide linear index + offset into the memory location, storing the return value into a
* separate output array slot corresponding to it. While the kernels are running, the
* host performs atomic exchanges, in 4 threads, into the same memory location(s). Once complete,
* the union of output array, exchange memory, and host output is validated to contain all values in
* the range [0, number_of_threads + number_of_exchange_memory_slots + number_of_host_iterations).
* Several memory access patterns are tested:
* -# All threads exchange to a single memory location
* -# Each thread exchanges into an array containing warp_size elements, using tid % warp_size
* for indexing
* -# Same as the above, but the exchange elements are spread out by L1 cache line size bytes.
*
* - The test is run for:
* - All overloads of atomicExch_system
* - hipMallocManaged, hipHostMalloc and hipHostRegister allocated exchange memory
* - Several grid and block dimension combinations
* Test source
* ------------------------
* - unit/atomics/atomicExch_system.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
#if HT_NVIDIA
TEMPLATE_TEST_CASE("Unit_atomicExch_system_Positive_Host_And_Peer_GPUs", "", int, unsigned int,
                   unsigned long long, float) {
#else
TEMPLATE_TEST_CASE("Unit_atomicExch_system_Positive_Host_And_Peer_GPUs", "", int, unsigned int,
                   unsigned long, unsigned long long, float, double) {
#endif  // HT_NVIDIA
  // The warp size of device 0 is used as the slot count in the multi-address sections.
  int warp_size = 0;
  HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));

  // Byte distance between slots in the "scattered" section.
  const auto scattered_pitch = 128u;

  // Every pass re-runs the three access patterns as separately named dynamic sections.
  // NOTE(review): the arguments (2, 2, ..., 4) presumably mean two devices, two kernel launches
  // and four host threads, as the test description suggests -- confirm against
  // atomicExch_common.hh.
  auto iteration = 0;
  while (iteration < cmd_options.iterations) {
    DYNAMIC_SECTION("Same address " << iteration) {
      AtomicExchMultipleDeviceMultipleKernelAndHostTest<TestType>(2, 2, 1, sizeof(TestType), 4);
    }

    DYNAMIC_SECTION("Adjacent addresses " << iteration) {
      AtomicExchMultipleDeviceMultipleKernelAndHostTest<TestType>(2, 2, warp_size,
                                                                  sizeof(TestType), 4);
    }

    DYNAMIC_SECTION("Scattered addresses " << iteration) {
      AtomicExchMultipleDeviceMultipleKernelAndHostTest<TestType>(2, 2, warp_size,
                                                                  scattered_pitch, 4);
    }

    ++iteration;
  }
}
/**
* Test Description
* ------------------------
* - RTCs kernels that pass combinations of arguments of invalid types for all overloads of
* atomicExch_system
* Test source
* ------------------------
* - unit/atomics/atomicExch_system.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Unit_atomicExch_system_Negative_Parameters_RTC") {
  // One run per overload family; every source string holds eight kernels.
  const auto program_source =
      GENERATE(kAtomicExchSystemInt, kAtomicExchSystemUnsignedInt, kAtomicExchSystemULL,
               kAtomicExchSystemFloat, kAtomicExchSystemDouble);

  hiprtcProgram program{};
  HIPRTC_CHECK(
      hiprtcCreateProgram(&program, program_source, "atomicExch_negative.cc", 0, nullptr, nullptr));

  // The compile result is only inspected after the log has been read and the program destroyed.
  hiprtcResult result{hiprtcCompileProgram(program, 0, nullptr)};

  // Fetch the compile log.
  size_t log_size{};
  HIPRTC_CHECK(hiprtcGetProgramLogSize(program, &log_size));
  std::string log(log_size, ' ');
  HIPRTC_CHECK(hiprtcGetProgramLog(program, log.data()));

  // Count every occurrence of the compiler's "error:" marker in the log.
  const std::string error_message{"error:"};
  int error_count{0};
  for (size_t n_pos = log.find(error_message, 0); n_pos != std::string::npos;
       n_pos = log.find(error_message, n_pos + 1)) {
    ++error_count;
  }

  HIPRTC_CHECK(hiprtcDestroyProgram(&program));

  // Compilation has to fail, with exactly one diagnostic per kernel.
  int expected_error_count{8};
  HIPRTC_CHECK_ERROR(result, HIPRTC_ERROR_COMPILATION);
  REQUIRE(error_count == expected_error_count);
}
+112
파일 보기
@@ -0,0 +1,112 @@
/*
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip_test_common.hh>
// User-defined type with device-side constructor and destructor. The kernels in this file pass a
// Dummy value or a Dummy pointer to atomicExch_system.
// NOTE(review): presumably chosen as a type no atomicExch_system overload accepts -- confirm.
struct Dummy {
__device__ Dummy() {}
__device__ ~Dummy() {}
};
/*int atomicExch_system(int*, int)*/
// Kernels n1..n8 each make a single atomicExch_system call whose arguments differ from
// (int*, int):
//   n1: the value argument is the int* itself
//   n2: the address argument is the address of the pointer parameter (int**)
//   n3..n6: the address argument points to char / short / long / long long
//   n7: the address argument points to Dummy
//   n8: the value argument is a Dummy
// NOTE(review): presumably each call is expected to produce one compiler error -- the expected
// count is configured outside this file.
__global__ void atomicExch_system_int_n1(int* p, int v) { atomicExch_system(p, p); }
__global__ void atomicExch_system_int_n2(int* p, int v) { atomicExch_system(&p, v); }
__global__ void atomicExch_system_int_n3(char* p, int v) { atomicExch_system(p, v); }
__global__ void atomicExch_system_int_n4(short* p, int v) { atomicExch_system(p, v); }
__global__ void atomicExch_system_int_n5(long* p, int v) { atomicExch_system(p, v); }
__global__ void atomicExch_system_int_n6(long long* p, int v) { atomicExch_system(p, v); }
__global__ void atomicExch_system_int_n7(Dummy* p, int v) { atomicExch_system(p, v); }
__global__ void atomicExch_system_int_n8(int* p, Dummy v) { atomicExch_system(p, v); }
/*unsigned int atomicExch_system(unsigned int*, unsigned int)*/
// Kernels n1..n8 each make a single atomicExch_system call whose arguments differ from
// (unsigned int*, unsigned int):
//   n1: the value argument is the unsigned int* itself
//   n2: the address argument is the address of the pointer parameter (unsigned int**)
//   n3..n6: the address argument points to char / short / long / long long
//   n7: the address argument points to Dummy
//   n8: the value argument is a Dummy
// NOTE(review): presumably each call is expected to produce one compiler error -- the expected
// count is configured outside this file.
__global__ void atomicExch_system_unsigned_int_n1(unsigned int* p, unsigned int v) {
atomicExch_system(p, p);
}
__global__ void atomicExch_system_unsigned_int_n2(unsigned int* p, unsigned int v) {
atomicExch_system(&p, v);
}
__global__ void atomicExch_system_unsigned_int_n3(char* p, unsigned int v) {
atomicExch_system(p, v);
}
__global__ void atomicExch_system_unsigned_int_n4(short* p, unsigned int v) {
atomicExch_system(p, v);
}
__global__ void atomicExch_system_unsigned_int_n5(long* p, unsigned int v) {
atomicExch_system(p, v);
}
__global__ void atomicExch_system_unsigned_int_n6(long long* p, unsigned int v) {
atomicExch_system(p, v);
}
__global__ void atomicExch_system_unsigned_int_n7(Dummy* p, unsigned int v) {
atomicExch_system(p, v);
}
__global__ void atomicExch_system_unsigned_int_n8(unsigned int* p, Dummy v) {
atomicExch_system(p, v);
}
/*unsigned long long atomicExch_system(unsigned long long*, unsigned long long)*/
// Kernels n1..n8 each make a single atomicExch_system call whose arguments differ from
// (unsigned long long*, unsigned long long):
//   n1: the value argument is the unsigned long long* itself
//   n2: the address argument is the address of the pointer parameter (unsigned long long**)
//   n3..n6: the address argument points to char / short / long / long long
//   n7: the address argument points to Dummy
//   n8: the value argument is a Dummy
// NOTE(review): presumably each call is expected to produce one compiler error -- the expected
// count is configured outside this file.
__global__ void atomicExch_system_unsigned_long_long_n1(unsigned long long* p,
unsigned long long v) {
atomicExch_system(p, p);
}
__global__ void atomicExch_system_unsigned_long_long_n2(unsigned long long* p,
unsigned long long v) {
atomicExch_system(&p, v);
}
__global__ void atomicExch_system_unsigned_long_long_n3(char* p, unsigned long long v) {
atomicExch_system(p, v);
}
__global__ void atomicExch_system_unsigned_long_long_n4(short* p, unsigned long long v) {
atomicExch_system(p, v);
}
__global__ void atomicExch_system_unsigned_long_long_n5(long* p, unsigned long long v) {
atomicExch_system(p, v);
}
__global__ void atomicExch_system_unsigned_long_long_n6(long long* p, unsigned long long v) {
atomicExch_system(p, v);
}
__global__ void atomicExch_system_unsigned_long_long_n7(Dummy* p, unsigned long long v) {
atomicExch_system(p, v);
}
__global__ void atomicExch_system_unsigned_long_long_n8(unsigned long long* p, Dummy v) {
atomicExch_system(p, v);
}
/*float atomicExch_system(float*, float)*/
// Kernels n1..n8 each make a single atomicExch_system call whose arguments differ from
// (float*, float):
//   n1: the value argument is the float* itself
//   n2: the address argument is the address of the pointer parameter (float**)
//   n3..n6: the address argument points to char / short / long / long long
//   n7: the address argument points to Dummy
//   n8: the value argument is a Dummy
// NOTE(review): presumably each call is expected to produce one compiler error -- the expected
// count is configured outside this file.
__global__ void atomicExch_system_float_n1(float* p, float v) { atomicExch_system(p, p); }
__global__ void atomicExch_system_float_n2(float* p, float v) { atomicExch_system(&p, v); }
__global__ void atomicExch_system_float_n3(char* p, float v) { atomicExch_system(p, v); }
__global__ void atomicExch_system_float_n4(short* p, float v) { atomicExch_system(p, v); }
__global__ void atomicExch_system_float_n5(long* p, float v) { atomicExch_system(p, v); }
__global__ void atomicExch_system_float_n6(long long* p, float v) { atomicExch_system(p, v); }
__global__ void atomicExch_system_float_n7(Dummy* p, float v) { atomicExch_system(p, v); }
__global__ void atomicExch_system_float_n8(float* p, Dummy v) { atomicExch_system(p, v); }
/*double atomicExch_system(double*, double)*/
// Kernels n1..n8 each make a single atomicExch_system call whose arguments differ from
// (double*, double):
//   n1: the value argument is the double* itself
//   n2: the address argument is the address of the pointer parameter (double**)
//   n3..n6: the address argument points to char / short / long / long long
//   n7: the address argument points to Dummy
//   n8: the value argument is a Dummy
// NOTE(review): presumably each call is expected to produce one compiler error -- the expected
// count is configured outside this file.
__global__ void atomicExch_system_double_n1(double* p, double v) { atomicExch_system(p, p); }
__global__ void atomicExch_system_double_n2(double* p, double v) { atomicExch_system(&p, v); }
__global__ void atomicExch_system_double_n3(char* p, double v) { atomicExch_system(p, v); }
__global__ void atomicExch_system_double_n4(short* p, double v) { atomicExch_system(p, v); }
__global__ void atomicExch_system_double_n5(long* p, double v) { atomicExch_system(p, v); }
__global__ void atomicExch_system_double_n6(long long* p, double v) { atomicExch_system(p, v); }
__global__ void atomicExch_system_double_n7(Dummy* p, double v) { atomicExch_system(p, v); }
__global__ void atomicExch_system_double_n8(double* p, Dummy v) { atomicExch_system(p, v); }
@@ -0,0 +1,142 @@
/*
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
// Runtime-compiled source for the int overload of atomicExch_system: a Dummy type plus eight
// kernels (n1..n8), each with one call whose argument types do not match (int*, int).
// Unit_atomicExch_system_Negative_Parameters_RTC feeds this text to hiprtcCreateProgram and
// expects the compile log to contain eight "error:" entries.
static constexpr auto kAtomicExchSystemInt{
R"(
struct Dummy {
__device__ Dummy() {}
__device__ ~Dummy() {}
};
__global__ void atomicExch_system_int_n1(int* p, int v) { atomicExch_system(p, p); }
__global__ void atomicExch_system_int_n2(int* p, int v) { atomicExch_system(&p, v); }
__global__ void atomicExch_system_int_n3(char* p, int v) { atomicExch_system(p, v); }
__global__ void atomicExch_system_int_n4(short* p, int v) { atomicExch_system(p, v); }
__global__ void atomicExch_system_int_n5(long* p, int v) { atomicExch_system(p, v); }
__global__ void atomicExch_system_int_n6(long long* p, int v) { atomicExch_system(p, v); }
__global__ void atomicExch_system_int_n7(Dummy* p, int v) { atomicExch_system(p, v); }
__global__ void atomicExch_system_int_n8(int* p, Dummy v) { atomicExch_system(p, v); }
)"};
// Runtime-compiled source for the unsigned int overload of atomicExch_system: a Dummy type plus
// eight kernels (n1..n8), each with one call whose argument types do not match
// (unsigned int*, unsigned int).
// Unit_atomicExch_system_Negative_Parameters_RTC feeds this text to hiprtcCreateProgram and
// expects the compile log to contain eight "error:" entries.
static constexpr auto kAtomicExchSystemUnsignedInt{
R"(
struct Dummy {
__device__ Dummy() {}
__device__ ~Dummy() {}
};
__global__ void atomicExch_system_unsigned_int_n1(unsigned int* p, unsigned int v) {
atomicExch_system(p, p);
}
__global__ void atomicExch_system_unsigned_int_n2(unsigned int* p, unsigned int v) {
atomicExch_system(&p, v);
}
__global__ void atomicExch_system_unsigned_int_n3(char* p, unsigned int v) {
atomicExch_system(p, v);
}
__global__ void atomicExch_system_unsigned_int_n4(short* p, unsigned int v) {
atomicExch_system(p, v);
}
__global__ void atomicExch_system_unsigned_int_n5(long* p, unsigned int v) {
atomicExch_system(p, v);
}
__global__ void atomicExch_system_unsigned_int_n6(long long* p, unsigned int v) {
atomicExch_system(p, v);
}
__global__ void atomicExch_system_unsigned_int_n7(Dummy* p, unsigned int v) {
atomicExch_system(p, v);
}
__global__ void atomicExch_system_unsigned_int_n8(unsigned int* p, Dummy v) {
atomicExch_system(p, v);
}
)"};
// Runtime-compiled source for the unsigned long long overload of atomicExch_system: a Dummy type
// plus eight kernels (n1..n8), each with one call whose argument types do not match
// (unsigned long long*, unsigned long long).
// Unit_atomicExch_system_Negative_Parameters_RTC feeds this text to hiprtcCreateProgram and
// expects the compile log to contain eight "error:" entries.
static constexpr auto kAtomicExchSystemULL{
R"(
struct Dummy {
__device__ Dummy() {}
__device__ ~Dummy() {}
};
__global__ void atomicExch_system_unsigned_long_long_n1(unsigned long long* p,
unsigned long long v) {
atomicExch_system(p, p);
}
__global__ void atomicExch_system_unsigned_long_long_n2(unsigned long long* p,
unsigned long long v) {
atomicExch_system(&p, v);
}
__global__ void atomicExch_system_unsigned_long_long_n3(char* p, unsigned long long v) {
atomicExch_system(p, v);
}
__global__ void atomicExch_system_unsigned_long_long_n4(short* p, unsigned long long v) {
atomicExch_system(p, v);
}
__global__ void atomicExch_system_unsigned_long_long_n5(long* p, unsigned long long v) {
atomicExch_system(p, v);
}
__global__ void atomicExch_system_unsigned_long_long_n6(long long* p, unsigned long long v) {
atomicExch_system(p, v);
}
__global__ void atomicExch_system_unsigned_long_long_n7(Dummy* p, unsigned long long v) {
atomicExch_system(p, v);
}
__global__ void atomicExch_system_unsigned_long_long_n8(unsigned long long* p, Dummy v) {
atomicExch_system(p, v);
}
)"};
// Runtime-compiled source for the float overload of atomicExch_system: a Dummy type plus eight
// kernels (n1..n8), each with one call whose argument types do not match (float*, float).
// Unit_atomicExch_system_Negative_Parameters_RTC feeds this text to hiprtcCreateProgram and
// expects the compile log to contain eight "error:" entries.
static constexpr auto kAtomicExchSystemFloat{
R"(
struct Dummy {
__device__ Dummy() {}
__device__ ~Dummy() {}
};
__global__ void atomicExch_system_float_n1(float* p, float v) { atomicExch_system(p, p); }
__global__ void atomicExch_system_float_n2(float* p, float v) { atomicExch_system(&p, v); }
__global__ void atomicExch_system_float_n3(char* p, float v) { atomicExch_system(p, v); }
__global__ void atomicExch_system_float_n4(short* p, float v) { atomicExch_system(p, v); }
__global__ void atomicExch_system_float_n5(long* p, float v) { atomicExch_system(p, v); }
__global__ void atomicExch_system_float_n6(long long* p, float v) { atomicExch_system(p, v); }
__global__ void atomicExch_system_float_n7(Dummy* p, float v) { atomicExch_system(p, v); }
__global__ void atomicExch_system_float_n8(float* p, Dummy v) { atomicExch_system(p, v); }
)"};
// Runtime-compiled source for the double overload of atomicExch_system: a Dummy type plus eight
// kernels (n1..n8), each with one call whose argument types do not match (double*, double).
// Unit_atomicExch_system_Negative_Parameters_RTC feeds this text to hiprtcCreateProgram and
// expects the compile log to contain eight "error:" entries.
static constexpr auto kAtomicExchSystemDouble{
R"(
struct Dummy {
__device__ Dummy() {}
__device__ ~Dummy() {}
};
__global__ void atomicExch_system_double_n1(double* p, double v) { atomicExch_system(p, p); }
__global__ void atomicExch_system_double_n2(double* p, double v) { atomicExch_system(&p, v); }
__global__ void atomicExch_system_double_n3(char* p, double v) { atomicExch_system(p, v); }
__global__ void atomicExch_system_double_n4(short* p, double v) { atomicExch_system(p, v); }
__global__ void atomicExch_system_double_n5(long* p, double v) { atomicExch_system(p, v); }
__global__ void atomicExch_system_double_n6(long long* p, double v) { atomicExch_system(p, v); }
__global__ void atomicExch_system_double_n7(Dummy* p, double v) { atomicExch_system(p, v); }
__global__ void atomicExch_system_double_n8(double* p, Dummy v) { atomicExch_system(p, v); }
)"};
+107
파일 보기
@@ -0,0 +1,107 @@
# Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
import subprocess
import sys
import unittest
class CompileAndCapture(unittest.TestCase):
path = None
expected_error_count = 0
expected_warning_count = 0
hip_path = None
file = None
error_string = None
warning_string = None
platform = None
def setUp(self):
self.error_string = 'error:'
self.warning_string = 'warning:'
self.assertFalse(self.hip_path == None)
self.assertFalse(self.path == None)
self.assertFalse(self.file == None)
self.assertTrue(self.platform == 'amd' or self.platform == 'nvidia')
def test(self):
compiler_args = [
self.hip_path + '/bin/hipcc',
'-I' + self.path + '/../../external/Catch2',
'-I' + self.path + '/../../include',
'-I' + self.path + '/../../external/picojson',
'-c',
self.path + '/' + self.file,
]
# HIP compiler on AMD platforms has limit of 20 errors, and some negative
# test cases expect that more errors are detected.
if (self.platform == 'amd'):
compiler_args.append('-ferror-limit=100')
compiler_output = subprocess.run(compiler_args, stderr=subprocess.PIPE)
# Get the compiler output in the stdout if -V flag is raised during ctest invocation.
compiler_stderr = compiler_output.stderr.decode('UTF-8')
print(compiler_stderr)
error_count = compiler_stderr.count(self.error_string)
if self.expected_error_count < 0:
self.assertGreater(error_count, 0)
else:
self.assertEqual(error_count, self.expected_error_count)
warning_count = compiler_stderr.count(self.warning_string)
if self.expected_warning_count < 0:
self.assertGreater(warning_count, 0)
else:
self.assertEqual(warning_count, self.expected_warning_count)
if __name__ == '__main__':
    # Positional arguments:
    #   <path> <platform> <hip_path> <file> [expected_errors] [expected_warnings]
    # A missing string argument becomes None and a missing count becomes 0.
    cli = sys.argv
    supplied = len(cli)

    CompileAndCapture.path = cli[1] if supplied > 1 else None
    CompileAndCapture.platform = cli[2] if supplied > 2 else None
    CompileAndCapture.hip_path = cli[3] if supplied > 3 else None
    CompileAndCapture.file = cli[4] if supplied > 4 else None
    CompileAndCapture.expected_error_count = int(cli[5]) if supplied > 5 else 0
    CompileAndCapture.expected_warning_count = int(cli[6]) if supplied > 6 else 0

    # Unittest looks at the same argv's as the __main__ and doesn't know how
    # to handle arguments other than the executable (0). Therefore passing only
    # executable as the argv for unittest module.
    unittest.main(argv=[cli[0]])
+11 -12
파일 보기
@@ -1,26 +1,25 @@
# Common Tests - Test independent of all platforms
# Source files of the coopGrpTest executable; this list is passed to
# hip_add_exe_to_target() as TEST_SRC.
set(TEST_SRC
hipCGThreadBlockType.cc
hipCGThreadBlockTypeViaBaseType.cc
hipCGThreadBlockTypeViaPublicApi.cc
hipCGMultiGridGroupType.cc
hipCGMultiGridGroupTypeViaBaseType.cc
hipCGMultiGridGroupTypeViaPublicApi.cc
hipCGThreadBlockType_old.cc
hipCGMultiGridGroupType_old.cc
hipCGGridGroupType_old.cc
hipCGTiledPartitionType_old.cc
hipCGThreadBlockTileTypeShfl_old.cc
hipCGCoalescedGroups_old.cc
hipLaunchCooperativeKernel_old.cc
hipLaunchCooperativeKernelMultiDevice_old.cc
grid_group.cc
coalesced_groups_shfl_down.cc
coalesced_groups_shfl_up.cc
hipCGTiledPartition.cc
hipCGCoalescedGroups.cc
coalesced_tiled_groups_metagrp.cc
)
if(HIP_PLATFORM STREQUAL "nvidia")
set_source_files_properties(hipCGMultiGridGroupType.cc PROPERTIES COMPILE_FLAGS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
set_source_files_properties(hipCGMultiGridGroupTypeViaBaseType.cc PROPERTIES COMPILE_FLAGS "-D_CG_ABI_EXPERIMENTAL -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
set_source_files_properties(hipCGMultiGridGroupTypeViaPublicApi.cc PROPERTIES COMPILE_FLAGS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
set_source_files_properties(hipCGMultiGridGroupType_old.cc PROPERTIES COMPILE_FLAGS "-D_CG_ABI_EXPERIMENTAL -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
set_source_files_properties(hipLaunchCooperativeKernelMultiDevice_old.cc PROPERTIES COMPILE_FLAGS "-D_CG_ABI_EXPERIMENTAL -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
hip_add_exe_to_target(NAME coopGrpTest
TEST_SRC ${TEST_SRC}
TEST_TARGET_NAME build_tests
LINKER_LIBS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80")
LINKER_LIBS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80, -gencode arch=compute_86,code=sm_86, -gencode=arch=compute_86,code=compute_86")
else()
hip_add_exe_to_target(NAME coopGrpTest
TEST_SRC ${TEST_SRC}
+496
파일 보기
@@ -0,0 +1,496 @@
/*
Copyright (c) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <vector>

#include <hip_test_common.hh>
#include <hip/hip_cooperative_groups.h>
#include "hip_cg_common.hh"
// Short alias for the cooperative groups namespace used by every kernel in this file.
namespace cg = cooperative_groups;
// Two-slot device-global buffer used by the grid_group kernels to check sync(): thread 0 of
// block 0 stores 10 in gm[0], thread 0 of block 1 stores 20 in gm[1], and after the grid-wide
// sync every thread reads both slots.
static __device__ int gm[2];
/**
 * Exercises the cg::grid_group member functions. Each thread writes, at its global linear index,
 * the group size, its rank within the group and the is_valid() flag, and after a grid-wide sync
 * the product gm[1] * gm[0].
 */
static __global__ void kernel_cg_grid_group_type(int* size_dev, int* thd_rank_dev,
                                                 int* is_valid_dev, int* sync_dev) {
  cg::grid_group grid = cg::this_grid();
  const int slot = (blockIdx.x * blockDim.x) + threadIdx.x;

  size_dev[slot] = grid.size();              // size()
  thd_rank_dev[slot] = grid.thread_rank();   // thread_rank()
  is_valid_dev[slot] = grid.is_valid();      // is_valid()

  // sync(): the first thread of block 0 and of block 1 each publish one value ...
  if (threadIdx.x == 0) {
    if (blockIdx.x == 0) {
      gm[0] = 10;
    } else if (blockIdx.x == 1) {
      gm[1] = 20;
    }
  }
  grid.sync();
  // ... and every thread in the grid reads both values after the barrier.
  sync_dev[slot] = gm[1] * gm[0];
}
/**
 * Same checks as kernel_cg_grid_group_type, but the grid group is accessed through the
 * cg::thread_group base type. Each thread writes, at its global linear index, the group size,
 * its rank, a validity flag and, after a sync through the base type, the product gm[1] * gm[0].
 */
static __global__ void kernel_cg_grid_group_type_via_base_type(int* size_dev, int* thd_rank_dev,
                                                               int* is_valid_dev, int* sync_dev) {
  cg::thread_group group = cg::this_grid();
  const int slot = (blockIdx.x * blockDim.x) + threadIdx.x;

  size_dev[slot] = group.size();              // size()
  thd_rank_dev[slot] = group.thread_rank();   // thread_rank()

  // is_valid()
#ifdef __HIP_PLATFORM_AMD__
  is_valid_dev[slot] = group.is_valid();
#else
  // Cuda has no thread_group.is_valid(), so the flag is reported as true.
  is_valid_dev[slot] = true;
#endif

  // sync(): the first thread of block 0 and of block 1 each publish one value ...
  if (threadIdx.x == 0) {
    if (blockIdx.x == 0) {
      gm[0] = 10;
    } else if (blockIdx.x == 1) {
      gm[1] = 20;
    }
  }
  group.sync();
  // ... and every thread reads both values after the barrier.
  sync_dev[slot] = gm[1] * gm[0];
}
/**
 * Same checks as kernel_cg_grid_group_type, but size, rank and sync go through the free
 * functions cg::group_size, cg::thread_rank and cg::sync; is_valid() is still called as a
 * member. Results are written at each thread's global linear index.
 */
static __global__ void kernel_cg_grid_group_type_via_public_api(int* size_dev, int* thd_rank_dev,
                                                                int* is_valid_dev, int* sync_dev) {
  cg::grid_group grid = cg::this_grid();
  const int slot = (blockIdx.x * blockDim.x) + threadIdx.x;

  size_dev[slot] = cg::group_size(grid);        // group_size api
  thd_rank_dev[slot] = cg::thread_rank(grid);   // thread_rank api
  is_valid_dev[slot] = grid.is_valid();         // is_valid api

  // sync api: the first thread of block 0 and of block 1 each publish one value ...
  if (threadIdx.x == 0) {
    if (blockIdx.x == 0) {
      gm[0] = 10;
    } else if (blockIdx.x == 1) {
      gm[1] = 20;
    }
  }
  cg::sync(grid);
  // ... and every thread reads both values after the barrier.
  sync_dev[slot] = gm[1] * gm[0];
}
/**
 * Cooperative kernel that repeatedly mixes two arrays, with a grid-wide barrier between phases.
 *
 * Each of the `loops` iterations has two phases, both followed by grid.sync():
 *  1. second_array[k] += first_array[k]
 *  2. first_array[k]  += second_array[array_len - 1 - k]
 * Every thread handles the elements k = rank, rank + grid_size, rank + 2 * grid_size, ...
 *
 * Counters and offsets are unsigned to match `loops`, `array_len`, the thread rank and the grid
 * size, which avoids signed/unsigned comparisons and signed overflow for large inputs.
 *
 * @param first_array   device array of array_len elements, read in phase 1, updated in phase 2
 * @param second_array  device array of array_len elements, updated in phase 1, read in phase 2
 * @param loops         number of two-phase iterations
 * @param array_len     number of elements in each array
 */
static __global__ void coop_kernel(unsigned int* first_array, unsigned int* second_array,
                                   unsigned int loops, unsigned int array_len) {
  cg::grid_group grid = cg::this_grid();
  const unsigned int rank = grid.thread_rank();
  const unsigned int grid_size = grid.size();

  for (unsigned int i = 0; i < loops; i++) {
    // The goal of this loop is to directly add in values from
    // array one into array two.
    for (unsigned int offset = rank; offset < array_len; offset += grid_size) {
      second_array[offset] += first_array[offset];
    }

    // All of phase 1 must be visible before any thread reads its mirror element.
    grid.sync();

    // The goal of this loop is to pull data the "mirror" lane in
    // array two and add it back into array one. This causes inter-
    // thread swizzling.
    for (unsigned int offset = rank; offset < array_len; offset += grid_size) {
      const unsigned int swizzle_offset = array_len - offset - 1;
      first_array[offset] += second_array[swizzle_offset];
    }

    // Phase 2 must finish before the next iteration modifies second_array again.
    grid.sync();
  }
}
// Barrier stress kernel. For each of `loops` iterations, thread 0 of every block stores the value
// returned by atomicInc(&atomic_val[0], UINT_MAX) into array[offset], then the whole grid
// synchronizes. `offset` starts at blockIdx.x and advances by gridDim.x per iteration, so each
// iteration fills one gridDim.x-wide slice of `array`. Before the atomic, the thread with the
// highest grid rank busy-waits on clock64() until at least 1000000 ticks have accumulated.
// NOTE(review): `i` is a signed int compared against the unsigned `loops` -- harmless for small
// loop counts, but worth aligning the types.
static __global__ void test_kernel(unsigned int* atomic_val, unsigned int* array,
unsigned int loops) {
cg::grid_group grid = cg::this_grid();
unsigned rank = grid.thread_rank();
int offset = blockIdx.x;
for (int i = 0; i < loops; i++) {
// Make the last thread run way behind everyone else.
// If the barrier below fails, then the other threads may hit the
// atomicInc instruction many times before the last thread ever gets to it.
// As such, without the barrier, the last array entry will eventually
// contain a very large value, defined by however many times the other
// wavefronts make it through this loop.
// If the barrier works, then it will likely contain some number
// near "total number of blocks". It will be the last wavefront to
// reach the atomicInc, but everyone will have only hit the atomic once.
if (rank == (grid.size() - 1)) {
long long time_diff = 0;
long long last_clock = clock64();
do {
long long cur_clock = clock64();
// Only forward progress of the counter is accumulated.
if (cur_clock > last_clock) {
time_diff += (cur_clock - last_clock);
}
// If it rolls over, we don't know how much to add to catch up.
// So just ignore those slipped cycles.
last_clock = cur_clock;
} while (time_diff < 1000000);
}
// One atomic increment per block per iteration; the returned value is recorded.
if (threadIdx.x == 0) {
array[offset] = atomicInc(&atomic_val[0], UINT_MAX);
}
grid.sync();
// Move to this block's slot in the next iteration's slice.
offset += gridDim.x;
}
}
// Variant of test_kernel whose delay loop reads wall_clock64() instead of clock64(). The whole
// body is compiled only when HT_AMD is set; otherwise the kernel is empty. For each of `loops`
// iterations, thread 0 of every block stores the value returned by
// atomicInc(&atomic_val[0], UINT_MAX) into array[offset], then the whole grid synchronizes.
// `offset` starts at blockIdx.x and advances by gridDim.x per iteration.
// NOTE(review): unlike the other kernels in this file this one is not declared static, and `i` is
// a signed int compared against the unsigned `loops` -- confirm both are intended.
__global__ void test_kernel_gfx11(unsigned int* atomic_val, unsigned int* array,
unsigned int loops) {
#if HT_AMD
cg::grid_group grid = cg::this_grid();
unsigned rank = grid.thread_rank();
int offset = blockIdx.x;
for (int i = 0; i < loops; i++) {
// Make the last thread run way behind everyone else.
// If the barrier below fails, then the other threads may hit the
// atomicInc instruction many times before the last thread ever gets
// to it.
// As such, without the barrier, the last array entry will eventually
// contain a very large value, defined by however many times the other
// wavefronts make it through this loop.
// If the barrier works, then it will likely contain some number
// near "total number of blocks". It will be the last wavefront to
// reach the atomicInc, but everyone will have only hit the atomic once.
if (rank == (grid.size() - 1)) {
long long time_diff = 0;
long long last_clock = wall_clock64();
do {
long long cur_clock = wall_clock64();
// Only forward progress of the counter is accumulated.
if (cur_clock > last_clock) {
time_diff += (cur_clock - last_clock);
}
// If it rolls over, we don't know how much to add to catch up.
// So just ignore those slipped cycles.
last_clock = cur_clock;
} while (time_diff < 1000000);
}
// One atomic increment per block per iteration; the returned value is recorded.
if (threadIdx.x == 0) {
array[offset] = atomicInc(&atomic_val[0], UINT_MAX);
}
grid.sync();
// Move to this block's slot in the next iteration's slice.
offset += gridDim.x;
}
#endif
}
/**
 * Recomputes on the host what coop_kernel produces and compares it with the device results.
 *
 * The reference for the first array is accumulated in place in host_input, so host_input is
 * overwritten by this call. The reference for the second array starts zeroed and lives in a
 * std::vector, so it is released even when a failing REQUIRE aborts the test.
 *
 * @param host_input    initial contents of the first array; overwritten with its expected result
 * @param first_array   first array as copied back from the device
 * @param second_array  second array as copied back from the device
 * @param loops         number of iterations the kernel ran
 * @param array_len     number of elements in each array
 */
static void verify_coop_buffers(unsigned int* host_input, unsigned int* first_array,
                                unsigned int* second_array, unsigned int loops,
                                unsigned int array_len) {
  unsigned int* expected_first_array = host_input;
  std::vector<unsigned int> expected_second_array(array_len, 0u);

  // Replay both phases of every kernel iteration sequentially.
  for (unsigned int iteration = 0; iteration < loops; iteration++) {
    for (unsigned int offset = 0; offset < array_len; offset++) {
      expected_second_array[offset] += expected_first_array[offset];
    }
    for (unsigned int offset = 0; offset < array_len; offset++) {
      const unsigned int swizzle_offset = array_len - offset - 1;
      expected_first_array[offset] += expected_second_array[swizzle_offset];
    }
  }

  for (unsigned int i = 0; i < array_len; i++) {
    REQUIRE(first_array[i] == expected_first_array[i]);
    REQUIRE(second_array[i] == expected_second_array[i]);
  }
}
// Checks the values recorded by the barrier kernel.
//
// host_buffer is read as `loops` consecutive groups of `warps` entries.
// Every entry of group i must not exceed (i + 1) * warps.
static void verify_barrier_buffer(unsigned int loops, unsigned int warps,
                                  unsigned int* host_buffer) {
  for (unsigned int i = 0; i != loops; ++i) {
    // Upper bound for anything recorded during iteration i.
    const unsigned int max_in_this_loop = (i + 1) * warps;
    unsigned int j = 0;
    while (j != warps) {
      REQUIRE(host_buffer[i * warps + j] <= max_in_this_loop);
      ++j;
    }
  }
}
// Launches `kernel_func` cooperatively with 2 blocks of `block_size` threads
// and validates the four per-thread result buffers it fills:
//   size        must equal 2 * block_size for every thread
//   thread rank must equal the thread's index in the buffer
//   is_valid    must be 1
//   sync        must be 200
// The kernel receives the four device buffers in exactly that order.
template <typename F> static void test_cg_grid_group_type(F kernel_func, int block_size) {
  constexpr int kNumBlocks = 2;
  enum BufferId { kSize = 0, kThreadRank, kIsValid, kSync, kNumBuffers };

  const int total_threads = kNumBlocks * block_size;
  int num_bytes = sizeof(int) * 2 * block_size;

  int* device_bufs[kNumBuffers];
  int* host_bufs[kNumBuffers];

  // Device buffers first, then their host counterparts.
  for (int b = 0; b < kNumBuffers; ++b) {
    HIP_CHECK(hipMalloc(&device_bufs[b], num_bytes));
  }
  for (int b = 0; b < kNumBuffers; ++b) {
    HIP_CHECK(hipHostMalloc(&host_bufs[b], num_bytes));
  }

  // Kernel arguments are the device buffer pointers, in BufferId order.
  void* params[kNumBuffers];
  for (int b = 0; b < kNumBuffers; ++b) {
    params[b] = &device_bufs[b];
  }
  HIP_CHECK(hipLaunchCooperativeKernel(kernel_func, kNumBlocks, block_size, params, 0, 0));

  // Bring every result buffer back to the host.
  for (int b = 0; b < kNumBuffers; ++b) {
    HIP_CHECK(hipMemcpy(host_bufs[b], device_bufs[b], num_bytes, hipMemcpyDeviceToHost));
  }

  // Both blocks are validated together.
  for (int tid = 0; tid < total_threads; ++tid) {
    ASSERT_EQUAL(host_bufs[kSize][tid], total_threads);
    ASSERT_EQUAL(host_bufs[kThreadRank][tid], tid);
    ASSERT_EQUAL(host_bufs[kIsValid][tid], 1);
    ASSERT_EQUAL(host_bufs[kSync][tid], 200);
  }

  // Release device memory, then host memory.
  for (int b = 0; b < kNumBuffers; ++b) {
    HIP_CHECK(hipFree(device_bufs[b]));
  }
  for (int b = 0; b < kNumBuffers; ++b) {
    HIP_CHECK(hipHostFree(host_bufs[b]));
  }
}
// Exercises the grid_group kernels on the current device for a range of block
// sizes. Each SECTION picks one kernel variant; the block-size sweep below is
// shared by all of them.
TEST_CASE("Unit_hipCGGridGroupType_Basic") {
  // Use default device for validating the test
  int device = 0;
  hipDeviceProp_t device_properties;
  HIP_CHECK(hipGetDevice(&device));
  HIP_CHECK(hipGetDeviceProperties(&device_properties, device));
  if (!device_properties.cooperativeLaunch) {
    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
    return;
  }
  // Start from nullptr so that a pass in which no SECTION body runs (for
  // example when sections are filtered on the command line) can be detected
  // instead of launching through an indeterminate pointer.
  void* (*kernel_func)(void) = nullptr;
  SECTION("Default grid group API test") {
    kernel_func = reinterpret_cast<void* (*)()>(kernel_cg_grid_group_type);
  }
#if HT_AMD
  SECTION("Base type grid group API test") {
    kernel_func = reinterpret_cast<void* (*)()>(kernel_cg_grid_group_type_via_base_type);
  }
#endif
  SECTION("Public API grid group test") {
    kernel_func = reinterpret_cast<void* (*)()>(kernel_cg_grid_group_type_via_public_api);
  }
  if (kernel_func == nullptr) {
    // No kernel variant was selected for this pass; there is nothing to launch.
    return;
  }
  // Test for block_size in powers of 2
  int max_threads_per_blk = device_properties.maxThreadsPerBlock;
  for (int block_size = 2; block_size <= max_threads_per_blk; block_size = block_size * 2) {
    test_cg_grid_group_type(kernel_func, block_size);
  }
  // Test for random blockSizes, but the sequence is the same every execution
  srand(0);
  for (int i = 0; i < 10; i++) {
    // Sizes are clamped to at least 2: per the original note, the test fails
    // with only 1 thread per block.
    test_cg_grid_group_type(kernel_func, max(2, rand() % max_threads_per_blk));
  }
}
// Runs coop_kernel as a cooperative launch on every device, for each
// combination of loop count and buffer width, and checks the two resulting
// device buffers against the host reference from verify_coop_buffers().
TEST_CASE("Unit_hipCGGridGroupType_DataSharing") {
  const auto device = GENERATE(range(0, HipTest::getDeviceCount()));
  HIP_CHECK(hipSetDevice(device));

  hipDeviceProp_t device_properties;
  HIP_CHECK(hipGetDeviceProperties(&device_properties, device));
  if (!device_properties.cooperativeLaunch) {
    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
    return;
  }

  // Both values are also passed to the kernel by address, so they stay mutable.
  int loops = GENERATE(1, 2, 3, 4);
  int width = GENERATE(512, 1024, 2048, 4096);
  const auto buffer_bytes = sizeof(unsigned int) * width;

  // One warp per block; the block count comes from the occupancy query times
  // the number of multiprocessors, i.e. as many blocks as the device can hold.
  const int warp_size = device_properties.warpSize;
  const int num_sms = device_properties.multiProcessorCount;
  int max_blocks_per_sm;
  HIP_CHECK(
      hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, coop_kernel, warp_size, 0));
  const int num_blocks = max_blocks_per_sm * num_sms;

  hipStream_t stream;
  HIP_CHECK(hipStreamCreate(&stream));

  auto alloc_host = [buffer_bytes]() {
    return reinterpret_cast<unsigned int*>(malloc(buffer_bytes));
  };

  // Host input holds 0, 1, 2, ... and seeds the first device buffer.
  unsigned int* input_buffer = alloc_host();
  for (int idx = 0; idx < width; ++idx) {
    input_buffer[idx] = idx;
  }

  unsigned int* host_mem_1 = alloc_host();
  unsigned int* dev_mem_1;
  HIP_CHECK(hipMalloc(&dev_mem_1, buffer_bytes));
  HIP_CHECK(hipMemcpyAsync(dev_mem_1, input_buffer, buffer_bytes, hipMemcpyHostToDevice, stream));

  // The second device buffer starts out zeroed.
  unsigned int* host_mem_2 = alloc_host();
  unsigned int* dev_mem_2;
  HIP_CHECK(hipMalloc(&dev_mem_2, buffer_bytes));
  HIP_CHECK(hipMemsetAsync(dev_mem_2, 0, buffer_bytes, stream));

  INFO("Launching a cooperative kernel with " << num_blocks << " blocks, each with " << warp_size
                                              << " threads");
  void* coop_params[4] = {reinterpret_cast<void*>(&dev_mem_1), reinterpret_cast<void*>(&dev_mem_2),
                          reinterpret_cast<void*>(&loops), reinterpret_cast<void*>(&width)};
  HIP_CHECK(hipLaunchCooperativeKernel(coop_kernel, num_blocks, warp_size, coop_params, 0, stream));

  // Fetch both buffers on the same stream and wait for everything to finish.
  HIP_CHECK(hipMemcpyAsync(host_mem_1, dev_mem_1, buffer_bytes, hipMemcpyDeviceToHost, stream));
  HIP_CHECK(hipMemcpyAsync(host_mem_2, dev_mem_2, buffer_bytes, hipMemcpyDeviceToHost, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  verify_coop_buffers(input_buffer, host_mem_1, host_mem_2, loops, width);

  HIP_CHECK(hipStreamDestroy(stream));
  HIP_CHECK(hipFree(dev_mem_1));
  HIP_CHECK(hipFree(dev_mem_2));
  free(input_buffer);
  free(host_mem_1);
  free(host_mem_2);
}
// Launches the barrier kernel (test_kernel, or test_kernel_gfx11 when
// IsGfx11() reports true) cooperatively on every device and checks the
// recorded counter values with verify_barrier_buffer().
TEST_CASE("Unit_hipCGGridGroupType_Barrier") {
  const auto device = GENERATE(range(0, HipTest::getDeviceCount()));
  HIP_CHECK(hipSetDevice(device));

  hipDeviceProp_t device_properties;
  HIP_CHECK(hipGetDeviceProperties(&device_properties, device));
  if (!device_properties.cooperativeLaunch) {
    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
    return;
  }

  uint32_t loops = GENERATE(1, 2, 3, 4);
  uint32_t warps = GENERATE(4, 8, 16, 32);
  uint32_t block_size = 1;

  // With block_size == 1 every block is one warp wide and one block is
  // requested per warp.
  int warp_size = device_properties.warpSize;
  int num_sms = device_properties.multiProcessorCount;
  int num_threads_in_block = block_size * warp_size;
  int requested_blocks = warps / block_size;

  auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;

  // The requested grid must not exceed what the occupancy query allows.
  int max_blocks_per_sm;
  HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, test_kernel_used,
                                                         num_threads_in_block, 0));
  const int device_capacity = max_blocks_per_sm * num_sms;
  if (device_capacity < requested_blocks) {
    INFO("Too many blocks requested!");
    REQUIRE(false);
  }

  // Each block writes one value per loop iteration.
  uint32_t total_buffer_len = requested_blocks * loops;
  const auto buffer_bytes = sizeof(unsigned int) * total_buffer_len;

  // Zero-initialized host buffer; it seeds the device output buffer and later
  // receives the kernel's results.
  unsigned int* host_buffer =
      reinterpret_cast<unsigned int*>(calloc(total_buffer_len, sizeof(unsigned int)));
  unsigned int* kernel_buffer;
  HIP_CHECK(hipMalloc(&kernel_buffer, buffer_bytes));
  HIP_CHECK(hipMemcpy(kernel_buffer, host_buffer, buffer_bytes, hipMemcpyHostToDevice));

  // Single counter the kernel increments atomically; starts at zero.
  unsigned int* kernel_atomic;
  HIP_CHECK(hipMalloc(&kernel_atomic, sizeof(unsigned int)));
  HIP_CHECK(hipMemset(kernel_atomic, 0, sizeof(unsigned int)));

  INFO("Launching a cooperative kernel with " << warps << " warps in " << requested_blocks
                                              << " thread blocks");
  void* params[3] = {reinterpret_cast<void*>(&kernel_atomic),
                     reinterpret_cast<void*>(&kernel_buffer), reinterpret_cast<void*>(&loops)};
  HIP_CHECK(hipLaunchCooperativeKernel(test_kernel_used, requested_blocks, num_threads_in_block,
                                       params, 0, 0));

  // Copy the recorded values back and validate them.
  HIP_CHECK(hipMemcpy(host_buffer, kernel_buffer, buffer_bytes, hipMemcpyDeviceToHost));
  verify_barrier_buffer(loops, requested_blocks, host_buffer);

  HIP_CHECK(hipFree(kernel_buffer));
  HIP_CHECK(hipFree(kernel_atomic));
  free(host_buffer);
}
-240
파일 보기
@@ -1,240 +0,0 @@
/*
Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80
* TEST: %t
* HIT_END
*/
#include <hip_test_common.hh>
#include <hip/hip_cooperative_groups.h>
// Comparison helpers built on HIPASSERT. The arguments are parenthesized so
// that operands containing lower-precedence operators (for example `a ? b : c`
// or `x & y`) are compared as a whole.
#define ASSERT_EQUAL(lhs, rhs) HIPASSERT((lhs) == (rhs))
#define ASSERT_LE(lhs, rhs) HIPASSERT((lhs) <= (rhs))
#define ASSERT_GE(lhs, rhs) HIPASSERT((lhs) >= (rhs))
using namespace cooperative_groups;
// Upper bound on the number of devices used; sizes the per-device arrays below.
constexpr int MaxGPUs = 8;
// Records what multi_grid_group reports for every thread and exercises
// grid-level and multi-grid-level synchronization.
//
// Per-thread outputs (indexed by the thread's global index within its grid):
//   numGridsTestD  mg.num_grids()
//   gridRankTestD  mg.grid_rank()
//   sizeTestD      mg.size()
//   thdRankTestD   mg.thread_rank()
//   isValidTestD   mg.is_valid()
// Sync outputs:
//   syncTestD      scratch; every thread stores 1, slot 0 ends up with the sum
//   syncResultD    slot grid_rank + 1 receives this grid's sum, slot 0 receives
//                  the sum over all grids (written by grid rank 0)
static __global__
void kernel_cg_multi_grid_group_type(int* numGridsTestD,
                                     int* gridRankTestD,
                                     int *sizeTestD,
                                     int *thdRankTestD,
                                     int *isValidTestD,
                                     int *syncTestD,
                                     int *syncResultD)
{
  multi_grid_group mg = this_multi_grid();
  const int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
  // Thread 0 of block 0 performs the reductions for its grid.
  const bool isGridLeader = (blockIdx.x == 0) && (threadIdx.x == 0);

  // Property queries
  numGridsTestD[tid] = mg.num_grids();
  gridRankTestD[tid] = mg.grid_rank();
  sizeTestD[tid] = mg.size();
  thdRankTestD[tid] = mg.thread_rank();
  isValidTestD[tid] = mg.is_valid();

  // Sync test: each thread contributes a 1 ...
  syncTestD[tid] = 1;

  // ... which must be visible to the leader after the grid-level sync.
  this_grid().sync();

  if (isGridLeader) {
    const uint threadsInGrid = gridDim.x * blockDim.x;
    int gridSum = syncTestD[0];
    for (uint t = 1; t < threadsInGrid; ++t) {
      gridSum += syncTestD[t];
    }
    syncTestD[0] = gridSum;
    syncResultD[mg.grid_rank() + 1] = gridSum;
  }

  // Multi-grid level sync: every grid's partial sum is in place afterwards.
  mg.sync();

  // The leader of grid rank 0 adds up the per-grid sums.
  if (isGridLeader && mg.grid_rank() == 0) {
    int total = 0;
    for (uint g = 1; g <= mg.num_grids(); ++g) {
      total += syncResultD[g];
    }
    syncResultD[0] = total;
  }
}
static void test_cg_multi_grid_group_type(int blockSize, int nGpu)
{
// Create a stream each device
hipStream_t stream[MaxGPUs];
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipSetDevice(i));
HIPCHECK(hipDeviceSynchronize()); // Make sure work is done on this device
HIPCHECK(hipStreamCreate(&stream[i]));
}
// Allocate host and device memory
int nBytes = sizeof(int) * 2 * blockSize;
int *numGridsTestD[MaxGPUs], *numGridsTestH[MaxGPUs];
int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs];
int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs];
int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs];
int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs];
int *syncTestD[MaxGPUs], *syncResultD;
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipSetDevice(i));
HIPCHECK(hipMalloc(&numGridsTestD[i], nBytes));
HIPCHECK(hipMalloc(&gridRankTestD[i], nBytes));
HIPCHECK(hipMalloc(&sizeTestD[i], nBytes));
HIPCHECK(hipMalloc(&thdRankTestD[i], nBytes));
HIPCHECK(hipMalloc(&isValidTestD[i], nBytes));
HIPCHECK(hipMalloc(&syncTestD[i], nBytes));
HIPCHECK(hipHostMalloc(&numGridsTestH[i], nBytes));
HIPCHECK(hipHostMalloc(&gridRankTestH[i], nBytes));
HIPCHECK(hipHostMalloc(&sizeTestH[i], nBytes));
HIPCHECK(hipHostMalloc(&thdRankTestH[i], nBytes));
HIPCHECK(hipHostMalloc(&isValidTestH[i], nBytes));
if (i == 0) {
HIPCHECK(hipHostMalloc(&syncResultD, sizeof(int) * (nGpu + 1), hipHostMallocCoherent));
}
}
// Launch Kernel
constexpr int NumKernelArgs = 7;
hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu];
void* args[MaxGPUs * NumKernelArgs];
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipSetDevice(i));
args[i * NumKernelArgs] = &numGridsTestD[i];
args[i * NumKernelArgs + 1] = &gridRankTestD[i];
args[i * NumKernelArgs + 2] = &sizeTestD[i];
args[i * NumKernelArgs + 3] = &thdRankTestD[i];
args[i * NumKernelArgs + 4] = &isValidTestD[i];
args[i * NumKernelArgs + 5] = &syncTestD[i];
args[i * NumKernelArgs + 6] = &syncResultD;
launchParamsList[i].func = reinterpret_cast<void*>(kernel_cg_multi_grid_group_type);
launchParamsList[i].gridDim = 2;
launchParamsList[i].blockDim = blockSize;
launchParamsList[i].sharedMem = 0;
launchParamsList[i].stream = stream[i];
launchParamsList[i].args = &args[i * NumKernelArgs];
}
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(launchParamsList, nGpu, 0));
// Copy result from device to host
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipSetDevice(i));
HIPCHECK(hipMemcpy(numGridsTestH[i], numGridsTestD[i], nBytes, hipMemcpyDeviceToHost));
HIPCHECK(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost));
HIPCHECK(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost));
HIPCHECK(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost));
HIPCHECK(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost));
}
// Validate results
int gridsSeen[MaxGPUs];
for (int i = 0; i < nGpu; ++i) {
for (int j = 0; j < 2 * blockSize; ++j) {
ASSERT_EQUAL(numGridsTestH[i][j], nGpu);
ASSERT_GE(gridRankTestH[i][j], 0);
ASSERT_LE(gridRankTestH[i][j], nGpu-1);
ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]);
ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize);
int gridRank = gridRankTestH[i][j];
ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j);
ASSERT_EQUAL(isValidTestH[i][j], 1);
}
ASSERT_EQUAL(syncResultD[i+1], 2 * blockSize);
// Validate uniqueness property of grid rank
gridsSeen[i] = gridRankTestH[i][0];
for (int k = 0; k < i; ++k) {
if (gridsSeen[k] == gridsSeen[i]) {
assert(false && "Grid rank in multi-gpu setup should be unique");
}
}
}
ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize);
// Free host and device memory
delete [] launchParamsList;
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipSetDevice(i));
HIPCHECK(hipFree(numGridsTestD[i]));
HIPCHECK(hipFree(gridRankTestD[i]));
HIPCHECK(hipFree(sizeTestD[i]));
HIPCHECK(hipFree(thdRankTestD[i]));
HIPCHECK(hipFree(isValidTestD[i]));
HIPCHECK(hipFree(syncTestD[i]));
if (i == 0) {
HIPCHECK(hipHostFree(syncResultD));
}
HIPCHECK(hipHostFree(numGridsTestH[i]));
HIPCHECK(hipHostFree(gridRankTestH[i]));
HIPCHECK(hipHostFree(sizeTestH[i]));
HIPCHECK(hipHostFree(thdRankTestH[i]));
HIPCHECK(hipHostFree(isValidTestH[i]));
}
}
// Drives test_cg_multi_grid_group_type over a set of block sizes, using up to
// MaxGPUs devices. The test is skipped as soon as one of those devices does
// not report cooperativeMultiDeviceLaunch.
TEST_CASE("Unit_hipCGMultiGridGroupType") {
  int nGpu = 0;
  HIPCHECK(hipGetDeviceCount(&nGpu));
  nGpu = min(nGpu, MaxGPUs);

  // The usable block size is limited by the smallest maxThreadsPerBlock among
  // the devices taking part.
  int maxThreadsPerBlock = INT_MAX;
  for (int dev = 0; dev < nGpu; ++dev) {
    hipDeviceProp_t deviceProperties;
    HIPCHECK(hipGetDeviceProperties(&deviceProperties, dev));
    if (!deviceProperties.cooperativeMultiDeviceLaunch) {
      HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
      return;
    }
    maxThreadsPerBlock = min(maxThreadsPerBlock, deviceProperties.maxThreadsPerBlock);
  }

  // Powers of two: 2, 4, 8, ... up to the limit.
  int blockSize = 2;
  while (blockSize <= maxThreadsPerBlock) {
    test_cg_multi_grid_group_type(blockSize, nGpu);
    blockSize *= 2;
  }

  // Ten pseudo-random sizes; the fixed seed makes the sequence repeatable.
  srand(0);
  for (int trial = 0; trial < 10; ++trial) {
    // Clamped to a minimum of 2 threads per block.
    const int randomSize = rand() % maxThreadsPerBlock;
    test_cg_multi_grid_group_type(max(2, randomSize), nGpu);
  }
}
@@ -1,234 +0,0 @@
/*
Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -D_CG_ABI_EXPERIMENTAL -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80
* TEST: %t
* HIT_END
*/
#include <hip_test_common.hh>
#include <hip/hip_cooperative_groups.h>
#include <cmath>
#include <cstdlib>
#include <climits>
// Comparison helpers built on HIPASSERT. The arguments are parenthesized so
// that operands containing lower-precedence operators (for example `a ? b : c`
// or `x & y`) are compared as a whole.
#define ASSERT_EQUAL(lhs, rhs) HIPASSERT((lhs) == (rhs))
#define ASSERT_LE(lhs, rhs) HIPASSERT((lhs) <= (rhs))
#define ASSERT_GE(lhs, rhs) HIPASSERT((lhs) >= (rhs))
using namespace cooperative_groups;
// Upper bound on the number of devices used; sizes the per-device arrays below.
constexpr int MaxGPUs = 8;
// Same checks as the multi_grid_group kernel, but the group is accessed
// through the thread_group base type wherever that type offers the call.
//
// Per-thread outputs (indexed by the thread's global index within its grid):
//   sizeTestD      tg.size()
//   gridRankTestD  this_multi_grid().grid_rank()
//   thdRankTestD   tg.thread_rank()
//   isValidTestD   tg.is_valid() on AMD, constant true otherwise
// Sync outputs:
//   syncTestD      scratch; every thread stores 1, slot 0 ends up with the sum
//   syncResultD    slot grid_rank + 1 receives this grid's sum, slot 0 receives
//                  the sum over all grids (written by grid rank 0)
static __global__
void kernel_cg_multi_grid_group_type_via_base_type(int *sizeTestD,
                                                   int* gridRankTestD,
                                                   int *thdRankTestD,
                                                   int *isValidTestD,
                                                   int *syncTestD,
                                                   int *syncResultD)
{
  // Per the original note, this conversion needs _CG_ABI_EXPERIMENTAL on CUDA.
  thread_group tg = this_multi_grid();
  const int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
  // Thread 0 of block 0 performs the reductions for its grid.
  const bool isGridLeader = (blockIdx.x == 0) && (threadIdx.x == 0);

  // Property queries
  sizeTestD[tid] = tg.size();
  gridRankTestD[tid] = this_multi_grid().grid_rank();
  thdRankTestD[tid] = tg.thread_rank();
#ifdef __HIP_PLATFORM_AMD__
  isValidTestD[tid] = tg.is_valid();
#else
  // Cuda has no thread_group.is_valid()
  isValidTestD[tid] = true;
#endif

  // Sync test: each thread contributes a 1 ...
  syncTestD[tid] = 1;

  // ... which must be visible to the leader after the grid-level sync.
  this_grid().sync();

  if (isGridLeader) {
    const uint threadsInGrid = gridDim.x * blockDim.x;
    int gridSum = syncTestD[0];
    for (uint t = 1; t < threadsInGrid; ++t) {
      gridSum += syncTestD[t];
    }
    syncTestD[0] = gridSum;
    syncResultD[this_multi_grid().grid_rank() + 1] = gridSum;
  }

  // Multi-grid level sync, issued through the base type.
  tg.sync();

  // The leader of grid rank 0 adds up the per-grid sums.
  if (isGridLeader && this_multi_grid().grid_rank() == 0) {
    int total = 0;
    for (uint g = 1; g <= this_multi_grid().num_grids(); ++g) {
      total += syncResultD[g];
    }
    syncResultD[0] = total;
  }
}
static void test_cg_multi_grid_group_type_via_base_type(int blockSize, int nGpu)
{
// Create a stream each device
hipStream_t stream[MaxGPUs];
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipSetDevice(i));
HIPCHECK(hipDeviceSynchronize()); // Make sure work is done on this device
HIPCHECK(hipStreamCreate(&stream[i]));
}
// Allocate host and device memory
int nBytes = sizeof(int) * 2 * blockSize;
int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs];
int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs];
int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs];
int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs];
int *syncTestD[MaxGPUs], *syncResultD;
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipSetDevice(i));
HIPCHECK(hipMalloc(&sizeTestD[i], nBytes));
HIPCHECK(hipMalloc(&gridRankTestD[i], nBytes));
HIPCHECK(hipMalloc(&thdRankTestD[i], nBytes));
HIPCHECK(hipMalloc(&isValidTestD[i], nBytes));
HIPCHECK(hipMalloc(&syncTestD[i], nBytes));
HIPCHECK(hipHostMalloc(&sizeTestH[i], nBytes));
HIPCHECK(hipHostMalloc(&gridRankTestH[i], nBytes));
HIPCHECK(hipHostMalloc(&thdRankTestH[i], nBytes));
HIPCHECK(hipHostMalloc(&isValidTestH[i], nBytes));
if (i == 0) {
HIPCHECK(hipHostMalloc(&syncResultD, sizeof(int) * (nGpu + 1), hipHostMallocCoherent));
}
}
// Launch Kernel
constexpr int NumKernelArgs = 6;
hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu];
void* args[MaxGPUs * NumKernelArgs];
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipSetDevice(i));
args[i * NumKernelArgs ] = &sizeTestD[i];
args[i * NumKernelArgs + 1] = &gridRankTestD[i];
args[i * NumKernelArgs + 2] = &thdRankTestD[i];
args[i * NumKernelArgs + 3] = &isValidTestD[i];
args[i * NumKernelArgs + 4] = &syncTestD[i];
args[i * NumKernelArgs + 5] = &syncResultD;
launchParamsList[i].func = reinterpret_cast<void*>(kernel_cg_multi_grid_group_type_via_base_type);
launchParamsList[i].gridDim = 2;
launchParamsList[i].blockDim = blockSize;
launchParamsList[i].sharedMem = 0;
launchParamsList[i].stream = stream[i];
launchParamsList[i].args = &args[i * NumKernelArgs];
}
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(launchParamsList, nGpu, 0));
// Copy result from device to host
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipSetDevice(i));
HIPCHECK(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost));
HIPCHECK(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost));
HIPCHECK(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost));
HIPCHECK(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost));
}
// Validate results
int gridsSeen[MaxGPUs];
for (int i = 0; i < nGpu; ++i) {
for (int j = 0; j < 2 * blockSize; ++j) {
ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize);
ASSERT_GE(gridRankTestH[i][j], 0);
ASSERT_LE(gridRankTestH[i][j], nGpu-1);
ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]);
int gridRank = gridRankTestH[i][j];
ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j);
ASSERT_EQUAL(isValidTestH[i][j], 1);
}
ASSERT_EQUAL(syncResultD[i+1], 2 * blockSize);
// Validate uniqueness property of grid rank
gridsSeen[i] = gridRankTestH[i][0];
for (int k = 0; k < i; ++k) {
if (gridsSeen[k] == gridsSeen[i]) {
assert (false && "Grid rank in multi-gpu setup should be unique");
}
}
}
ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize);
// Free host and device memory
delete [] launchParamsList;
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipSetDevice(i));
HIPCHECK(hipFree(sizeTestD[i]));
HIPCHECK(hipFree(gridRankTestD[i]));
HIPCHECK(hipFree(thdRankTestD[i]));
HIPCHECK(hipFree(isValidTestD[i]));
HIPCHECK(hipFree(syncTestD[i]));
if (i == 0)
HIPCHECK(hipHostFree(syncResultD));
HIPCHECK(hipHostFree(sizeTestH[i]));
HIPCHECK(hipHostFree(gridRankTestH[i]));
HIPCHECK(hipHostFree(thdRankTestH[i]));
HIPCHECK(hipHostFree(isValidTestH[i]));
}
}
// Drives test_cg_multi_grid_group_type_via_base_type over a set of block
// sizes, using up to MaxGPUs devices. The test is skipped as soon as one of
// those devices does not report cooperativeMultiDeviceLaunch.
TEST_CASE("Unit_hipCGMultiGridGroupType_BaseType") {
  int nGpu = 0;
  HIPCHECK(hipGetDeviceCount(&nGpu));
  nGpu = min(nGpu, MaxGPUs);

  // The usable block size is limited by the smallest maxThreadsPerBlock among
  // the devices taking part.
  int maxThreadsPerBlock = INT_MAX;
  for (int dev = 0; dev < nGpu; ++dev) {
    hipDeviceProp_t deviceProperties;
    HIPCHECK(hipGetDeviceProperties(&deviceProperties, dev));
    if (!deviceProperties.cooperativeMultiDeviceLaunch) {
      HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
      return;
    }
    maxThreadsPerBlock = min(maxThreadsPerBlock, deviceProperties.maxThreadsPerBlock);
  }

  // Powers of two: 2, 4, 8, ... up to the limit.
  int blockSize = 2;
  while (blockSize <= maxThreadsPerBlock) {
    test_cg_multi_grid_group_type_via_base_type(blockSize, nGpu);
    blockSize *= 2;
  }

  // Ten pseudo-random sizes; the fixed seed makes the sequence repeatable.
  srand(0);
  for (int trial = 0; trial < 10; ++trial) {
    // Clamped to a minimum of 2 threads per block.
    const int randomSize = rand() % maxThreadsPerBlock;
    test_cg_multi_grid_group_type_via_base_type(max(2, randomSize), nGpu);
  }
}
@@ -1,230 +0,0 @@
/*
Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80
* TEST: %t
* HIT_END
*/
#include <hip_test_common.hh>
#include <hip/hip_cooperative_groups.h>
#include <cmath>
#include <cstdlib>
#include <climits>
// Comparison helpers built on HIPASSERT. The arguments are parenthesized so
// that operands containing lower-precedence operators (for example `a ? b : c`
// or `x & y`) are compared as a whole.
#define ASSERT_EQUAL(lhs, rhs) HIPASSERT((lhs) == (rhs))
#define ASSERT_LE(lhs, rhs) HIPASSERT((lhs) <= (rhs))
#define ASSERT_GE(lhs, rhs) HIPASSERT((lhs) >= (rhs))
using namespace cooperative_groups;
// Upper bound on the number of devices used; sizes the per-device arrays below.
constexpr int MaxGPUs = 8;
// Same checks as the multi_grid_group kernel, but size, thread rank and
// synchronization go through the free functions group_size(), thread_rank()
// and sync() instead of member calls.
//
// Per-thread outputs (indexed by the thread's global index within its grid):
//   sizeTestD      group_size(mg)
//   gridRankTestD  this_multi_grid().grid_rank()
//   thdRankTestD   thread_rank(mg)
//   isValidTestD   mg.is_valid()
// Sync outputs:
//   syncTestD      scratch; every thread stores 1, slot 0 ends up with the sum
//   syncResultD    slot grid_rank + 1 receives this grid's sum, slot 0 receives
//                  the sum over all grids (written by grid rank 0)
static __global__
void kernel_cg_multi_grid_group_type_via_public_api(int *sizeTestD,
                                                    int* gridRankTestD,
                                                    int *thdRankTestD,
                                                    int *isValidTestD,
                                                    int *syncTestD,
                                                    int *syncResultD)
{
  multi_grid_group mg = this_multi_grid();
  const int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
  // Thread 0 of block 0 performs the reductions for its grid.
  const bool isGridLeader = (blockIdx.x == 0) && (threadIdx.x == 0);

  // Property queries
  sizeTestD[tid] = group_size(mg);
  gridRankTestD[tid] = this_multi_grid().grid_rank();
  thdRankTestD[tid] = thread_rank(mg);
  isValidTestD[tid] = mg.is_valid();

  // Sync test: each thread contributes a 1 ...
  syncTestD[tid] = 1;

  // ... which must be visible to the leader after the grid-level sync.
  sync(this_grid());

  if (isGridLeader) {
    const uint threadsInGrid = gridDim.x * blockDim.x;
    int gridSum = syncTestD[0];
    for (uint t = 1; t < threadsInGrid; ++t) {
      gridSum += syncTestD[t];
    }
    syncTestD[0] = gridSum;
    syncResultD[this_multi_grid().grid_rank() + 1] = gridSum;
  }

  // Multi-grid level sync via the public api.
  sync(mg);

  // The leader of grid rank 0 adds up the per-grid sums.
  if (isGridLeader && this_multi_grid().grid_rank() == 0) {
    int total = 0;
    for (uint g = 1; g <= this_multi_grid().num_grids(); ++g) {
      total += syncResultD[g];
    }
    syncResultD[0] = total;
  }
}
static void test_cg_multi_grid_group_type_via_public_api(int blockSize, int nGpu)
{
// Create a stream each device
hipStream_t stream[MaxGPUs];
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipSetDevice(i));
HIPCHECK(hipDeviceSynchronize()); // Make sure work is done on this device
HIPCHECK(hipStreamCreate(&stream[i]));
}
// Allocate host and device memory
int nBytes = sizeof(int) * 2 * blockSize;
int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs];
int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs];
int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs];
int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs];
int *syncTestD[MaxGPUs], *syncResultD;
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipSetDevice(i));
HIPCHECK(hipMalloc(&sizeTestD[i], nBytes));
HIPCHECK(hipMalloc(&gridRankTestD[i], nBytes));
HIPCHECK(hipMalloc(&thdRankTestD[i], nBytes));
HIPCHECK(hipMalloc(&isValidTestD[i], nBytes));
HIPCHECK(hipMalloc(&syncTestD[i], nBytes));
HIPCHECK(hipHostMalloc(&sizeTestH[i], nBytes));
HIPCHECK(hipHostMalloc(&gridRankTestH[i], nBytes));
HIPCHECK(hipHostMalloc(&thdRankTestH[i], nBytes));
HIPCHECK(hipHostMalloc(&isValidTestH[i], nBytes));
if (i == 0) {
HIPCHECK(hipHostMalloc(&syncResultD, sizeof(int) * (nGpu + 1), hipHostMallocCoherent));
}
}
// Launch Kernel
constexpr int NumKernelArgs = 6;
hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu];
void* args[MaxGPUs * NumKernelArgs];
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipSetDevice(i));
args[i * NumKernelArgs ] = &sizeTestD[i];
args[i * NumKernelArgs + 1] = &gridRankTestD[i];
args[i * NumKernelArgs + 2] = &thdRankTestD[i];
args[i * NumKernelArgs + 3] = &isValidTestD[i];
args[i * NumKernelArgs + 4] = &syncTestD[i];
args[i * NumKernelArgs + 5] = &syncResultD;
launchParamsList[i].func = reinterpret_cast<void*>(kernel_cg_multi_grid_group_type_via_public_api);
launchParamsList[i].gridDim = 2;
launchParamsList[i].blockDim = blockSize;
launchParamsList[i].sharedMem = 0;
launchParamsList[i].stream = stream[i];
launchParamsList[i].args = &args[i * NumKernelArgs];
}
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(launchParamsList, nGpu, 0));
// Copy result from device to host
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipSetDevice(i));
HIPCHECK(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost));
HIPCHECK(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost));
HIPCHECK(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost));
HIPCHECK(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost));
}
// Validate results
int gridsSeen[MaxGPUs];
for (int i = 0; i < nGpu; ++i) {
for (int j = 0; j < 2 * blockSize; ++j) {
ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize);
ASSERT_GE(gridRankTestH[i][j], 0);
ASSERT_LE(gridRankTestH[i][j], nGpu-1);
ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]);
int gridRank = gridRankTestH[i][j];
ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j);
ASSERT_EQUAL(isValidTestH[i][j], 1);
}
ASSERT_EQUAL(syncResultD[i+1], 2 * blockSize);
// Validate uniqueness property of grid rank
gridsSeen[i] = gridRankTestH[i][0];
for (int k = 0; k < i; ++k) {
if (gridsSeen[k] == gridsSeen[i]) {
assert (false && "Grid rank in multi-gpu setup should be unique");
}
}
}
ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize);
// Free host and device memory
delete [] launchParamsList;
for (int i = 0; i < nGpu; i++) {
HIPCHECK(hipSetDevice(i));
HIPCHECK(hipFree(sizeTestD[i]));
HIPCHECK(hipFree(gridRankTestD[i]));
HIPCHECK(hipFree(thdRankTestD[i]));
HIPCHECK(hipFree(isValidTestD[i]));
HIPCHECK(hipFree(syncTestD[i]));
if (i == 0)
HIPCHECK(hipHostFree(syncResultD));
HIPCHECK(hipHostFree(sizeTestH[i]));
HIPCHECK(hipHostFree(gridRankTestH[i]));
HIPCHECK(hipHostFree(thdRankTestH[i]));
HIPCHECK(hipHostFree(isValidTestH[i]));
}
}
// Drives test_cg_multi_grid_group_type_via_public_api over a set of block
// sizes, using up to MaxGPUs devices. The test is skipped as soon as one of
// those devices does not report cooperativeMultiDeviceLaunch.
TEST_CASE("Unit_hipCGMultiGridGroupType_PublicApi") {
  int nGpu = 0;
  HIPCHECK(hipGetDeviceCount(&nGpu));
  nGpu = min(nGpu, MaxGPUs);

  // The usable block size is limited by the smallest maxThreadsPerBlock among
  // the devices taking part.
  int maxThreadsPerBlock = INT_MAX;
  for (int dev = 0; dev < nGpu; ++dev) {
    hipDeviceProp_t deviceProperties;
    HIPCHECK(hipGetDeviceProperties(&deviceProperties, dev));
    if (!deviceProperties.cooperativeMultiDeviceLaunch) {
      HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
      return;
    }
    maxThreadsPerBlock = min(maxThreadsPerBlock, deviceProperties.maxThreadsPerBlock);
  }

  // Powers of two: 2, 4, 8, ... up to the limit.
  int blockSize = 2;
  while (blockSize <= maxThreadsPerBlock) {
    test_cg_multi_grid_group_type_via_public_api(blockSize, nGpu);
    blockSize *= 2;
  }

  // Ten pseudo-random sizes; the fixed seed makes the sequence repeatable.
  srand(0);
  for (int trial = 0; trial < 10; ++trial) {
    // Clamped to a minimum of 2 threads per block.
    const int randomSize = rand() % maxThreadsPerBlock;
    test_cg_multi_grid_group_type_via_public_api(max(2, randomSize), nGpu);
  }
}
+638
파일 보기
@@ -0,0 +1,638 @@
/*
Copyright (c) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip_test_common.hh>
#include <hip/hip_cooperative_groups.h>
#include "hip_cg_common.hh"
namespace cg = cooperative_groups;
// Kernel exercising the member functions of cg::multi_grid_group.
//
// Every thread writes, at its global index within the current grid, the values
// returned by num_grids(), grid_rank(), size(), thread_rank() and is_valid().
// The sync test then counts the threads of each grid into
// sync_result[grid_rank + 1] and, after a multi-grid barrier, grid 0 sums those
// per-grid counts into sync_result[0].
static __global__ void kernel_cg_multi_grid_group_type(int* grid_rank_dev, int* size_dev,
int* thd_rank_dev, int* is_valid_dev,
int* sync_dev, int* sync_result,
int* num_grids_dev) {
cg::multi_grid_group mg = cg::this_multi_grid();
// Global thread index within this grid (1-D launch)
int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;
// Test num_grids
num_grids_dev[gIdx] = mg.num_grids();
// Test grid_rank
grid_rank_dev[gIdx] = mg.grid_rank();
// Test size
size_dev[gIdx] = mg.size();
// Test thread_rank
thd_rank_dev[gIdx] = mg.thread_rank();
// Test is_valid
is_valid_dev[gIdx] = mg.is_valid();
// Test sync
//
// Each thread assigns 1 to its respective location
sync_dev[gIdx] = 1;
// Grid level sync
cg::this_grid().sync();
// Thread 0 from work-group 0 of current grid (gpu) does grid level reduction
if (blockIdx.x == 0 && threadIdx.x == 0) {
for (uint i = 1; i < gridDim.x * blockDim.x; ++i) {
sync_dev[0] += sync_dev[i];
}
// Slot 0 is reserved for the overall total, so per-grid counts start at index 1
sync_result[mg.grid_rank() + 1] = sync_dev[0];
}
// multi-grid level sync
mg.sync();
// grid (gpu) 0 does final reduction across all grids (gpus)
if (mg.grid_rank() == 0 && blockIdx.x == 0 && threadIdx.x == 0) {
sync_result[0] = 0;
for (uint i = 1; i <= mg.num_grids(); ++i) {
sync_result[0] += sync_result[i];
}
}
}
// Kernel exercising a multi-grid group through the cg::thread_group base type.
//
// size(), thread_rank(), is_valid() (AMD platform only) and sync() are called on
// the base-type handle `tg`; grid_rank() and num_grids() are still obtained from
// cg::this_multi_grid() directly. The sync test counts the threads of each grid
// into sync_result[grid_rank + 1] and grid 0 sums those counts into
// sync_result[0] after the multi-grid barrier.
static __global__ void kernel_cg_multi_grid_group_type_via_base_type(
int* grid_rank_dev, int* size_dev, int* thd_rank_dev, int* is_valid_dev, int* sync_dev,
int* sync_result) {
cg::thread_group tg =
cg::this_multi_grid(); // This can work if _CG_ABI_EXPERIMENTAL defined on Cuda
// Global thread index within this grid (1-D launch)
int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;
// Test size
size_dev[gIdx] = tg.size();
// Record grid_rank alongside the thread_rank under test
grid_rank_dev[gIdx] = cg::this_multi_grid().grid_rank();
thd_rank_dev[gIdx] = tg.thread_rank();
// Test is_valid
#ifdef __HIP_PLATFORM_AMD__
is_valid_dev[gIdx] = tg.is_valid();
#else
// Cuda has no thread_group.is_valid()
is_valid_dev[gIdx] = true;
#endif
// Test sync
//
// Each thread assigns 1 to its respective location
sync_dev[gIdx] = 1;
// Grid level sync
cg::this_grid().sync();
// Thread 0 from work-group 0 of current grid (gpu) does grid level reduction
if (blockIdx.x == 0 && threadIdx.x == 0) {
for (uint i = 1; i < gridDim.x * blockDim.x; ++i) {
sync_dev[0] += sync_dev[i];
}
// Slot 0 is reserved for the overall total, so per-grid counts start at index 1
sync_result[cg::this_multi_grid().grid_rank() + 1] = sync_dev[0];
}
// multi-grid level sync
tg.sync();
// grid (gpu) 0 does final reduction across all grids (gpus)
if (cg::this_multi_grid().grid_rank() == 0 && blockIdx.x == 0 && threadIdx.x == 0) {
sync_result[0] = 0;
for (uint i = 1; i <= cg::this_multi_grid().num_grids(); ++i) {
sync_result[0] += sync_result[i];
}
}
}
// Kernel exercising a multi-grid group through the free-function API
// (cg::group_size, cg::thread_rank, cg::sync) instead of member functions.
//
// The sync test counts the threads of each grid into sync_result[grid_rank + 1]
// and grid 0 sums those counts into sync_result[0] after the multi-grid barrier.
static __global__ void kernel_cg_multi_grid_group_type_via_public_api(
int* grid_rank_dev, int* size_dev, int* thd_rank_dev, int* is_valid_dev, int* sync_dev,
int* sync_result) {
cg::multi_grid_group mg = cg::this_multi_grid();
// Global thread index within this grid (1-D launch)
int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;
// Test group_size api
size_dev[gIdx] = cg::group_size(mg);
// Record grid_rank alongside the thread_rank api under test
grid_rank_dev[gIdx] = cg::this_multi_grid().grid_rank();
thd_rank_dev[gIdx] = cg::thread_rank(mg);
// Test is_valid api
is_valid_dev[gIdx] = mg.is_valid();
// Test sync api
//
// Each thread assigns 1 to its respective location
sync_dev[gIdx] = 1;
// Grid level sync
cg::sync(cg::this_grid());
// Thread 0 from work-group 0 of current grid (gpu) does grid level reduction
if (blockIdx.x == 0 && threadIdx.x == 0) {
for (uint i = 1; i < gridDim.x * blockDim.x; ++i) {
sync_dev[0] += sync_dev[i];
}
// Slot 0 is reserved for the overall total, so per-grid counts start at index 1
sync_result[cg::this_multi_grid().grid_rank() + 1] = sync_dev[0];
}
// multi-grid level sync via public api
cg::sync(mg);
// grid (gpu) 0 does final reduction across all grids (gpus)
if (cg::this_multi_grid().grid_rank() == 0 && blockIdx.x == 0 && threadIdx.x == 0) {
sync_result[0] = 0;
for (uint i = 1; i <= cg::this_multi_grid().num_grids(); ++i) {
sync_result[0] += sync_result[i];
}
}
}
// Barrier stress kernel for grid-level and multi-grid-level synchronization.
//
// For each of `loops` iterations:
//   1. the last thread of the grid busy-waits on clock64(), then thread 0 of
//      every block stores the value returned by atomicInc(atomic_val) into
//      array[offset], followed by grid.sync();
//   2. the last thread of the whole multi-grid busy-waits, then the last thread
//      of each grid updates one entry of global_array, followed by mgrid.sync().
// `offset` starts at blockIdx.x and advances by gridDim.x per iteration, so
// `array` receives one value per block per iteration.
static __global__ void test_kernel(unsigned int* atomic_val, unsigned int* global_array,
unsigned int* array, uint32_t loops) {
cg::grid_group grid = cg::this_grid();
cg::multi_grid_group mgrid = cg::this_multi_grid();
unsigned rank = grid.thread_rank();
unsigned global_rank = mgrid.thread_rank();
int offset = blockIdx.x;
for (int i = 0; i < loops; i++) {
// Make the last thread run way behind everyone else.
// If the grid barrier below fails, then the other threads may hit the
// atomicInc instruction many times before the last thread ever gets
// to it.
// As such, without the barrier, the last array entry will eventually
// contain a very large value, defined by however many times the other
// wavefronts make it through this loop.
// If the barrier works, then it will likely contain some number
// near "total number of blocks". It will be the last wavefront to
// reach the atomicInc, but everyone will have only hit the atomic once.
if (rank == (grid.size() - 1)) {
long long time_diff = 0;
long long last_clock = clock64();
do {
long long cur_clock = clock64();
if (cur_clock > last_clock) {
time_diff += (cur_clock - last_clock);
}
// If it rolls over, we don't know how much to add to catch up.
// So just ignore those slipped cycles.
last_clock = cur_clock;
} while (time_diff < 1000000);
}
if (threadIdx.x == 0) {
array[offset] = atomicInc(atomic_val, UINT_MAX);
}
grid.sync();
// Make the last thread in the entire multi-grid run way behind
// everyone else.
// If the mgrid barrier below fails, then the two global_array entries
// will end up being out of sync, because the intermingling of adds
// and multiplies will not be aligned between the two GPUs.
if (global_rank == (mgrid.size() - 1)) {
long long time_diff = 0;
long long last_clock = clock64();
do {
long long cur_clock = clock64();
if (cur_clock > last_clock) {
time_diff += (cur_clock - last_clock);
}
// If it rolls over, we don't know how much to add to catch up.
// So just ignore those slipped cycles.
last_clock = cur_clock;
} while (time_diff < 1000000);
}
// During even iterations, add into your own array entry
// During odd iterations, add into your partner's array entry
// (the even/odd description holds when num_grids() == 2; in general the add
// happens when i is a multiple of num_grids() and the multiply otherwise)
unsigned grid_rank = mgrid.grid_rank();
unsigned inter_gpu_offset = (grid_rank + i) % mgrid.num_grids();
if (rank == (grid.size() - 1)) {
if (i % mgrid.num_grids() == 0) {
global_array[grid_rank] += 2;
} else {
global_array[inter_gpu_offset] *= 2;
}
}
mgrid.sync();
// Move on to this block's slot for the next iteration
offset += gridDim.x;
}
}
// Variant of the barrier stress kernel that measures the delay with
// wall_clock64() instead of clock64(). The body is only compiled when HT_AMD is
// set; otherwise the kernel is empty.
//
// For each of `loops` iterations:
//   1. the last thread of the grid busy-waits, then thread 0 of every block
//      stores the value returned by atomicInc(atomic_val) into array[offset],
//      followed by grid.sync();
//   2. the last thread of the whole multi-grid busy-waits, then the last thread
//      of each grid updates one entry of global_array, followed by mgrid.sync().
__global__ void test_kernel_gfx11(unsigned int* atomic_val, unsigned int* global_array,
unsigned int* array, uint32_t loops) {
#if HT_AMD
cg::grid_group grid = cg::this_grid();
cg::multi_grid_group mgrid = cg::this_multi_grid();
unsigned rank = grid.thread_rank();
unsigned global_rank = mgrid.thread_rank();
int offset = blockIdx.x;
for (int i = 0; i < loops; i++) {
// Make the last thread run way behind everyone else.
// If the grid barrier below fails, then the other threads may hit the
// atomicInc instruction many times before the last thread ever gets
// to it.
// As such, without the barrier, the last array entry will eventually
// contain a very large value, defined by however many times the other
// wavefronts make it through this loop.
// If the barrier works, then it will likely contain some number
// near "total number of blocks". It will be the last wavefront to
// reach the atomicInc, but everyone will have only hit the atomic once.
if (rank == (grid.size() - 1)) {
long long time_diff = 0;
long long last_clock = wall_clock64();
do {
long long cur_clock = wall_clock64();
if (cur_clock > last_clock) {
time_diff += (cur_clock - last_clock);
}
// If it rolls over, we don't know how much to add to catch up.
// So just ignore those slipped cycles.
last_clock = cur_clock;
} while (time_diff < 1000000);
}
if (threadIdx.x == 0) {
array[offset] = atomicInc(atomic_val, UINT_MAX);
}
grid.sync();
// Make the last thread in the entire multi-grid run way behind
// everyone else.
// If the mgrid barrier below fails, then the two global_array entries
// will end up being out of sync, because the intermingling of adds
// and multiplies will not be aligned between the two GPUs.
if (global_rank == (mgrid.size() - 1)) {
long long time_diff = 0;
long long last_clock = wall_clock64();
do {
long long cur_clock = wall_clock64();
if (cur_clock > last_clock) {
time_diff += (cur_clock - last_clock);
}
// If it rolls over, we don't know how much to add to catch up.
// So just ignore those slipped cycles.
last_clock = cur_clock;
} while (time_diff < 1000000);
}
// During even iterations, add into your own array entry
// During odd iterations, add into your partner's array entry
// (the even/odd description holds when num_grids() == 2; in general the add
// happens when i is a multiple of num_grids() and the multiply otherwise)
unsigned grid_rank = mgrid.grid_rank();
unsigned inter_gpu_offset = (grid_rank + i) % mgrid.num_grids();
if (rank == (grid.size() - 1)) {
if (i % mgrid.num_grids() == 0) {
global_array[grid_rank] += 2;
} else {
global_array[inter_gpu_offset] *= 2;
}
}
mgrid.sync();
// Move on to this block's slot for the next iteration
offset += gridDim.x;
}
#endif
}
// Checks the values recorded by the barrier kernel for one device.
//
// `host_buffer` holds `loops` consecutive groups of `warps` entries. Every entry
// of group k (0-based) must not exceed (k + 1) * warps * num_devs.
static void verify_barrier_buffer(unsigned int loops, unsigned int warps, unsigned int* host_buffer,
                                  unsigned int num_devs) {
  const unsigned int per_loop_increment = warps * num_devs;
  unsigned int upper_bound = 0;
  const unsigned int* entry = host_buffer;
  for (unsigned int loop = 0; loop != loops; ++loop) {
    // The bound grows by one full round of increments per completed loop.
    upper_bound += per_loop_increment;
    for (unsigned int remaining = warps; remaining != 0; --remaining, ++entry) {
      REQUIRE(*entry <= upper_bound);
    }
  }
}
// Checks one entry of the array shared between the devices by the barrier kernel.
//
// The kernel adds 2 to an entry on iterations that are a multiple of the number
// of grids and doubles it on every other iteration, so the expected value is
// obtained by replaying that sequence starting from 0.
//
// loops      number of iterations the kernel executed
// array_val  value read back from the shared array
// num_devs   number of grids (devices) that took part in the launch; must be
//            non-zero. Defaults to 2, which reproduces the original
//            add-on-even / multiply-on-odd pattern.
static void verify_multi_gpu_buffer(unsigned int loops, unsigned int array_val,
                                    unsigned int num_devs = 2) {
  REQUIRE(num_devs > 0);
  unsigned int desired_val = 0;
  // Unsigned index: `loops` is unsigned, so avoid a signed/unsigned comparison.
  for (unsigned int i = 0; i < loops; ++i) {
    if (i % num_devs == 0) {
      desired_val += 2;
    } else {
      desired_val *= 2;
    }
  }
  REQUIRE(array_val == desired_val);
}
template <typename F>
static void test_cg_multi_grid_group_type(F kernel_func, int num_devices, int block_size,
bool specific_api_test) {
// Create a stream each device
hipStream_t stream[MaxGPUs];
for (int i = 0; i < num_devices; i++) {
HIP_CHECK(hipSetDevice(i));
HIP_CHECK(hipDeviceSynchronize()); // Make sure work is done on this device
HIP_CHECK(hipStreamCreate(&stream[i]));
}
// Allocate host and device memory
int num_bytes = sizeof(int) * 2 * block_size;
int *num_grids_dev[MaxGPUs], *num_grids_host[MaxGPUs];
int *grid_rank_dev[MaxGPUs], *grid_rank_host[MaxGPUs];
int *size_dev[MaxGPUs], *size_host[MaxGPUs];
int *thd_rank_dev[MaxGPUs], *thd_rank_host[MaxGPUs];
int *is_valid_dev[MaxGPUs], *is_valid_host[MaxGPUs];
int *sync_dev[MaxGPUs], *sync_result;
for (int i = 0; i < num_devices; i++) {
HIP_CHECK(hipSetDevice(i));
if (specific_api_test) {
HIP_CHECK(hipMalloc(&num_grids_dev[i], num_bytes));
HIP_CHECK(hipHostMalloc(&num_grids_host[i], num_bytes));
}
HIP_CHECK(hipMalloc(&grid_rank_dev[i], num_bytes));
HIP_CHECK(hipMalloc(&size_dev[i], num_bytes));
HIP_CHECK(hipMalloc(&thd_rank_dev[i], num_bytes));
HIP_CHECK(hipMalloc(&is_valid_dev[i], num_bytes));
HIP_CHECK(hipMalloc(&sync_dev[i], num_bytes));
HIP_CHECK(hipHostMalloc(&grid_rank_host[i], num_bytes));
HIP_CHECK(hipHostMalloc(&size_host[i], num_bytes));
HIP_CHECK(hipHostMalloc(&thd_rank_host[i], num_bytes));
HIP_CHECK(hipHostMalloc(&is_valid_host[i], num_bytes));
if (i == 0) {
HIP_CHECK(
hipHostMalloc(&sync_result, sizeof(int) * (num_devices + 1), hipHostMallocCoherent));
}
}
// Launch Kernel
int NumKernelArgs = 6;
if (specific_api_test) {
NumKernelArgs = 7;
}
hipLaunchParams* launchParamsList = new hipLaunchParams[num_devices];
std::vector<void*> args(MaxGPUs * NumKernelArgs);
for (int i = 0; i < num_devices; i++) {
HIP_CHECK(hipSetDevice(i));
args[i * NumKernelArgs] = &grid_rank_dev[i];
args[i * NumKernelArgs + 1] = &size_dev[i];
args[i * NumKernelArgs + 2] = &thd_rank_dev[i];
args[i * NumKernelArgs + 3] = &is_valid_dev[i];
args[i * NumKernelArgs + 4] = &sync_dev[i];
args[i * NumKernelArgs + 5] = &sync_result;
if (specific_api_test) {
args[i * NumKernelArgs + 6] = &num_grids_dev[i];
}
launchParamsList[i].func = reinterpret_cast<void*>(kernel_func);
launchParamsList[i].gridDim = 2;
launchParamsList[i].blockDim = block_size;
launchParamsList[i].sharedMem = 0;
launchParamsList[i].stream = stream[i];
launchParamsList[i].args = &args[i * NumKernelArgs];
}
HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(launchParamsList, num_devices, 0));
// Copy result from device to host
for (int i = 0; i < num_devices; i++) {
HIP_CHECK(hipSetDevice(i));
if (specific_api_test) {
HIP_CHECK(hipMemcpy(num_grids_host[i], num_grids_dev[i], num_bytes, hipMemcpyDeviceToHost));
}
HIP_CHECK(hipMemcpy(grid_rank_host[i], grid_rank_dev[i], num_bytes, hipMemcpyDeviceToHost));
HIP_CHECK(hipMemcpy(size_host[i], size_dev[i], num_bytes, hipMemcpyDeviceToHost));
HIP_CHECK(hipMemcpy(thd_rank_host[i], thd_rank_dev[i], num_bytes, hipMemcpyDeviceToHost));
HIP_CHECK(hipMemcpy(is_valid_host[i], is_valid_dev[i], num_bytes, hipMemcpyDeviceToHost));
}
// Validate results
int grids_seen[MaxGPUs];
for (int i = 0; i < num_devices; ++i) {
for (int j = 0; j < 2 * block_size; ++j) {
if (specific_api_test) {
ASSERT_EQUAL(num_grids_host[i][j], num_devices);
}
ASSERT_GE(grid_rank_host[i][j], 0);
ASSERT_LE(grid_rank_host[i][j], num_devices - 1);
ASSERT_EQUAL(grid_rank_host[i][j], grid_rank_host[i][0]);
ASSERT_EQUAL(size_host[i][j], num_devices * 2 * block_size);
int gridRank = grid_rank_host[i][j];
ASSERT_EQUAL(thd_rank_host[i][j], (gridRank * 2 * block_size) + j);
ASSERT_EQUAL(is_valid_host[i][j], 1);
}
ASSERT_EQUAL(sync_result[i + 1], 2 * block_size);
// Validate uniqueness property of grid rank
grids_seen[i] = grid_rank_host[i][0];
for (int k = 0; k < i; ++k) {
INFO("Grid rank in multi-gpu setup should be unique");
REQUIRE(grids_seen[k] != grids_seen[i]);
}
}
ASSERT_EQUAL(sync_result[0], num_devices * 2 * block_size);
// Free host and device memory
delete[] launchParamsList;
for (int i = 0; i < num_devices; i++) {
HIP_CHECK(hipSetDevice(i));
if (specific_api_test) {
HIP_CHECK(hipFree(num_grids_dev[i]));
HIP_CHECK(hipHostFree(num_grids_host[i]));
}
HIP_CHECK(hipFree(grid_rank_dev[i]));
HIP_CHECK(hipFree(size_dev[i]));
HIP_CHECK(hipFree(thd_rank_dev[i]));
HIP_CHECK(hipFree(is_valid_dev[i]));
HIP_CHECK(hipFree(sync_dev[i]));
if (i == 0) {
HIP_CHECK(hipHostFree(sync_result));
}
HIP_CHECK(hipHostFree(grid_rank_host[i]));
HIP_CHECK(hipHostFree(size_host[i]));
HIP_CHECK(hipHostFree(thd_rank_host[i]));
HIP_CHECK(hipHostFree(is_valid_host[i]));
}
}
// Runs the multi-grid group checks for each kernel flavour (member functions,
// base type, public free-function API) over power-of-two and pseudo-random
// block sizes.
TEST_CASE("Unit_hipCGMultiGridGroupType_Basic") {
  int num_devices = 0;
  HIP_CHECK(hipGetDeviceCount(&num_devices));
  num_devices = min(num_devices, MaxGPUs);

  // Set `max_threads_per_blk` by taking minimum among all available devices
  int max_threads_per_blk = INT_MAX;
  hipDeviceProp_t device_properties;
  for (int i = 0; i < num_devices; i++) {
    HIP_CHECK(hipGetDeviceProperties(&device_properties, i));
    if (!device_properties.cooperativeMultiDeviceLaunch) {
      HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
      return;
    }
    max_threads_per_blk = min(max_threads_per_blk, device_properties.maxThreadsPerBlock);
  }

  // Each SECTION selects one kernel. Start from nullptr so a run that enters no
  // SECTION never launches through an indeterminate pointer.
  void* (*kernel_func)(void) = nullptr;
  bool specific_api_test = false;
  SECTION("Default multi grid group API test") {
    kernel_func = reinterpret_cast<void* (*)()>(kernel_cg_multi_grid_group_type);
    // Only this kernel takes the extra num_grids output buffer
    specific_api_test = true;
  }
  SECTION("Base type multi grid group API test") {
    kernel_func = reinterpret_cast<void* (*)()>(kernel_cg_multi_grid_group_type_via_base_type);
  }
  SECTION("Public API multi grid group test") {
    kernel_func = reinterpret_cast<void* (*)()>(kernel_cg_multi_grid_group_type_via_public_api);
  }
  if (kernel_func == nullptr) {
    // No SECTION was selected for this run, so there is nothing to launch.
    return;
  }

  // Test for blockSizes in powers of 2
  for (int block_size = 2; block_size <= max_threads_per_blk; block_size = block_size * 2) {
    test_cg_multi_grid_group_type(kernel_func, num_devices, block_size, specific_api_test);
  }

  // Test for random blockSizes, but the sequence is the same every execution
  srand(0);
  for (int i = 0; i < 10; i++) {
    // Test fails for 0 thread per block
    test_cg_multi_grid_group_type(kernel_func, num_devices, max(2, rand() % max_threads_per_blk),
                                  specific_api_test);
  }
}
// Stress test for grid and multi-grid barriers: launches the barrier kernel
// cooperatively on every device and checks that the values it recorded are
// consistent with both barriers having held on each iteration.
TEST_CASE("Unit_hipCGMultiGridGroupType_Barrier") {
  int num_devices = 0;
  uint32_t loops = GENERATE(1, 2, 3, 4);
  uint32_t warps = GENERATE(4, 8, 16, 32);
  uint32_t block_size = 1;

  HIP_CHECK(hipGetDeviceCount(&num_devices));
  if (num_devices < 2) {
    HipTest::HIP_SKIP_TEST("Device number is < 2");
    return;
  }

  std::vector<hipDeviceProp_t> device_properties(num_devices);
  for (int i = 0; i < num_devices; i++) {
    HIP_CHECK(hipGetDeviceProperties(&device_properties[i], i));
    if (!device_properties[i].cooperativeMultiDeviceLaunch) {
      HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
      return;
    }
  }

  // Test whether the requested size will fit on the GPU: use the smallest warp
  // size and multiprocessor count found among the devices.
  int warp_size = INT_MAX;
  int num_sm = INT_MAX;
  for (int i = 0; i < num_devices; i++) {
    if (device_properties[i].warpSize < warp_size) {
      warp_size = device_properties[i].warpSize;
    }
    if (device_properties[i].multiProcessorCount < num_sm) {
      num_sm = device_properties[i].multiProcessorCount;
    }
  }
  int num_threads_in_block = block_size * warp_size;

  // Calculate the device occupancy to know how many blocks can be run.
  int max_blocks_per_sm = INT_MAX;
  for (int i = 0; i < num_devices; i++) {
    HIP_CHECK(hipSetDevice(i));
    auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
    int blocks_per_sm = 0;
    HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_sm, test_kernel_used,
                                                           num_threads_in_block, 0));
    if (blocks_per_sm < max_blocks_per_sm) {
      max_blocks_per_sm = blocks_per_sm;
    }
  }

  int requested_blocks = warps / block_size;
  // A cooperative launch needs every block to be resident at the same time, so
  // skip (before allocating anything) when the grid exceeds the smallest device.
  if (static_cast<long long>(requested_blocks) >
      static_cast<long long>(max_blocks_per_sm) * num_sm) {
    HipTest::HIP_SKIP_TEST("Requested grid does not fit on the device for a cooperative launch");
    return;
  }

  // Each block will output a single value per loop.
  uint32_t total_buffer_len = requested_blocks * loops;

  // Allocate the buffer that will hold the kernel's output, and which will
  // also be used to globally synchronize during GWS initialization
  std::vector<std::vector<unsigned int>> host_buffer(
      num_devices, std::vector<unsigned int>(total_buffer_len, 0u));
  std::vector<unsigned int*> kernel_buffer(num_devices);
  std::vector<unsigned int*> kernel_atomic(num_devices);
  std::vector<hipStream_t> streams(num_devices);
  for (int i = 0; i < num_devices; i++) {
    HIP_CHECK(hipSetDevice(i));
    HIP_CHECK(hipMalloc(&kernel_buffer[i], sizeof(unsigned int) * total_buffer_len));
    HIP_CHECK(hipMemcpy(kernel_buffer[i], host_buffer[i].data(),
                        sizeof(unsigned int) * total_buffer_len, hipMemcpyHostToDevice));
    HIP_CHECK(hipMalloc(&kernel_atomic[i], sizeof(unsigned int)));
    HIP_CHECK(hipMemset(kernel_atomic[i], 0, sizeof(unsigned int)));
    HIP_CHECK(hipStreamCreate(&streams[i]));
  }

  // Single array shared between all devices; put it on the host
  unsigned int* global_array;
  HIP_CHECK(hipHostMalloc(&global_array, sizeof(unsigned int) * num_devices));
  HIP_CHECK(hipMemset(global_array, 0, num_devices * sizeof(unsigned int)));

  // Launch the kernels
  INFO("Launching a cooperative kernel with " << warps << " warps in " << requested_blocks
                                              << " thread blocks");
  std::vector<std::vector<void*>> dev_params(num_devices, std::vector<void*>(4, nullptr));
  std::vector<hipLaunchParams> md_params(num_devices);
  for (int i = 0; i < num_devices; i++) {
    HIP_CHECK(hipSetDevice(i));
    auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
    dev_params[i][0] = reinterpret_cast<void*>(&kernel_atomic[i]);
    dev_params[i][1] = reinterpret_cast<void*>(&global_array);
    dev_params[i][2] = reinterpret_cast<void*>(&kernel_buffer[i]);
    dev_params[i][3] = reinterpret_cast<void*>(&loops);
    md_params[i].func = reinterpret_cast<void*>(test_kernel_used);
    md_params[i].gridDim = requested_blocks;
    md_params[i].blockDim = num_threads_in_block;
    md_params[i].sharedMem = 0;
    md_params[i].stream = streams[i];
    md_params[i].args = dev_params[i].data();
  }
  HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params.data(), num_devices, 0));

  // Wait for every device, not only the one that happens to be current.
  for (int dev = 0; dev < num_devices; dev++) {
    HIP_CHECK(hipSetDevice(dev));
    HIP_CHECK(hipDeviceSynchronize());
  }

  // Read back the buffer to host
  for (int dev = 0; dev < num_devices; dev++) {
    HIP_CHECK(hipSetDevice(dev));
    HIP_CHECK(hipMemcpy(host_buffer[dev].data(), kernel_buffer[dev],
                        sizeof(unsigned int) * total_buffer_len, hipMemcpyDeviceToHost));
  }

  for (int dev = 0; dev < num_devices; dev++) {
    verify_barrier_buffer(loops, requested_blocks, host_buffer[dev].data(), num_devices);
  }
  for (int dev = 0; dev < num_devices; dev++) {
    verify_multi_gpu_buffer(loops, global_array[dev]);
  }

  // Release device and pinned host resources (host_buffer cleans up itself)
  HIP_CHECK(hipHostFree(global_array));
  for (int k = 0; k < num_devices; ++k) {
    HIP_CHECK(hipSetDevice(k));
    HIP_CHECK(hipFree(kernel_buffer[k]));
    HIP_CHECK(hipFree(kernel_atomic[k]));
    HIP_CHECK(hipStreamDestroy(streams[k]));
  }
}
@@ -0,0 +1,198 @@
/*
Copyright (c) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip_test_common.hh>
#include <hip/hip_cooperative_groups.h>
#include "hip_cg_common.hh"
namespace cg = cooperative_groups;
// Selects which tile-level shuffle routine the partition kernel runs:
// shflDown and shflXor drive a sum reduction, shflUp drives a prefix sum.
enum class TiledGroupShflTests { shflDown, shflXor, shflUp };
// Sum reduction across a tile using shfl_down with a halving offset.
// The thread with rank 0 in the tile returns the accumulated value; every other
// thread returns -1.
template <unsigned int tileSz>
__device__ int reduction_kernel_shfl_down(cg::thread_block_tile<tileSz> const& g,
                                          volatile int val) {
  const int tile_threads = g.size();
  int offset = tile_threads / 2;
  while (offset > 0) {
    val += g.shfl_down(val, offset);
    offset >>= 1;
  }
  // Only rank 0 holds a meaningful result after the last step.
  return (g.thread_rank() == 0) ? val : -1;
}
// Sum reduction across a tile using shfl_xor with a halving lane mask.
// The thread with rank 0 in the tile returns the accumulated value; every other
// thread returns -1.
template <unsigned int tileSz>
__device__ int reduction_kernel_shfl_xor(cg::thread_block_tile<tileSz> const& g, int val) {
  const int tile_threads = g.size();
  int lane_mask = tile_threads / 2;
  while (lane_mask > 0) {
    val += g.shfl_xor(val, lane_mask);
    lane_mask >>= 1;
  }
  // Callers only consume the value returned by rank 0.
  return (g.thread_rank() == 0) ? val : -1;
}
// Inclusive prefix sum across a tile using shfl_up with a doubling distance.
// Every thread returns the sum of `val` over all threads of the tile whose rank
// is less than or equal to its own.
template <unsigned int tileSz>
__device__ int prefix_sum_kernel(cg::thread_block_tile<tileSz> const& g, volatile int val) {
  const int tile_threads = g.size();
#pragma unroll
  for (int distance = 1; distance < tile_threads; distance *= 2) {
    // Every thread takes part in the shuffle; only threads that have a source
    // lane `distance` ranks below them accumulate the received value.
    const int from_below = g.shfl_up(val, distance);
    if (g.thread_rank() >= distance) {
      val += from_below;
    }
  }
  return val;
}
// Partitions the thread block into tiles of `tile_size` threads and runs the
// shuffle routine selected by `shfl_test`, using each thread's rank inside its
// tile as input.
//
// result     shflUp: one entry per thread (the thread's prefix sum);
//            shflDown / shflXor: one entry per tile (the tile's sum)
// shfl_test  which shuffle routine to exercise
template <unsigned int tile_size>
static __global__ void kernel_cg_group_partition_static(int* result,
                                                        TiledGroupShflTests shfl_test) {
  cg::thread_block thread_block_CG_ty = cg::this_thread_block();
  int input = 0;
  // Initialised so that no path can read an indeterminate value.
  int output_sum = 0;

  // Choose a leader thread to print the results
  if (thread_block_CG_ty.thread_rank() == 0) {
    // The operands are unsigned; convert explicitly to match the %d specifiers.
    printf(" Creating %d groups, of tile size %d threads:\n\n",
           static_cast<int>(thread_block_CG_ty.size() / tile_size), static_cast<int>(tile_size));
  }
  thread_block_CG_ty.sync();

  cg::thread_block_tile<tile_size> tiled_part = cg::tiled_partition<tile_size>(thread_block_CG_ty);
  input = tiled_part.thread_rank();

  switch (shfl_test) {
    case (TiledGroupShflTests::shflDown):
      output_sum = reduction_kernel_shfl_down(tiled_part, input);
      break;
    case (TiledGroupShflTests::shflXor):
      output_sum = reduction_kernel_shfl_xor(tiled_part, input);
      break;
    case (TiledGroupShflTests::shflUp):
      // The prefix sum yields a value for every thread, so store it per thread.
      output_sum = prefix_sum_kernel(tiled_part, input);
      result[thread_block_CG_ty.thread_rank()] = output_sum;
      break;
  }

  // For the reductions only the tile leader holds the sum: store one per tile.
  if (tiled_part.thread_rank() == 0 && shfl_test != TiledGroupShflTests::shflUp) {
    printf(" Sum of all ranks 0..%d in this tiled_part group is %d\n",
           static_cast<int>(tiled_part.size()) - 1, output_sum);
    result[thread_block_CG_ty.thread_rank() / (tile_size)] = output_sum;
  }
}
// Fills `expected_result` with the values the partition kernel should produce.
//
// shflDown / shflXor: every one of the `size` entries is the sum of the ranks
// 0..tile_size-1. shflUp: for each complete tile, entry j of the tile is the
// running sum 0 + 1 + ... + j. Any other value of `shfl_test` leaves the buffer
// untouched.
static void expected_result_calc(int* expected_result, int tile_size, int size,
                                 TiledGroupShflTests shfl_test) {
  if (shfl_test == TiledGroupShflTests::shflDown || shfl_test == TiledGroupShflTests::shflXor) {
    const int tile_total = ((tile_size - 1) * tile_size / 2);
    int idx = 0;
    while (idx < size) {
      expected_result[idx++] = tile_total;
    }
    return;
  }

  if (shfl_test == TiledGroupShflTests::shflUp) {
    const int num_tiles = size / tile_size;
    for (int tile = 0; tile < num_tiles; ++tile) {
      int* tile_out = expected_result + tile * tile_size;
      int running = 0;
      for (int lane = 0; lane < tile_size; ++lane) {
        running += lane;
        tile_out[lane] = running;
      }
    }
  }
}
// Runs the tiled-partition shuffle kernel for one tile size on a single block
// of 64 threads and compares the device output with the host-computed
// expectation.
//
// shfl_test  which shuffle routine the kernel should exercise
template <unsigned int tile_size>
static void test_group_partition(TiledGroupShflTests shfl_test) {
  int block_size = 1;
  int threads_per_blk = 64;

  // The reductions produce one value per tile, the prefix sum one per thread.
  int num_elem = (block_size * threads_per_blk) / tile_size;
  if (shfl_test == TiledGroupShflTests::shflUp) {
    num_elem = block_size * threads_per_blk;
  }

  int* expected_result = new int[num_elem];
  int* result_dev = nullptr;
  int* result_host = nullptr;

  HIP_CHECK(hipHostMalloc(&result_host, num_elem * sizeof(int), hipHostMallocDefault));
  memset(result_host, 0, num_elem * sizeof(int));
  HIP_CHECK(hipMalloc(&result_dev, num_elem * sizeof(int)));

  // Launch Kernel
  hipLaunchKernelGGL(kernel_cg_group_partition_static<tile_size>, block_size, threads_per_blk,
                     threads_per_blk * sizeof(int), 0, result_dev, shfl_test);
  // Report a failed launch here instead of as a confusing result mismatch later.
  HIP_CHECK(hipGetLastError());
  HIP_CHECK(hipDeviceSynchronize());

  HIP_CHECK(hipMemcpy(result_host, result_dev, sizeof(int) * num_elem, hipMemcpyDeviceToHost));

  expected_result_calc(expected_result, tile_size, num_elem, shfl_test);
  compareResults(expected_result, result_host, num_elem * sizeof(int));

  // Free all allocated memory on host and device
  HIP_CHECK(hipFree(result_dev));
  HIP_CHECK(hipHostFree(result_host));
  delete[] expected_result;
}
// Exercises shfl_down, shfl_xor and shfl_up on statically sized thread block
// tiles of 2, 4, 8, 16 and 32 threads.
TEST_CASE("Unit_hipCGThreadBlockTileType_Shfl") {
  // The test runs on whichever device is currently selected.
  int current_device;
  HIP_CHECK(hipGetDevice(&current_device));

  hipDeviceProp_t props;
  HIP_CHECK(hipGetDeviceProperties(&props, current_device));
  if (!props.cooperativeLaunch) {
    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
    return;
  }

  // One run of the test case per shuffle flavour.
  TiledGroupShflTests shfl_test = GENERATE(
      TiledGroupShflTests::shflDown, TiledGroupShflTests::shflXor, TiledGroupShflTests::shflUp);

  // Tile sizes are template arguments, so each size is instantiated explicitly.
  test_group_partition<2>(shfl_test);
  test_group_partition<4>(shfl_test);
  test_group_partition<8>(shfl_test);
  test_group_partition<16>(shfl_test);
  test_group_partition<32>(shfl_test);
}
-177
파일 보기
@@ -1,177 +0,0 @@
/*
Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp
* TEST: %t
* HIT_END
*/
#include <hip_test_common.hh>
#include <hip/hip_cooperative_groups.h>
#include <cstdlib>
// Asserts that two expressions compare equal. Both arguments are parenthesised
// so that operands containing lower-precedence operators (e.g. `a & b`) are
// compared as a whole.
#define ASSERT_EQUAL(lhs, rhs) HIPASSERT((lhs) == (rhs))
using namespace cooperative_groups;
// Kernel exercising the member functions of cooperative_groups::thread_block.
//
// Every thread writes, at its global index, the values returned by size(),
// thread_rank(), group_index(), thread_index() and group_dim(). The sync test
// has threads 0 and 1 of each block store 10 and 20 in shared memory; after
// tb.sync() every thread writes their product to syncTestD.
static __global__
void kernel_cg_thread_block_type(int *sizeTestD,
int *thdRankTestD,
int *syncTestD,
dim3 *groupIndexTestD,
dim3 *thdIndexTestD,
dim3 *groupDimTestD)
{
thread_block tb = this_thread_block();
// Global thread index (1-D launch)
int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x;
// Test size
sizeTestD[gIdx] = tb.size();
// Test thread_rank
thdRankTestD[gIdx] = tb.thread_rank();
// Test sync
// Both shared slots are only written when the block has at least two threads.
__shared__ int sm[2];
if (threadIdx.x == 0)
sm[0] = 10;
else if (threadIdx.x == 1)
sm[1] = 20;
tb.sync();
syncTestD[gIdx] = sm[1] * sm[0];
// Test group_index
groupIndexTestD[gIdx] = tb.group_index();
// Test thread_index
thdIndexTestD[gIdx] = tb.thread_index();
// Test group_dim aka number of threads in a block
groupDimTestD[gIdx] = tb.group_dim();
}
static void test_cg_thread_block_type(int blockSize)
{
int nBytes = sizeof(int) * 2 * blockSize;
int nDim3Bytes = sizeof(dim3) * 2 * blockSize;
int *sizeTestD, *sizeTestH;
int *thdRankTestD, *thdRankTestH;
int *syncTestD, *syncTestH;
dim3 *groupIndexTestD, *groupIndexTestH;
dim3 *thdIndexTestD, *thdIndexTestH, *groupDimTestD, *groupDimTestH;
// Allocate device memory
HIPCHECK(hipMalloc(&sizeTestD, nBytes));
HIPCHECK(hipMalloc(&thdRankTestD, nBytes));
HIPCHECK(hipMalloc(&syncTestD, nBytes));
HIPCHECK(hipMalloc(&groupIndexTestD, nDim3Bytes));
HIPCHECK(hipMalloc(&thdIndexTestD, nDim3Bytes));
HIPCHECK(hipMalloc(&groupDimTestD, nDim3Bytes));
// Allocate host memory
HIPCHECK(hipHostMalloc(&sizeTestH, nBytes));
HIPCHECK(hipHostMalloc(&thdRankTestH, nBytes));
HIPCHECK(hipHostMalloc(&syncTestH, nBytes));
HIPCHECK(hipHostMalloc(&groupIndexTestH, nDim3Bytes));
HIPCHECK(hipHostMalloc(&thdIndexTestH, nDim3Bytes));
HIPCHECK(hipHostMalloc(&groupDimTestH, nDim3Bytes));
// Launch Kernel
hipLaunchKernelGGL(kernel_cg_thread_block_type,
2,
blockSize,
0,
0,
sizeTestD,
thdRankTestD,
syncTestD,
groupIndexTestD,
thdIndexTestD,
groupDimTestD);
// Copy result from device to host
HIPCHECK(hipMemcpy(sizeTestH, sizeTestD, nBytes, hipMemcpyDeviceToHost));
HIPCHECK(hipMemcpy(thdRankTestH, thdRankTestD, nBytes, hipMemcpyDeviceToHost));
HIPCHECK(hipMemcpy(syncTestH, syncTestD, nBytes, hipMemcpyDeviceToHost));
HIPCHECK(hipMemcpy(groupIndexTestH, groupIndexTestD, nDim3Bytes, hipMemcpyDeviceToHost));
HIPCHECK(hipMemcpy(thdIndexTestH, thdIndexTestD, nDim3Bytes, hipMemcpyDeviceToHost));
HIPCHECK(hipMemcpy(groupDimTestH, groupDimTestD, nDim3Bytes, hipMemcpyDeviceToHost));
// Validate results for both blocks together
for (int i = 0; i < 2 * blockSize; ++i) {
ASSERT_EQUAL(sizeTestH[i], blockSize);
ASSERT_EQUAL(thdRankTestH[i], i % blockSize);
ASSERT_EQUAL(syncTestH[i], 200);
ASSERT_EQUAL(groupIndexTestH[i].x, (uint) i / blockSize);
ASSERT_EQUAL(groupIndexTestH[i].y, 0);
ASSERT_EQUAL(groupIndexTestH[i].z, 0);
ASSERT_EQUAL(thdIndexTestH[i].x, (uint) i % blockSize);
ASSERT_EQUAL(thdIndexTestH[i].y, 0);
ASSERT_EQUAL(thdIndexTestH[i].z, 0);
ASSERT_EQUAL(groupDimTestH[i].x, blockSize);
ASSERT_EQUAL(groupDimTestH[i].y, 1);
ASSERT_EQUAL(groupDimTestH[i].z, 1);
}
// Free device memory
HIPCHECK(hipFree(sizeTestD));
HIPCHECK(hipFree(thdRankTestD));
HIPCHECK(hipFree(syncTestD));
HIPCHECK(hipFree(groupIndexTestD));
HIPCHECK(hipFree(thdIndexTestD));
HIPCHECK(hipFree(groupDimTestD));
//Free host memory
HIPCHECK(hipHostFree(sizeTestH));
HIPCHECK(hipHostFree(thdRankTestH));
HIPCHECK(hipHostFree(syncTestH));
HIPCHECK(hipHostFree(groupIndexTestH));
HIPCHECK(hipHostFree(thdIndexTestH));
HIPCHECK(hipHostFree(groupDimTestH));
}
/**
 * Runs test_cg_thread_block_type for every power-of-two block size the device
 * supports and for ten pseudo-random block sizes. The test is skipped when the
 * device does not report cooperative launch support.
 */
TEST_CASE("Unit_hipCGThreadBlockType") {
  // Query the properties of the currently selected device
  int currentDevice;
  hipDeviceProp_t props;
  HIPCHECK(hipGetDevice(&currentDevice));
  HIPCHECK(hipGetDeviceProperties(&props, currentDevice));

  if (!props.cooperativeLaunch) {
    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
    return;
  }

  const int threadLimit = props.maxThreadsPerBlock;

  // Block sizes 2, 4, 8, ... up to the per-block thread limit
  int blockSize = 2;
  while (blockSize <= threadLimit) {
    test_cg_thread_block_type(blockSize);
    blockSize *= 2;
  }

  // Pseudo-random block sizes; the fixed seed keeps the sequence reproducible
  srand(0);
  for (int iter = 0; iter < 10; iter++) {
    const int candidate = rand() % threadLimit;
    // At least two threads are needed: the test fails with a single thread per block
    test_cg_thread_block_type(max(2, candidate));
  }
}
@@ -1,136 +0,0 @@
/*
Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp
* TEST: %t
* HIT_END
*/
#include <hip_test_common.hh>
#include "hip/hip_cooperative_groups.h"
#include <cstdlib>
// Asserts that two expressions compare equal. Both arguments are parenthesized
// so that operands containing operators of lower precedence than `==`
// (e.g. `a & b`) are compared as written. Like any assert(), the check is
// compiled out when NDEBUG is defined.
#define ASSERT_EQUAL(lhs, rhs) assert((lhs) == (rhs))
using namespace cooperative_groups;
/**
 * Exercises a thread block group through the thread_group base type.
 *
 * Every thread stores, at its global thread index, the group size, its rank in
 * the group, and the product of two shared-memory values that threads 0 and 1
 * of the block write before the group-wide sync().
 */
static __global__
void kernel_cg_thread_block_type_via_base_type(int *sizeTestD,
                                               int *thdRankTestD,
                                               int *syncTestD)
{
  // Written by threads 0 and 1 of the block, read by all threads after sync()
  __shared__ int slots[2];

  thread_group group = this_thread_block();
  const int tid = threadIdx.x + (blockIdx.x * blockDim.x);

  // size()
  sizeTestD[tid] = group.size();

  // thread_rank()
  thdRankTestD[tid] = group.thread_rank();

  // sync(): the product below is only 200 if both stores are visible
  switch (threadIdx.x) {
    case 0:
      slots[0] = 10;
      break;
    case 1:
      slots[1] = 20;
      break;
    default:
      break;
  }
  group.sync();
  syncTestD[tid] = slots[1] * slots[0];
}
/**
 * Host-side driver for kernel_cg_thread_block_type_via_base_type.
 *
 * Launches the kernel on two blocks of `blockSize` threads and asserts the
 * size, thread rank and sync results written by every thread.
 *
 * @param blockSize Number of threads in each of the two launched blocks.
 */
static void test_cg_thread_block_type_via_base_type(int blockSize)
{
  const int kNumBlocks = 2;
  const int totalThreads = kNumBlocks * blockSize;
  const size_t bufBytes = sizeof(int) * totalThreads;

  int *dSize, *dRank, *dSync;
  int *hSize, *hRank, *hSync;

  // Device buffers: one int per launched thread
  HIPCHECK(hipMalloc(&dSize, bufBytes));
  HIPCHECK(hipMalloc(&dRank, bufBytes));
  HIPCHECK(hipMalloc(&dSync, bufBytes));

  // Matching host buffers
  HIPCHECK(hipHostMalloc(&hSize, bufBytes));
  HIPCHECK(hipHostMalloc(&hRank, bufBytes));
  HIPCHECK(hipHostMalloc(&hSync, bufBytes));

  // Two blocks, no dynamic shared memory, default stream
  hipLaunchKernelGGL(kernel_cg_thread_block_type_via_base_type, kNumBlocks, blockSize, 0, 0,
                     dSize, dRank, dSync);

  // Fetch the results
  HIPCHECK(hipMemcpy(hSize, dSize, bufBytes, hipMemcpyDeviceToHost));
  HIPCHECK(hipMemcpy(hRank, dRank, bufBytes, hipMemcpyDeviceToHost));
  HIPCHECK(hipMemcpy(hSync, dSync, bufBytes, hipMemcpyDeviceToHost));

  // Check the entries of both blocks in a single pass
  for (int t = 0; t < totalThreads; ++t) {
    const int lane = t % blockSize;
    ASSERT_EQUAL(hSize[t], blockSize);
    ASSERT_EQUAL(hRank[t], lane);
    // 200 == 10 * 20, the two values written to shared memory before sync()
    ASSERT_EQUAL(hSync[t], 200);
  }

  // Release device buffers
  HIPCHECK(hipFree(dSize));
  HIPCHECK(hipFree(dRank));
  HIPCHECK(hipFree(dSync));

  // Release host buffers
  HIPCHECK(hipHostFree(hSize));
  HIPCHECK(hipHostFree(hRank));
  HIPCHECK(hipHostFree(hSync));
}
/**
 * Runs test_cg_thread_block_type_via_base_type for every power-of-two block
 * size the device supports and for ten pseudo-random block sizes. The test is
 * skipped when the device does not report cooperative launch support.
 */
TEST_CASE("Unit_hipCGThreadBlockType_BaseType") {
  // Query the properties of the currently selected device
  int currentDevice;
  hipDeviceProp_t props;
  HIPCHECK(hipGetDevice(&currentDevice));
  HIPCHECK(hipGetDeviceProperties(&props, currentDevice));

  if (!props.cooperativeLaunch) {
    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
    return;
  }

  const int threadLimit = props.maxThreadsPerBlock;

  // Block sizes 2, 4, 8, ... up to the per-block thread limit
  int blockSize = 2;
  while (blockSize <= threadLimit) {
    test_cg_thread_block_type_via_base_type(blockSize);
    blockSize *= 2;
  }

  // Pseudo-random block sizes; the fixed seed keeps the sequence reproducible
  srand(0);
  for (int iter = 0; iter < 10; iter++) {
    const int candidate = rand() % threadLimit;
    // At least two threads are needed: the test fails with a single thread per block
    test_cg_thread_block_type_via_base_type(max(2, candidate));
  }
}
@@ -1,136 +0,0 @@
/*
Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp
* TEST: %t
* HIT_END
*/
#include <hip_test_common.hh>
#include "hip/hip_cooperative_groups.h"
#include <cstdlib>
// Asserts that two expressions compare equal. Both arguments are parenthesized
// so that operands containing operators of lower precedence than `==`
// (e.g. `a & b`) are compared as written. Like any assert(), the check is
// compiled out when NDEBUG is defined.
#define ASSERT_EQUAL(lhs, rhs) assert((lhs) == (rhs))
using namespace cooperative_groups;
/**
 * Exercises a thread block group through the free-function API
 * (group_size, thread_rank, sync).
 *
 * Every thread stores, at its global thread index, the group size, its rank in
 * the group, and the product of two shared-memory values that threads 0 and 1
 * of the block write before the group-wide sync.
 */
static __global__
void kernel_cg_thread_block_type_via_public_api(int *sizeTestD,
                                                int *thdRankTestD,
                                                int *syncTestD)
{
  // Written by threads 0 and 1 of the block, read by all threads after the sync
  __shared__ int slots[2];

  thread_block block = this_thread_block();
  const int tid = threadIdx.x + (blockIdx.x * blockDim.x);

  // group_size()
  sizeTestD[tid] = group_size(block);

  // thread_rank()
  thdRankTestD[tid] = thread_rank(block);

  // sync(): the product below is only 200 if both stores are visible
  switch (threadIdx.x) {
    case 0:
      slots[0] = 10;
      break;
    case 1:
      slots[1] = 20;
      break;
    default:
      break;
  }
  sync(block);
  syncTestD[tid] = slots[1] * slots[0];
}
/**
 * Host-side driver for kernel_cg_thread_block_type_via_public_api.
 *
 * Launches the kernel on two blocks of `blockSize` threads and asserts the
 * size, thread rank and sync results written by every thread.
 *
 * @param blockSize Number of threads in each of the two launched blocks.
 */
static void test_cg_thread_block_type_via_public_api(int blockSize)
{
  const int kNumBlocks = 2;
  const int totalThreads = kNumBlocks * blockSize;
  const size_t bufBytes = sizeof(int) * totalThreads;

  int *dSize, *dRank, *dSync;
  int *hSize, *hRank, *hSync;

  // Device buffers: one int per launched thread
  HIPCHECK(hipMalloc(&dSize, bufBytes));
  HIPCHECK(hipMalloc(&dRank, bufBytes));
  HIPCHECK(hipMalloc(&dSync, bufBytes));

  // Matching host buffers
  HIPCHECK(hipHostMalloc(&hSize, bufBytes));
  HIPCHECK(hipHostMalloc(&hRank, bufBytes));
  HIPCHECK(hipHostMalloc(&hSync, bufBytes));

  // Two blocks, no dynamic shared memory, default stream
  hipLaunchKernelGGL(kernel_cg_thread_block_type_via_public_api, kNumBlocks, blockSize, 0, 0,
                     dSize, dRank, dSync);

  // Fetch the results
  HIPCHECK(hipMemcpy(hSize, dSize, bufBytes, hipMemcpyDeviceToHost));
  HIPCHECK(hipMemcpy(hRank, dRank, bufBytes, hipMemcpyDeviceToHost));
  HIPCHECK(hipMemcpy(hSync, dSync, bufBytes, hipMemcpyDeviceToHost));

  // Check the entries of both blocks in a single pass
  for (int t = 0; t < totalThreads; ++t) {
    const int lane = t % blockSize;
    ASSERT_EQUAL(hSize[t], blockSize);
    ASSERT_EQUAL(hRank[t], lane);
    // 200 == 10 * 20, the two values written to shared memory before the sync
    ASSERT_EQUAL(hSync[t], 200);
  }

  // Release device buffers
  HIPCHECK(hipFree(dSize));
  HIPCHECK(hipFree(dRank));
  HIPCHECK(hipFree(dSync));

  // Release host buffers
  HIPCHECK(hipHostFree(hSize));
  HIPCHECK(hipHostFree(hRank));
  HIPCHECK(hipHostFree(hSync));
}
/**
 * Runs test_cg_thread_block_type_via_public_api for every power-of-two block
 * size the device supports and for ten pseudo-random block sizes. The test is
 * skipped when the device does not report cooperative launch support.
 */
TEST_CASE("Unit_hipCGThreadBlockType_PublicApi") {
  // Query the properties of the currently selected device
  int currentDevice;
  hipDeviceProp_t props;
  HIPCHECK(hipGetDevice(&currentDevice));
  HIPCHECK(hipGetDeviceProperties(&props, currentDevice));

  if (!props.cooperativeLaunch) {
    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
    return;
  }

  const int threadLimit = props.maxThreadsPerBlock;

  // Block sizes 2, 4, 8, ... up to the per-block thread limit
  int blockSize = 2;
  while (blockSize <= threadLimit) {
    test_cg_thread_block_type_via_public_api(blockSize);
    blockSize *= 2;
  }

  // Pseudo-random block sizes; the fixed seed keeps the sequence reproducible
  srand(0);
  for (int iter = 0; iter < 10; iter++) {
    const int candidate = rand() % threadLimit;
    // At least two threads are needed: the test fails with a single thread per block
    test_cg_thread_block_type_via_public_api(max(2, candidate));
  }
}
+225
파일 보기
@@ -0,0 +1,225 @@
/*
Copyright (c) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip_test_common.hh>
#include <hip/hip_cooperative_groups.h>
#include "hip_cg_common.hh"
namespace cg = cooperative_groups;
// Identifies the thread block test variant to run: the thread_block member
// API (basicApi), the thread_group base type (baseType), or the free-function
// API (publicApi).
enum class ThreadBlockTypeTests { basicApi, baseType, publicApi };
/**
 * Exercises the cg::thread_block member API.
 *
 * Every thread stores, at its global thread index, the group size, its rank,
 * the product of two shared-memory values written by threads 0 and 1 before
 * sync(), and the group_index(), thread_index() and group_dim() triples.
 */
static __global__ void kernel_cg_thread_block_type(int* size_dev, int* thd_rank_dev, int* sync_dev,
                                                   dim3* group_index_dev, dim3* thd_index_dev,
                                                   dim3* group_dim_dev) {
  // Written by threads 0 and 1 of the block, read by all threads after sync()
  __shared__ int slots[2];

  cg::thread_block block = cg::this_thread_block();
  const int tid = threadIdx.x + (blockIdx.x * blockDim.x);

  // size()
  size_dev[tid] = block.size();

  // thread_rank()
  thd_rank_dev[tid] = block.thread_rank();

  // sync(): the product below is only 200 if both stores are visible
  switch (threadIdx.x) {
    case 0:
      slots[0] = 10;
      break;
    case 1:
      slots[1] = 20;
      break;
    default:
      break;
  }
  block.sync();
  sync_dev[tid] = slots[1] * slots[0];

  // group_index()
  group_index_dev[tid] = block.group_index();

  // thread_index()
  thd_index_dev[tid] = block.thread_index();

  // group_dim(), i.e. the number of threads in a block
  group_dim_dev[tid] = block.group_dim();
}
/**
 * Exercises a thread block group through the cg::thread_group base type.
 *
 * Every thread stores, at its global thread index, the group size, its rank in
 * the group, and the product of two shared-memory values that threads 0 and 1
 * of the block write before the group-wide sync().
 */
static __global__ void kernel_cg_thread_block_type_via_base_type(int* size_dev, int* thd_rank_dev,
                                                                 int* sync_dev) {
  // Written by threads 0 and 1 of the block, read by all threads after sync()
  __shared__ int slots[2];

  cg::thread_group group = cg::this_thread_block();
  const int tid = threadIdx.x + (blockIdx.x * blockDim.x);

  // size()
  size_dev[tid] = group.size();

  // thread_rank()
  thd_rank_dev[tid] = group.thread_rank();

  // sync(): the product below is only 200 if both stores are visible
  switch (threadIdx.x) {
    case 0:
      slots[0] = 10;
      break;
    case 1:
      slots[1] = 20;
      break;
    default:
      break;
  }
  group.sync();
  sync_dev[tid] = slots[1] * slots[0];
}
/**
 * Exercises a thread block group through the free-function API
 * (cg::group_size, cg::thread_rank, cg::sync).
 *
 * Every thread stores, at its global thread index, the group size, its rank in
 * the group, and the product of two shared-memory values that threads 0 and 1
 * of the block write before the group-wide sync.
 */
static __global__ void kernel_cg_thread_block_type_via_public_api(int* size_dev, int* thd_rank_dev,
                                                                  int* sync_dev) {
  // Written by threads 0 and 1 of the block, read by all threads after the sync
  __shared__ int slots[2];

  cg::thread_block block = cg::this_thread_block();
  const int tid = threadIdx.x + (blockIdx.x * blockDim.x);

  // group_size()
  size_dev[tid] = cg::group_size(block);

  // thread_rank()
  thd_rank_dev[tid] = cg::thread_rank(block);

  // sync(): the product below is only 200 if both stores are visible
  switch (threadIdx.x) {
    case 0:
      slots[0] = 10;
      break;
    case 1:
      slots[1] = 20;
      break;
    default:
      break;
  }
  cg::sync(block);
  sync_dev[tid] = slots[1] * slots[0];
}
/**
 * Host-side driver shared by the three thread block test variants.
 *
 * Launches the kernel selected by `test_type` on two blocks of `block_size`
 * threads and asserts the size, thread rank and sync results of every thread.
 * For ThreadBlockTypeTests::basicApi the group index, thread index and group
 * dimension results are allocated, copied back and checked as well.
 *
 * @param test_type  Kernel variant to launch.
 * @param block_size Number of threads in each of the two launched blocks.
 */
static void test_cg_thread_block_type(ThreadBlockTypeTests test_type, int block_size) {
  const bool with_dim3 = (test_type == ThreadBlockTypeTests::basicApi);
  const int total_threads = 2 * block_size;
  const size_t int_bytes = sizeof(int) * total_threads;
  const size_t dim3_bytes = sizeof(dim3) * total_threads;

  int *d_size, *d_rank, *d_sync;
  int *h_size, *h_rank, *h_sync;
  // Only allocated and used for the basicApi variant
  dim3 *d_group_index, *d_thd_index, *d_group_dim;
  dim3 *h_group_index, *h_thd_index, *h_group_dim;

  // Device buffers common to all variants
  HIP_CHECK(hipMalloc(&d_size, int_bytes));
  HIP_CHECK(hipMalloc(&d_rank, int_bytes));
  HIP_CHECK(hipMalloc(&d_sync, int_bytes));

  // Host buffers common to all variants
  HIP_CHECK(hipHostMalloc(&h_size, int_bytes));
  HIP_CHECK(hipHostMalloc(&h_rank, int_bytes));
  HIP_CHECK(hipHostMalloc(&h_sync, int_bytes));

  // Launch the kernel that matches the requested variant
  if (test_type == ThreadBlockTypeTests::basicApi) {
    HIP_CHECK(hipMalloc(&d_group_index, dim3_bytes));
    HIP_CHECK(hipMalloc(&d_thd_index, dim3_bytes));
    HIP_CHECK(hipMalloc(&d_group_dim, dim3_bytes));
    HIP_CHECK(hipHostMalloc(&h_group_index, dim3_bytes));
    HIP_CHECK(hipHostMalloc(&h_thd_index, dim3_bytes));
    HIP_CHECK(hipHostMalloc(&h_group_dim, dim3_bytes));
    hipLaunchKernelGGL(kernel_cg_thread_block_type, 2, block_size, 0, 0, d_size, d_rank, d_sync,
                       d_group_index, d_thd_index, d_group_dim);
  } else if (test_type == ThreadBlockTypeTests::baseType) {
    hipLaunchKernelGGL(kernel_cg_thread_block_type_via_base_type, 2, block_size, 0, 0, d_size,
                       d_rank, d_sync);
  } else if (test_type == ThreadBlockTypeTests::publicApi) {
    hipLaunchKernelGGL(kernel_cg_thread_block_type_via_public_api, 2, block_size, 0, 0, d_size,
                       d_rank, d_sync);
  }

  // Fetch the results
  HIP_CHECK(hipMemcpy(h_size, d_size, int_bytes, hipMemcpyDeviceToHost));
  HIP_CHECK(hipMemcpy(h_rank, d_rank, int_bytes, hipMemcpyDeviceToHost));
  HIP_CHECK(hipMemcpy(h_sync, d_sync, int_bytes, hipMemcpyDeviceToHost));
  if (with_dim3) {
    HIP_CHECK(hipMemcpy(h_group_index, d_group_index, dim3_bytes, hipMemcpyDeviceToHost));
    HIP_CHECK(hipMemcpy(h_thd_index, d_thd_index, dim3_bytes, hipMemcpyDeviceToHost));
    HIP_CHECK(hipMemcpy(h_group_dim, d_group_dim, dim3_bytes, hipMemcpyDeviceToHost));
  }

  // Check the entries of both blocks in a single pass
  for (int t = 0; t < total_threads; ++t) {
    const int lane = t % block_size;

    ASSERT_EQUAL(h_size[t], block_size);
    ASSERT_EQUAL(h_rank[t], lane);
    // 200 == 10 * 20, the two values written to shared memory before the sync
    ASSERT_EQUAL(h_sync[t], 200);

    if (!with_dim3) {
      continue;
    }

    ASSERT_EQUAL(h_group_index[t].x, (uint)t / block_size);
    ASSERT_EQUAL(h_group_index[t].y, 0);
    ASSERT_EQUAL(h_group_index[t].z, 0);

    ASSERT_EQUAL(h_thd_index[t].x, (uint)t % block_size);
    ASSERT_EQUAL(h_thd_index[t].y, 0);
    ASSERT_EQUAL(h_thd_index[t].z, 0);

    ASSERT_EQUAL(h_group_dim[t].x, block_size);
    ASSERT_EQUAL(h_group_dim[t].y, 1);
    ASSERT_EQUAL(h_group_dim[t].z, 1);
  }

  // Release the buffers common to all variants
  HIP_CHECK(hipFree(d_size));
  HIP_CHECK(hipFree(d_rank));
  HIP_CHECK(hipFree(d_sync));
  HIP_CHECK(hipHostFree(h_size));
  HIP_CHECK(hipHostFree(h_rank));
  HIP_CHECK(hipHostFree(h_sync));

  // Release the dim3 buffers of the basicApi variant
  if (with_dim3) {
    HIP_CHECK(hipFree(d_group_index));
    HIP_CHECK(hipFree(d_thd_index));
    HIP_CHECK(hipFree(d_group_dim));
    HIP_CHECK(hipHostFree(h_group_index));
    HIP_CHECK(hipHostFree(h_thd_index));
    HIP_CHECK(hipHostFree(h_group_dim));
  }
}
/**
 * Runs test_cg_thread_block_type once per SECTION (member API, base type,
 * free-function API) for every power-of-two block size the device supports
 * and for ten pseudo-random block sizes. The test is skipped when the device
 * does not report cooperative launch support.
 */
TEST_CASE("Unit_hipCGThreadBlockType") {
  // Query the properties of the currently selected device
  int dev_id;
  hipDeviceProp_t props;
  HIP_CHECK(hipGetDevice(&dev_id));
  HIP_CHECK(hipGetDeviceProperties(&props, dev_id));

  if (!props.cooperativeLaunch) {
    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
    return;
  }

  // Each section only selects the variant; the code below runs it
  ThreadBlockTypeTests test_type = ThreadBlockTypeTests::basicApi;

  SECTION("Default thread block API test") { test_type = ThreadBlockTypeTests::basicApi; }

  SECTION("Base type thread block API test") { test_type = ThreadBlockTypeTests::baseType; }

  SECTION("Public API thread block test") { test_type = ThreadBlockTypeTests::publicApi; }

  const int thread_limit = props.maxThreadsPerBlock;

  // Block sizes 2, 4, 8, ... up to the per-block thread limit
  int block_size = 2;
  while (block_size <= thread_limit) {
    test_cg_thread_block_type(test_type, block_size);
    block_size *= 2;
  }

  // Pseudo-random block sizes; the fixed seed keeps the sequence reproducible
  srand(0);
  for (int iter = 0; iter < 10; iter++) {
    const int candidate = rand() % thread_limit;
    // At least two threads are needed: the test fails with a single thread per block
    test_cg_thread_block_type(test_type, max(2, candidate));
  }
}
-385
파일 보기
@@ -1,385 +0,0 @@
/*
Copyright (c) 2020 - 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
// Test Description:
/* This test implements a sum reduction kernel. In the first part each thread
contributes its own rank as input, and the resulting sum is compared with the
expected value derived from the n(n-1)/2 formula. The second part partitions
the parent group into child subgroups, a.k.a. tiles, using the tiled_partition()
collective operation. This can be called with a static tile size, passed as a
non-type template argument - tiled_partition<tileSz> - or at runtime as
tiled_partition(thread_group parent, tileSz). This test covers both of these
cases.
This test checks the functionality of cg group partitioning (static and dynamic) and its
respective APIs size(), thread_rank(), and sync().
*/
#include <hip_test_common.hh>
#include <hip/hip_cooperative_groups.h>
#include <stdio.h>
#include <vector>
using namespace cooperative_groups;
/* Parallel sum reduction across the threads of a group.
 *
 * Step complexity: O(log n)
 * Work complexity: O(n)
 *
 * g   - group whose threads take part in the reduction
 * x   - scratch array indexed by thread rank within the group
 * val - value contributed by the calling thread
 *
 * Returns the reduced sum on the thread with rank 0 and -1 on all others.
 *
 * Note: This kernel works only with power of 2 input arrays.
 */
__device__ int reduction_kernel(thread_group g, int* x, int val) {
  const int rank = g.thread_rank();

  int stride = g.size() / 2;
  while (stride > 0) {
    // Publish the partial result through the scratch array
    x[rank] = val;
    // Wait until every thread of the group has stored its value
    g.sync();

    if (rank < stride) {
      val += x[rank + stride];
    }
    // All reads of this step must finish before the next step overwrites
    // the scratch array
    g.sync();

    stride /= 2;
  }

  // Only rank 0 holds the complete sum; the other threads return a dummy value
  return (rank == 0) ? val : -1;
}
/* Reduces the thread ranks of the whole block, then splits the block into
 * tiles of the compile-time size tileSz and reduces the ranks of each tile.
 *
 * result      - receives one sum per tile, written by the tile's rank-0 thread
 * isGlobalMem - when true, globalMem is used as scratch space; otherwise the
 *               dynamically sized shared memory array is used
 * globalMem   - scratch buffer used when isGlobalMem is true
 */
template <unsigned int tileSz>
__global__ void kernel_cg_group_partition_static(int* result, bool isGlobalMem, int* globalMem) {
  thread_block block = this_thread_block();
  int blockThreads = block.size();

  // Dynamically sized shared memory; only used when no global buffer is requested
  extern __shared__ int sharedMem[];
  int* scratch = isGlobalMem ? globalMem : sharedMem;

  // Each thread contributes its own rank within the block
  int myRank = block.thread_rank();
  int wantSum = (blockThreads - 1) * blockThreads / 2;
  int gotSum = reduction_kernel(block, scratch, myRank);

  // The leader thread reports the block-wide result
  if (block.thread_rank() == 0) {
    printf(" Sum of all ranks 0..%d in threadBlockCooperativeGroup is %d (expected %d)\n\n",
           (int)block.size() - 1, gotSum, wantSum);
    printf(" Creating %d groups, of tile size %d threads:\n\n",
           (int)block.size() / tileSz, tileSz);
  }
  block.sync();

  thread_block_tile<tileSz> tile = tiled_partition<tileSz>(block);

  // Offset of this tile's first thread, so every tile works on its own
  // region of the scratch array
  int tileBase = block.thread_rank() - tile.thread_rank();
  gotSum = reduction_kernel(tile, scratch + tileBase, myRank);

  if (tile.thread_rank() == 0) {
    printf(
        " Sum of all ranks 0..%d in this tiledPartition group is %d. Corresponding parent thread "
        "rank via meta_group_rank : %d and the total number of groups created when partitioned : %d\n",
        tile.size() - 1, gotSum, tile.meta_group_rank(), tile.meta_group_size());
    result[myRank / (tileSz)] = gotSum;
  }
}
/* Reduces the thread ranks of the whole block, then splits the block into
 * tiles of the runtime size tileSz and reduces the ranks of each tile.
 *
 * tileSz      - number of threads per tile
 * result      - receives one sum per tile, written by the tile's rank-0 thread
 * isGlobalMem - when true, globalMem is used as scratch space; otherwise the
 *               dynamically sized shared memory array is used
 * globalMem   - scratch buffer used when isGlobalMem is true
 */
__global__ void kernel_cg_group_partition_dynamic(unsigned int tileSz, int* result,
                                                  bool isGlobalMem, int* globalMem) {
  thread_block block = this_thread_block();

  // Dynamically sized shared memory; only used when no global buffer is requested
  extern __shared__ int sharedMem[];
  int* scratch = isGlobalMem ? globalMem : sharedMem;

  // Each thread contributes its own rank within the block
  int myRank = block.thread_rank();
  int gotSum = reduction_kernel(block, scratch, myRank);

  // The leader thread reports the block-wide result
  if (block.thread_rank() == 0) {
    printf(" Sum of all ranks 0..%d in threadBlockCooperativeGroup is %d\n\n",
           (int)block.size() - 1, gotSum);
    printf(" Creating %d groups, of tile size %d threads:\n\n",
           (int)block.size() / tileSz, tileSz);
  }
  block.sync();

  thread_group tile = tiled_partition(block, tileSz);

  // Offset of this tile's first thread, so every tile works on its own
  // region of the scratch array
  int tileBase = block.thread_rank() - tile.thread_rank();
  gotSum = reduction_kernel(tile, scratch + tileBase, myRank);

  if (tile.thread_rank() == 0) {
    printf(
        " Sum of all ranks 0..%d in this tiledPartition group is %d. Corresponding parent thread "
        " %d\n", tile.size() - 1, gotSum, myRank);
    result[myRank / (tileSz)] = gotSum;
  }
}
// Checks that every expected sum occurs somewhere in the computed results.
//
// hPtr - expected sums (size entries)
// dPtr - sums produced by the kernel (size entries)
// size - number of entries in both arrays
//
// The position of a value inside dPtr is not significant. The test fails as
// soon as one expected value cannot be found.
void verifyResults(int* hPtr, int* dPtr, int size) {
  for (int i = 0; i < size; i++) {
    bool found = false;
    for (int j = 0; j < size; j++) {
      if (hPtr[i] == dPtr[j]) {
        found = true;
        break;
      }
    }
    if (!found) {
      fprintf(stderr, " Result verification failed!\n");
    }
    // Assert on the search outcome itself: asserting on a string literal is
    // always true and would never report a mismatch.
    REQUIRE(found);
  }
}
template <unsigned int tileSz> static void test_group_partition(bool useGlobalMem) {
hipError_t err;
int blockSize = 1;
int threadsPerBlock = 64;
int numTiles = (blockSize * threadsPerBlock) / tileSz;
// Build an array of expected reduction sum output on the host
// based on the sum of their respective thread ranks for verification.
// eg: parent group has 64threads.
// child thread ranks: 0-15, 16-31, 32-47, 48-63
// expected sum: 120, 376, 632, 888
int* expectedSum = new int[numTiles];
int temp = 0, sum = 0;
for (int i = 1; i <= numTiles; i++) {
sum = temp;
temp = (((tileSz * i) - 1) * (tileSz * i)) / 2;
expectedSum[i-1] = temp - sum;
}
int* dResult = NULL;
HIPCHECK(hipMalloc((void**)&dResult, numTiles * sizeof(int)));
int* globalMem = NULL;
if (useGlobalMem) {
HIPCHECK(hipMalloc((void**)&globalMem, threadsPerBlock * sizeof(int)));
}
int* hResult = NULL;
HIPCHECK(hipHostMalloc(&hResult, numTiles * sizeof(int), hipHostMallocDefault));
memset(hResult, 0, numTiles * sizeof(int));
if (useGlobalMem) {
// Launch Kernel
hipLaunchKernelGGL(kernel_cg_group_partition_static<tileSz>, blockSize, threadsPerBlock, 0, 0,
dResult, useGlobalMem, globalMem);
err = hipDeviceSynchronize();
if (err != hipSuccess) {
fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
}
} else {
// Launch Kernel
hipLaunchKernelGGL(kernel_cg_group_partition_static<tileSz>, blockSize, threadsPerBlock,
threadsPerBlock * sizeof(int), 0, dResult, useGlobalMem, globalMem);
err = hipDeviceSynchronize();
if (err != hipSuccess) {
fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
}
}
HIPCHECK(hipMemcpy(hResult, dResult, numTiles * sizeof(int), hipMemcpyDeviceToHost));
verifyResults(expectedSum, hResult, numTiles);
// Free all allocated memory on host and device
HIPCHECK(hipFree(dResult));
HIPCHECK(hipFree(hResult));
if (useGlobalMem) {
HIPCHECK(hipFree(globalMem));
}
delete[] expectedSum;
printf("\n...PASSED.\n\n");
}
static void test_group_partition(unsigned int tileSz, bool useGlobalMem) {
hipError_t err;
int blockSize = 1;
int threadsPerBlock = 64;
int numTiles = (blockSize * threadsPerBlock) / tileSz;
// Build an array of expected reduction sum output on the host
// based on the sum of their respective thread ranks to use for verification
int* expectedSum = new int[numTiles];
int temp = 0, sum = 0;
for (int i = 1; i <= numTiles; i++) {
sum = temp;
temp = (((tileSz * i) - 1) * (tileSz * i)) / 2;
expectedSum[i-1] = temp - sum;
}
int* dResult = NULL;
HIPCHECK(hipMalloc(&dResult, sizeof(int) * numTiles));
int* globalMem = NULL;
if (useGlobalMem) {
HIPCHECK(hipMalloc((void**)&globalMem, threadsPerBlock * sizeof(int)));
}
int* hResult = NULL;
HIPCHECK(hipHostMalloc(&hResult, numTiles * sizeof(int), hipHostMallocDefault));
memset(hResult, 0, numTiles * sizeof(int));
// Launch Kernel
if (useGlobalMem) {
hipLaunchKernelGGL(kernel_cg_group_partition_dynamic, blockSize, threadsPerBlock, 0, 0, tileSz,
dResult, useGlobalMem, globalMem);
err = hipDeviceSynchronize();
if (err != hipSuccess) {
fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
}
} else {
hipLaunchKernelGGL(kernel_cg_group_partition_dynamic, blockSize, threadsPerBlock,
threadsPerBlock * sizeof(int), 0, tileSz, dResult, useGlobalMem, globalMem);
err = hipDeviceSynchronize();
if (err != hipSuccess) {
fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err));
}
}
HIPCHECK(hipMemcpy(hResult, dResult, numTiles * sizeof(int), hipMemcpyDeviceToHost));
verifyResults(expectedSum, hResult, numTiles);
// Free all allocated memory on host and device
HIPCHECK(hipFree(dResult));
HIPCHECK(hipFree(hResult));
if (useGlobalMem) {
HIPCHECK(hipFree(globalMem));
}
delete[] expectedSum;
printf("\n...PASSED.\n\n");
}
// Runs the static and dynamic tiled_partition tests for tile sizes
// 2, 4, 8, 16 and 32, each once with a global memory workspace and once with
// a shared memory workspace. The test is skipped when the device does not
// report cooperative launch support.
TEST_CASE("Unit_tiled_partition") {
  // Use default device for validating the test
  int deviceId;
  HIP_CHECK_ERROR(hipGetDevice(&deviceId), hipSuccess);
  hipDeviceProp_t deviceProperties;
  HIP_CHECK_ERROR(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess);

  if (!deviceProperties.cooperativeLaunch) {
    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
    // Return so that a skipped test does not go on to run the checks below
    return;
  }

  bool useGlobalMem = true;

  std::cout << "Testing static tiled_partition for different tile sizes" << std::endl;
  std::cout << "\nUsing global memory for computation\n";
  /* Test static tile_partition */
  std::cout << "TEST 1:" << '\n' << std::endl;
  test_group_partition<2>(useGlobalMem);
  std::cout << "TEST 2:" << '\n' << std::endl;
  test_group_partition<4>(useGlobalMem);
  std::cout << "TEST 3:" << '\n' << std::endl;
  test_group_partition<8>(useGlobalMem);
  std::cout << "TEST 4:" << '\n' << std::endl;
  test_group_partition<16>(useGlobalMem);
  std::cout << "TEST 5:" << '\n' << std::endl;
  test_group_partition<32>(useGlobalMem);

  useGlobalMem = false;
  std::cout << "Testing static tiled_partition for different tile sizes" << std::endl;
  std::cout << "\nUsing shared memory for computation\n";
  /* Test static tile_partition */
  std::cout << "TEST 1:" << '\n' << std::endl;
  test_group_partition<2>(useGlobalMem);
  std::cout << "TEST 2:" << '\n' << std::endl;
  test_group_partition<4>(useGlobalMem);
  std::cout << "TEST 3:" << '\n' << std::endl;
  test_group_partition<8>(useGlobalMem);
  std::cout << "TEST 4:" << '\n' << std::endl;
  test_group_partition<16>(useGlobalMem);
  std::cout << "TEST 5:" << '\n' << std::endl;
  test_group_partition<32>(useGlobalMem);

  std::cout << "Now testing dynamic tiled_partition for different tile sizes" << '\n' << std::endl;

  /* Test dynamic group partition*/
  useGlobalMem = true;
  int testNo = 1;
  std::vector<unsigned int> tileSizes = {2, 4, 8, 16, 32};
  std::cout << "\nUsing global memory for computation\n";
  for (auto tileSz : tileSizes) {
    std::cout << "TEST " << testNo << ":" << '\n' << std::endl;
    test_group_partition(tileSz, useGlobalMem);
    testNo++;
  }

  useGlobalMem = false;
  testNo = 1;
  std::cout << "\nUsing shared memory for computation\n";
  for (auto tileSz : tileSizes) {
    std::cout << "TEST " << testNo << ":" << '\n' << std::endl;
    test_group_partition(tileSz, useGlobalMem);
    testNo++;
  }

  printf("\n...PASSED.\n\n");
}
+279
파일 보기
@@ -0,0 +1,279 @@
/*
Copyright (c) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
// Test Description:
/* This test implements a sum reduction kernel. In the first part each thread
contributes its own rank as input, and the resulting sum is compared with the
expected value derived from the n(n-1)/2 formula. The second part partitions
the parent group into child subgroups, a.k.a. tiles, using the tiled_partition()
collective operation. This can be called with a static tile size, passed as a
non-type template argument - tiled_partition<tileSz> - or at runtime as
tiled_partition(thread_group parent, tileSz). This test covers both of these
cases.
This test checks the functionality of cg group partitioning (static and dynamic) and its
respective APIs size(), thread_rank(), and sync().
*/
#include <hip_test_common.hh>
#include <hip/hip_cooperative_groups.h>
#include <cstdlib>
#include "hip_cg_common.hh"
namespace cg = cooperative_groups;
/* Parallel sum reduction across the threads of a group.
 *
 * Step complexity: O(log n)
 * Work complexity: O(n)
 *
 * g   - group whose threads take part in the reduction
 * x   - scratch array indexed by thread rank within the group
 * val - value contributed by the calling thread
 *
 * Returns the reduced sum on the thread with rank 0 and -1 on all others.
 *
 * Note: This kernel works only with power of 2 input arrays.
 */
__device__ int reduction_kernel(cg::thread_group g, int* x, int val) {
  const int rank = g.thread_rank();

  int stride = g.size() / 2;
  while (stride > 0) {
    // Publish the partial result through the scratch array
    x[rank] = val;
    // Wait until every thread of the group has stored its value
    g.sync();

    if (rank < stride) {
      val += x[rank + stride];
    }
    // All reads of this step must finish before the next step overwrites
    // the scratch array
    g.sync();

    stride /= 2;
  }

  // Only rank 0 holds the complete sum; the other threads return a dummy value
  return (rank == 0) ? val : -1;
}
/* Reduces thread ranks over the whole thread block, then over compile-time
 * sized tiles created with cg::tiled_partition<tile_size>().
 *
 * result        - receives one reduced sum per tile, indexed by tile number
 * is_global_mem - when true the reduction scratch space is global_mem,
 *                 otherwise dynamically sized shared memory is used
 * global_mem    - global scratch buffer (only read when is_global_mem is true)
 */
template <unsigned int tile_size>
__global__ void kernel_cg_group_partition_static(int* result, bool is_global_mem, int* global_mem) {
  cg::thread_block thread_block_CG_ty = cg::this_thread_block();

  int* workspace = NULL;
  if (is_global_mem) {
    workspace = global_mem;
  } else {
    // Dynamically sized shared memory supplied at launch time
    extern __shared__ int shared_mem[];
    workspace = shared_mem;
  }

  int input, output_sum, expected_output;

  // input to reduction, for each thread, is its rank in the group
  input = thread_block_CG_ty.thread_rank();

  expected_output = (thread_block_CG_ty.size() - 1) * thread_block_CG_ty.size() / 2;

  output_sum = reduction_kernel(thread_block_CG_ty, workspace, input);

  if (thread_block_CG_ty.thread_rank() == 0) {
    // Every argument is converted to int explicitly so it matches its %d
    // conversion specifier (size() and tile_size are unsigned).
    printf(" Sum of all ranks 0..%d in threadBlockCooperativeGroup is %d (expected %d)\n\n",
           static_cast<int>(thread_block_CG_ty.size()) - 1, output_sum, expected_output);

    printf(" Creating %d groups, of tile size %d threads:\n\n",
           static_cast<int>(thread_block_CG_ty.size() / tile_size), static_cast<int>(tile_size));
  }

  // The block-wide reduction must be finished before the workspace is reused
  // by the per-tile reductions.
  thread_block_CG_ty.sync();

  cg::thread_block_tile<tile_size> tiled_part = cg::tiled_partition<tile_size>(thread_block_CG_ty);

  // This offset allows each group to have its own unique area in the workspace array
  int workspace_offset = thread_block_CG_ty.thread_rank() - tiled_part.thread_rank();

  output_sum = reduction_kernel(tiled_part, workspace + workspace_offset, input);

  if (tiled_part.thread_rank() == 0) {
    printf(
        " Sum of all ranks 0..%d in this tiledPartition group is %d. Corresponding parent thread "
        "rank: via meta_group_rank : %d and the total number of groups created when partitioned : "
        "%d\n",
        static_cast<int>(tiled_part.size()) - 1, output_sum,
        static_cast<int>(tiled_part.meta_group_rank()),
        static_cast<int>(tiled_part.meta_group_size()));
    // The tile leader's block rank divided by the tile size is the tile index.
    result[input / (tile_size)] = output_sum;
  }
  return;
}
/* Reduces thread ranks over the whole thread block, then over tiles whose size
 * is chosen at run time via cg::tiled_partition(parent, tile_size).
 *
 * tile_size     - number of threads per tile
 * result        - receives one reduced sum per tile, indexed by tile number
 * is_global_mem - when true the reduction scratch space is global_mem,
 *                 otherwise dynamically sized shared memory is used
 * global_mem    - global scratch buffer (only read when is_global_mem is true)
 */
__global__ void kernel_cg_group_partition_dynamic(unsigned int tile_size, int* result,
                                                  bool is_global_mem, int* global_mem) {
  cg::thread_block thread_block_CG_ty = cg::this_thread_block();

  int* workspace = NULL;
  if (is_global_mem) {
    workspace = global_mem;
  } else {
    // Dynamically sized shared memory supplied at launch time
    extern __shared__ int shared_mem[];
    workspace = shared_mem;
  }

  int input, output_sum;

  // input to reduction, for each thread, is its rank in the group
  input = thread_block_CG_ty.thread_rank();

  output_sum = reduction_kernel(thread_block_CG_ty, workspace, input);

  if (thread_block_CG_ty.thread_rank() == 0) {
    // Arguments are converted to int explicitly so they match %d
    // (size() and tile_size are unsigned).
    printf("\n\n\n Sum of all ranks 0..%d in threadBlockCooperativeGroup is %d\n\n",
           static_cast<int>(thread_block_CG_ty.size()) - 1, output_sum);

    printf(" Creating %d groups, of tile size %d threads:\n\n",
           static_cast<int>(thread_block_CG_ty.size() / tile_size), static_cast<int>(tile_size));
  }

  // The block-wide reduction must be finished before the workspace is reused
  // by the per-tile reductions.
  thread_block_CG_ty.sync();

  cg::thread_group tiled_part = cg::tiled_partition(thread_block_CG_ty, tile_size);

  // This offset allows each group to have its own unique area in the workspace array
  int workspace_offset = thread_block_CG_ty.thread_rank() - tiled_part.thread_rank();

  output_sum = reduction_kernel(tiled_part, workspace + workspace_offset, input);

  if (tiled_part.thread_rank() == 0) {
    printf(
        " Sum of all ranks 0..%d in this tiledPartition group is %d. Corresponding parent thread "
        "rank: %d\n",
        static_cast<int>(tiled_part.size()) - 1, output_sum, input);
    // The tile leader's block rank divided by the tile size is the tile index.
    result[input / (tile_size)] = output_sum;
  }
  return;
}
/* Shared host-side driver for the static and dynamic tile partition kernels.
 *
 * kernel_func    - kernel to launch cooperatively
 * tile_size      - number of threads per tile
 * params         - kernel argument array; the caller fills the first
 *                  num_params entries, this function appends the result
 *                  buffer, the memory-kind flag and the global scratch buffer
 * num_params     - number of entries of params already populated by the caller
 * use_global_mem - true: reduction scratch space is a global allocation,
 *                  false: dynamic shared memory sized for one int per thread
 *
 * Launches one block of 64 threads and checks each tile's reduced sum against
 * a host-computed expectation.
 */
template <typename F>
static void common_group_partition(F kernel_func, unsigned int tile_size, void** params,
                                   size_t num_params, bool use_global_mem) {
  int block_size = 1;
  int threads_per_blk = 64;
  int num_tiles = (block_size * threads_per_blk) / tile_size;

  // Expected per-tile sums, derived from differences of the triangular numbers
  // n(n-1)/2 at consecutive tile boundaries.
  // eg: parent group has 64threads.
  // child thread ranks: 0-15, 16-31, 32-47, 48-63
  // expected sum:       120,   376,   632,  888
  int* expected_sum = new int[num_tiles];
  int previous_prefix = 0;
  for (int tile = 0; tile < num_tiles; ++tile) {
    const unsigned int upper = tile_size * (tile + 1);
    const int prefix = ((upper - 1) * upper) / 2;
    expected_sum[tile] = prefix - previous_prefix;
    previous_prefix = prefix;
  }

  const size_t result_bytes = num_tiles * sizeof(int);

  int* result_dev = NULL;
  HIP_CHECK(hipMalloc((void**)&result_dev, result_bytes));

  int* global_mem = NULL;
  if (use_global_mem) {
    HIP_CHECK(hipMalloc((void**)&global_mem, threads_per_blk * sizeof(int)));
  }

  int* result_host = NULL;
  HIP_CHECK(hipHostMalloc(&result_host, result_bytes, hipHostMallocDefault));
  memset(result_host, 0, result_bytes);

  // Append the arguments common to both kernels after the caller's own.
  void** tail = params + num_params;
  tail[0] = &result_dev;
  tail[1] = &use_global_mem;
  tail[2] = &global_mem;

  // Dynamic shared memory is only needed when the scratch space is not global.
  const size_t shared_bytes = use_global_mem ? 0 : threads_per_blk * sizeof(int);

  // Launch Kernel
  HIP_CHECK(hipLaunchCooperativeKernel(kernel_func, block_size, threads_per_blk, params,
                                       shared_bytes, 0));
  HIP_CHECK(hipDeviceSynchronize());

  HIP_CHECK(hipMemcpy(result_host, result_dev, result_bytes, hipMemcpyDeviceToHost));

  verifyResults(expected_sum, result_host, num_tiles);

  // Free all allocated memory on host and device
  HIP_CHECK(hipFree(result_dev));
  HIP_CHECK(hipHostFree(result_host));
  if (use_global_mem) {
    HIP_CHECK(hipFree(global_mem));
  }
  delete[] expected_sum;
}
/* Runs the partition test with a tile size fixed at compile time.
 * The static kernel takes no arguments of its own, so all three argument
 * slots are filled in by common_group_partition().
 */
template <unsigned int tile_size> static void test_group_partition(bool use_global_mem) {
  void* kernel_args[3];
  const size_t args_already_set = 0;
  common_group_partition(kernel_cg_group_partition_static<tile_size>, tile_size, kernel_args,
                         args_already_set, use_global_mem);
}
/* Runs the partition test with a tile size chosen at run time.
 * The dynamic kernel receives the tile size as its first argument; the
 * remaining three slots are filled in by common_group_partition().
 */
static void test_group_partition(unsigned int tile_size, bool use_global_mem) {
  void* kernel_args[4];
  kernel_args[0] = &tile_size;
  const size_t args_already_set = 1;
  common_group_partition(kernel_cg_group_partition_dynamic, tile_size, kernel_args,
                         args_already_set, use_global_mem);
}
/* Exercises static and dynamic tiled partitions of a thread block for tile
 * sizes 2..32, once with global and once with shared scratch memory.
 * Skipped when the current device does not report cooperative launch support.
 */
TEST_CASE("Unit_hipCGThreadBlockTileType") {
  // The currently selected device is the one under test
  int current_device;
  HIP_CHECK(hipGetDevice(&current_device));

  hipDeviceProp_t props;
  HIP_CHECK(hipGetDeviceProperties(&props, current_device));

  if (!props.cooperativeLaunch) {
    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
    return;
  }

  bool use_global_mem = GENERATE(true, false);

  SECTION("Static tile partition") {
    test_group_partition<2>(use_global_mem);
    test_group_partition<4>(use_global_mem);
    test_group_partition<8>(use_global_mem);
    test_group_partition<16>(use_global_mem);
    test_group_partition<32>(use_global_mem);
  }

  SECTION("Dynamic tile partition") {
    unsigned int tile_size = GENERATE(2, 4, 8, 16, 32);
    test_group_partition(tile_size, use_global_mem);
  }
}
@@ -0,0 +1,606 @@
/*
Copyright (c) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
// Test Description:
/*The general idea of the application is to test how multi-GPU Cooperative
Groups kernel launches to a stream interact with other things that may be
simultaneously running in the same streams.
The HIP specification says that a multi-GPU cooperative launch will wait
until all of the streams it's using finish their work. Only then will the
cooperative kernel be launched to all of the devices. Then no other work
can take place in any of the streams until all of the multi-GPU
cooperative work is done.
However, there are flags that allow you to disable each of these
serialization points: hipCooperativeLaunchMultiDeviceNoPreSync and
hipCooperativeLaunchMultiDeviceNoPostSync.
As such, this benchmark tests the following five situations launching
to two GPUs (and thus two streams):
1. Normal multi-GPU cooperative kernel:
This should result in the following pattern:
Stream 0: Cooperative
Stream 1: Cooperative
2. Regular kernel launches and multi-GPU cooperative kernel launches
with the default flags, resulting in the following pattern:
Stream 0: Regular --> Cooperative
Stream 1: --> Cooperative --> Regular
3. Regular kernel launches and multi-GPU cooperative kernel launches
that turn off "pre-sync". This should allow a cooperative kernel
to launch even if work is already in a stream pointing to
another GPU.
This should result in the following pattern:
Stream 0: Regular --> Cooperative
Stream 1: Cooperative --> Regular
4. Regular kernel launches and multi-GPU cooperative kernel launches
that turn off "post-sync". This should allow a new kernel to enter
a GPU even if another GPU still has a cooperative kernel on it.
This should result in the following pattern:
Stream 0: Regular --> Cooperative
Stream 1: --> Cooperative--> Regular
5. Regular kernel launches and multi-GPU cooperative kernel launches
that turn off both pre- and post-sync. This should allow any of
the kernels to launch to their GPU regardless of the status of
other kernels in other multi-GPU stream groups.
This should result in the following pattern:
Stream 0: Regular --> Cooperative
Stream 1: Cooperative --> Regular
We time how long it takes to run each of these benchmarks and print it as
the output of the benchmark. The kernels themselves are just useless time-
wasting code so that the kernel takes a meaningful amount of time on the
GPU before it exits. We only launch a single wavefront for each kernel, so
any serialization should not be because of GPU occupancy concerns.
If tests 2, 3, and 4 take roughly 3x as long as #1, that implies that
cooperative kernels are serialized as expected.
If test #5 takes roughly twice as long as #1, that implies that the
overlap-allowing flags work as expected.
*/
#include <hip_test_common.hh>
#include <hip/hip_cooperative_groups.h>
namespace cg = cooperative_groups;
// Number of int elements in each device's input buffer.
static constexpr size_t kBufferLen = 1024 * 1024;
/* Multi-grid sum reduction.
 *
 * buf      - this grid's input values
 * buf_size - number of elements in buf
 * tmp_buf  - per-grid scratch, one slot per block
 * result   - result[1 + grid_rank] receives each grid's total; result[0]
 *            receives the sum over all grids
 *
 * Uses dynamic shared memory with one long per thread of the block.
 */
__global__ void test_gws(uint* buf, uint buf_size, long* tmp_buf, long* result) {
  extern __shared__ long tmp[];

  const uint num_blocks = gridDim.x;
  const uint block = blockIdx.x;
  const uint lane = threadIdx.x;
  const uint grid_threads = gridDim.x * blockDim.x;

  // Each thread accumulates the elements it owns, stepping by the grid width.
  long sum = 0;
  for (uint idx = block * blockDim.x + lane; idx < buf_size; idx += grid_threads) {
    sum += buf[idx];
  }
  tmp[lane] = sum;
  __syncthreads();

  // The first thread of each block folds the block's partial sums.
  if (lane == 0) {
    sum = 0;
    for (uint t = 0; t < blockDim.x; ++t) {
      sum += tmp[t];
    }
    tmp_buf[block] = sum;
  }

  // Wait until every block of this grid has published its total.
  cg::this_grid().sync();

  // Thread 0 of the grid still holds block 0's total in sum and adds the rest.
  if (((blockIdx.x * blockDim.x) + threadIdx.x) == 0) {
    for (uint b = 1; b < num_blocks; ++b) {
      sum += tmp_buf[b];
    }
    result[1 + cg::this_multi_grid().grid_rank()] = sum;
  }

  // Wait until every grid has published its total.
  cg::this_multi_grid().sync();

  if (cg::this_multi_grid().grid_rank() == 0) {
    sum = 0;
    for (uint g = 1; g <= cg::this_multi_grid().num_grids(); ++g) {
      sum += result[g];
    }
    *result = sum;
  }
}
/* Time-wasting multi-grid cooperative kernel based on clock64().
 *
 * loops    - number of busy-wait rounds
 * array    - per-thread output; the clock value is added after every round
 * fast_gpu - grid rank that returns immediately instead of spinning
 */
__global__ void test_coop_kernel(unsigned int loops, long long* array, int fast_gpu) {
  cg::multi_grid_group mgrid = cg::this_multi_grid();
  unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;

  // The grid designated as "fast" does no work at all.
  if (mgrid.grid_rank() == fast_gpu) {
    return;
  }

  for (int i = 0; i < loops; i++) {
    long long elapsed = 0;
    long long previous = clock64();
    do {
      const long long now = clock64();
      // If the counter rolls over, we don't know how much to add to catch up,
      // so those slipped cycles are simply not counted.
      elapsed += (now > previous) ? (now - previous) : 0;
      previous = now;
    } while (elapsed < 1000000);
    array[rank] += clock64();
  }
}
/* Variant of test_coop_kernel that spins on wall_clock64() instead of
 * clock64(). The body is only compiled when HT_AMD is set; otherwise the
 * kernel is empty.
 *
 * loops    - number of busy-wait rounds
 * array    - per-thread output; the clock value is added after every round
 * fast_gpu - grid rank that returns immediately instead of spinning
 */
__global__ void test_coop_kernel_gfx11(unsigned int loops, long long* array, int fast_gpu) {
#if HT_AMD
  cg::multi_grid_group mgrid = cg::this_multi_grid();
  unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;

  // The grid designated as "fast" does no work at all.
  if (mgrid.grid_rank() == fast_gpu) {
    return;
  }

  for (int i = 0; i < loops; i++) {
    long long elapsed = 0;
    long long previous = wall_clock64();
    do {
      const long long now = wall_clock64();
      // If the counter rolls over, we don't know how much to add to catch up,
      // so those slipped cycles are simply not counted.
      elapsed += (now > previous) ? (now - previous) : 0;
      previous = now;
    } while (elapsed < 1000000);
    array[rank] += wall_clock64();
  }
#endif
}
/* Time-wasting regular (non-cooperative) kernel based on clock64().
 *
 * loops - number of busy-wait rounds
 * array - per-thread output; the clock value is added after every round
 */
__global__ void test_kernel(uint32_t loops, unsigned long long* array) {
  unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;

  for (int i = 0; i < loops; i++) {
    long long elapsed = 0;
    long long previous = clock64();
    do {
      const long long now = clock64();
      // If the counter rolls over, we don't know how much to add to catch up,
      // so those slipped cycles are simply not counted.
      elapsed += (now > previous) ? (now - previous) : 0;
      previous = now;
    } while (elapsed < 1000000);
    array[rank] += clock64();
  }
}
/* Variant of test_kernel that spins on wall_clock64() instead of clock64().
 * The body is only compiled when HT_AMD is set; otherwise the kernel is empty.
 *
 * loops - number of busy-wait rounds
 * array - per-thread output; the clock value is added after every round
 */
__global__ void test_kernel_gfx11(uint32_t loops, unsigned long long* array) {
#if HT_AMD
  unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;

  for (int i = 0; i < loops; i++) {
    long long elapsed = 0;
    long long previous = wall_clock64();
    do {
      const long long now = wall_clock64();
      // If the counter rolls over, we don't know how much to add to catch up,
      // so those slipped cycles are simply not counted.
      elapsed += (now > previous) ? (now - previous) : 0;
      previous = now;
    } while (elapsed < 1000000);
    array[rank] += wall_clock64();
  }
#endif
}
/* Requires that multi_kernel_time lies within
 * [low_bound, high_bound] multiples of single_kernel_time.
 */
static void verify_time(double single_kernel_time, double multi_kernel_time, float low_bound,
                        float high_bound) {
  const double shortest_allowed = low_bound * single_kernel_time;
  const double longest_allowed = high_bound * single_kernel_time;

  REQUIRE(multi_kernel_time >= shortest_allowed);
  REQUIRE(multi_kernel_time <= longest_allowed);
}
void test_multigrid_streams(int device_num) {
uint32_t loops = 2000;
int32_t fast_gpu = -1;
// We will launch enough waves to fill up all of the GPU
int warp_sizes[2];
int num_sms[2];
hipDeviceProp_t device_properties[2];
int warp_size = INT_MAX;
int num_sm = INT_MAX;
for (int dev = 0; dev < (device_num - 1); ++dev) {
for (int i = 0; i < 2; i++) {
HIP_CHECK(hipGetDeviceProperties(&device_properties[i], (dev + i)));
warp_sizes[i] = device_properties[i].warpSize;
if (warp_sizes[i] < warp_size) {
warp_size = warp_sizes[i];
}
num_sms[i] = device_properties[i].multiProcessorCount;
if (num_sms[i] < num_sm) {
num_sm = num_sms[i];
}
}
// Calculate the device occupancy to know how many blocks can be run.
int max_blocks_per_sm_arr[2];
int max_blocks_per_sm = INT_MAX;
for (int i = 0; i < 2; i++) {
HIP_CHECK(hipSetDevice(dev + i));
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm_arr[i],
test_kernel_used, warp_size, 0));
if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
max_blocks_per_sm = max_blocks_per_sm_arr[i];
}
}
int desired_blocks = 1;
if (desired_blocks > max_blocks_per_sm * num_sm) {
INFO("The requested number of blocks will not fit on the GPU");
REQUIRE(desired_blocks < max_blocks_per_sm * num_sm);
return;
}
// Create the streams we will use in this test
hipStream_t streams[2];
for (int i = 0; i < 2; i++) {
HIP_CHECK(hipSetDevice(dev + i));
HIP_CHECK(hipStreamCreate(&streams[i]));
}
// Set up data to pass into the kernel
// Alocate the host input buffer, and two device-focused buffers that we
// will use for our test.
unsigned long long* dev_array[2];
for (int i = 0; i < 2; i++) {
int good_size = desired_blocks * warp_size * sizeof(long long);
HIP_CHECK(hipSetDevice(dev + i));
HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dev_array[i]), good_size));
HIP_CHECK(hipMemsetAsync(dev_array[i], 0, good_size, streams[i]));
}
for (int i = 0; i < 2; i++) {
HIP_CHECK(hipSetDevice(dev + i));
HIP_CHECK(hipDeviceSynchronize());
}
/* Launch the kernels ****************************************************/
void* dev_params[2][3];
hipLaunchParams md_params[2];
std::chrono::time_point<std::chrono::system_clock> start_time[2];
std::chrono::time_point<std::chrono::system_clock> end_time[2];
// Test 0: Launching a multi-GPU cooperative kernel
// Both GPUs launch a long cooperative kernel
INFO("GPU " << dev << ": Long Coop Kernel");
INFO("GPU " << (dev + 1) << ": Long Coop Kernel");
auto test_coop_kernel_used = IsGfx11() ? test_coop_kernel_gfx11 : test_coop_kernel;
for (int i = 0; i < 2; i++) {
dev_params[i][0] = reinterpret_cast<void*>(&loops);
dev_params[i][1] = reinterpret_cast<void*>(&dev_array[i]);
dev_params[i][2] = reinterpret_cast<void*>(&fast_gpu);
md_params[i].func = reinterpret_cast<void*>(test_coop_kernel_used);
md_params[i].gridDim = desired_blocks;
md_params[i].blockDim = warp_size;
md_params[i].sharedMem = 0;
md_params[i].stream = streams[i];
md_params[i].args = dev_params[i];
}
start_time[0] = std::chrono::system_clock::now();
HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
for (int i = 0; i < 2; i++) {
HIP_CHECK(hipSetDevice(dev + i));
HIP_CHECK(hipDeviceSynchronize());
}
end_time[0] = std::chrono::system_clock::now();
std::chrono::duration<double> single_kernel_time = (end_time[0] - start_time[0]);
INFO("A single kernel on both GPUs took: " << single_kernel_time.count() << " seconds");
SECTION("GPU1 - Standard/ Long Coop, GPU2 - Coop/Standard") {
INFO("GPU " << dev << ": Standard/Long Coop");
INFO("GPU " << (dev + 1) << ": Coop/Standard");
fast_gpu = 1;
start_time[1] = std::chrono::system_clock::now();
HIP_CHECK(hipSetDevice(dev));
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0],
loops, dev_array[0]);
HIP_CHECK(hipGetLastError());
HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
HIP_CHECK(hipSetDevice(dev + 1));
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1],
loops, dev_array[1]);
HIP_CHECK(hipGetLastError());
for (int i = 0; i < 2; i++) {
HIP_CHECK(hipSetDevice(dev + i));
HIP_CHECK(hipDeviceSynchronize());
}
end_time[1] = std::chrono::system_clock::now();
std::chrono::duration<double> serialized_gpu0_time = (end_time[1] - start_time[1]);
INFO("Serialized set of three kernels with GPU0 being long took: "
<< serialized_gpu0_time.count() << " seconds");
verify_time(single_kernel_time.count(), serialized_gpu0_time.count(), 2.7f, 3.3f);
}
SECTION("GPU1 - Standard/Coop, GPU2 - Long Coop/Standard") {
INFO("GPU " << dev << ": Standard/Coop");
INFO("GPU " << (dev + 1) << ": Long Coop/Standard");
fast_gpu = 0;
start_time[1] = std::chrono::system_clock::now();
HIP_CHECK(hipSetDevice(dev));
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0],
loops, dev_array[0]);
HIP_CHECK(hipGetLastError());
HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0));
HIP_CHECK(hipSetDevice(dev + 1));
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1],
loops, dev_array[1]);
HIP_CHECK(hipGetLastError());
for (int i = 0; i < 2; i++) {
HIP_CHECK(hipSetDevice(dev + i));
HIP_CHECK(hipDeviceSynchronize());
}
end_time[1] = std::chrono::system_clock::now();
std::chrono::duration<double> serialized_gpu1_time = (end_time[1] - start_time[1]);
INFO("Serialized set of three kernels with GPU1 being long took: "
<< serialized_gpu1_time.count() << " seconds");
verify_time(single_kernel_time.count(), serialized_gpu1_time.count(), 2.7f, 3.3f);
}
SECTION(
"GPU1 - Standard/Coop, GPU2 - Long Coop/Standard - regular and coop kernel overlap at "
"beginning") {
INFO("GPU " << dev << ": Standard/Coop with multi device no pre sync");
INFO("GPU " << (dev + 1) << ": Long Coop/Standard with multi device no pre sync");
fast_gpu = 0;
start_time[1] = std::chrono::system_clock::now();
HIP_CHECK(hipSetDevice(dev));
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0],
loops, dev_array[0]);
HIP_CHECK(hipGetLastError());
HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2,
hipCooperativeLaunchMultiDeviceNoPreSync));
HIP_CHECK(hipSetDevice(dev + 1));
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1],
loops, dev_array[1]);
HIP_CHECK(hipGetLastError());
for (int i = 0; i < 2; i++) {
HIP_CHECK(hipSetDevice(dev + i));
HIP_CHECK(hipDeviceSynchronize());
}
end_time[1] = std::chrono::system_clock::now();
std::chrono::duration<double> pre_overlapped_time = (end_time[1] - start_time[1]);
INFO("Multiple kernels with pre-overlap allowed took: " << pre_overlapped_time.count()
<< " seconds");
verify_time(single_kernel_time.count(), pre_overlapped_time.count(), 1.7f, 2.3f);
}
SECTION(
"GPU1 - Standard/Long Coop, GPU2 - Coop/Standard - regular and coop kernel overlap at "
"end") {
INFO("GPU " << dev << ": Standard/Long Coop with multi device no post sync");
INFO("GPU " << (dev + 1) << ": Coop/Standard with multi device no post sync");
fast_gpu = 1;
start_time[1] = std::chrono::system_clock::now();
HIP_CHECK(hipSetDevice(dev));
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0],
loops, dev_array[0]);
HIP_CHECK(hipGetLastError());
HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2,
hipCooperativeLaunchMultiDeviceNoPostSync));
HIP_CHECK(hipSetDevice(dev + 1));
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1],
loops, dev_array[1]);
for (int i = 0; i < 2; i++) {
HIP_CHECK(hipSetDevice(dev + i));
HIP_CHECK(hipDeviceSynchronize());
}
end_time[1] = std::chrono::system_clock::now();
std::chrono::duration<double> post_overlapped_time = (end_time[1] - start_time[1]);
INFO("Multiple kernels with post-overlap allowed took: " << post_overlapped_time.count()
<< " seconds");
verify_time(single_kernel_time.count(), post_overlapped_time.count(), 1.7f, 2.3f);
}
SECTION(
"GPU1 - Standard/Long Coop, GPU2 - Long Coop/Standard - regular and coop kernel overlap") {
INFO("GPU " << dev << ": Standard/Long Coop with multi device no pre or post sync");
INFO("GPU " << (dev + 1) << ": Long Coop/Standard with multi device no pre or post sync");
start_time[1] = std::chrono::system_clock::now();
HIP_CHECK(hipSetDevice(dev));
auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0],
loops, dev_array[0]);
HIP_CHECK(hipGetLastError());
HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(
md_params, 2,
hipCooperativeLaunchMultiDeviceNoPreSync | hipCooperativeLaunchMultiDeviceNoPostSync));
HIP_CHECK(hipSetDevice(dev + 1));
test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1],
loops, dev_array[1]);
HIP_CHECK(hipGetLastError());
for (int i = 0; i < 2; i++) {
HIP_CHECK(hipSetDevice(dev + i));
HIP_CHECK(hipDeviceSynchronize());
}
end_time[1] = std::chrono::system_clock::now();
std::chrono::duration<double> overlapped_time = (end_time[1] - start_time[1]);
INFO("Multiple kernels with overlap allowed took: " << overlapped_time.count() << " seconds");
verify_time(single_kernel_time.count(), overlapped_time.count(), 1.8f, 2.2f);
}
for (int k = 0; k < 2; ++k) {
HIP_CHECK(hipFree(dev_array[k]));
HIP_CHECK(hipStreamDestroy(streams[k]));
}
}
}
/* Launches test_gws cooperatively on every device and checks that the
 * combined result equals the sum 0 + 1 + ... + (N - 1), where N is the total
 * number of elements spread over all devices.
 * Skipped when any device lacks multi-device cooperative launch support.
 */
TEST_CASE("Unit_hipLaunchCooperativeKernelMultiDevice_Basic") {
  constexpr uint num_kernel_args = 4;

  int device_num = 0;
  HIP_CHECK(hipGetDeviceCount(&device_num));

  // Query every device's own properties and bail out before anything is
  // allocated, so the skip path has nothing to clean up.
  std::vector<hipDeviceProp_t> device_properties(device_num);
  for (int i = 0; i < device_num; i++) {
    HIP_CHECK(hipGetDeviceProperties(&device_properties[i], i));
    if (!device_properties[i].cooperativeMultiDeviceLaunch) {
      HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
      return;
    }
  }

  // Host input: consecutive integers, one kBufferLen-sized slice per device.
  size_t buffer_size = kBufferLen * sizeof(int);
  std::vector<int> A_h(kBufferLen * device_num);
  for (size_t i = 0; i < A_h.size(); ++i) {
    A_h[i] = static_cast<int>(i);
  }

  std::vector<int*> A_d(device_num);
  std::vector<long*> B_d(device_num);
  long* C_d = nullptr;
  std::vector<hipStream_t> stream(device_num);

  for (int i = 0; i < device_num; i++) {
    HIP_CHECK(hipSetDevice(i));

    HIP_CHECK(hipMalloc(&A_d[i], buffer_size));
    HIP_CHECK(hipMemcpy(A_d[i], &A_h[i * kBufferLen], buffer_size, hipMemcpyHostToDevice));
    if (i == 0) {
      // Host-pinned result buffer shared by all grids: slot 0 holds the final
      // sum, slots 1..device_num hold the per-grid totals.
      HIP_CHECK(hipHostMalloc(&C_d, (device_num + 1) * sizeof(long)));
    }
    HIP_CHECK(hipStreamCreate(&stream[i]));
    HIP_CHECK(hipDeviceSynchronize());
  }

  dim3 dimBlock;
  dim3 dimGrid;
  dimGrid.x = 1;
  dimGrid.y = 1;
  dimGrid.z = 1;
  dimBlock.x = 64;
  dimBlock.y = 1;
  dimBlock.z = 1;

  int num_blocks = 0;
  uint workgroup = GENERATE(64, 128, 256);

  // test_gws takes buf_size as a 32-bit uint, so hand it a variable of exactly
  // that type instead of the address of the 64-bit kBufferLen.
  uint buffer_len = static_cast<uint>(kBufferLen);

  std::vector<hipLaunchParams> launch_params_list(device_num);
  std::vector<void*> args(device_num * num_kernel_args);

  for (int i = 0; i < device_num; i++) {
    HIP_CHECK(hipSetDevice(i));

    dimBlock.x = workgroup;
    // Calculate the device occupancy to know how many blocks can be run concurrently
    HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
        &num_blocks, test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long)));

    INFO("GPU" << i << " has block size = " << dimBlock.x << " and num blocks per CU " << num_blocks
               << "\n");

    dimGrid.x = device_properties[i].multiProcessorCount * std::min(num_blocks, 32);

    // One scratch slot per block of this device's grid.
    HIP_CHECK(hipMalloc(&B_d[i], dimGrid.x * sizeof(long)));

    args[i * num_kernel_args] = (void*)&A_d[i];
    args[i * num_kernel_args + 1] = (void*)&buffer_len;
    args[i * num_kernel_args + 2] = (void*)&B_d[i];
    args[i * num_kernel_args + 3] = (void*)&C_d;

    launch_params_list[i].func = reinterpret_cast<void*>(test_gws);
    launch_params_list[i].gridDim = dimGrid;
    launch_params_list[i].blockDim = dimBlock;
    launch_params_list[i].sharedMem = dimBlock.x * sizeof(long);
    launch_params_list[i].stream = stream[i];
    launch_params_list[i].args = &args[i * num_kernel_args];
  }

  HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(launch_params_list.data(), device_num, 0));
  for (int i = 0; i < device_num; i++) {
    HIP_CHECK(hipStreamSynchronize(stream[i]));
  }

  size_t processed_Dwords = kBufferLen * device_num;
  REQUIRE(*C_d == (((long)(processed_Dwords) * (processed_Dwords - 1)) / 2));

  HIP_CHECK(hipSetDevice(0));
  HIP_CHECK(hipHostFree(C_d));
  for (int i = 0; i < device_num; i++) {
    HIP_CHECK(hipSetDevice(i));
    HIP_CHECK(hipFree(A_d[i]));
    HIP_CHECK(hipFree(B_d[i]));
    HIP_CHECK(hipStreamDestroy(stream[i]));
  }
}
/* Runs the stream-interaction checks of test_multigrid_streams().
 * Skipped when fewer than two devices are present or when any device lacks
 * multi-device cooperative launch support.
 */
TEST_CASE("Unit_hipLaunchCooperativeKernelMultiDevice_Streams") {
  int device_num = 0;
  HIP_CHECK(hipGetDeviceCount(&device_num));

  if (device_num < 2) {
    HipTest::HIP_SKIP_TEST("Skipping because devices < 2");
    return;
  }

  // Every device must support multi-device cooperative launches.
  for (int dev = 0; dev < device_num; ++dev) {
    hipDeviceProp_t props;
    HIP_CHECK(hipGetDeviceProperties(&props, dev));
    if (!props.cooperativeMultiDeviceLaunch) {
      HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
      return;
    }
  }

  test_multigrid_streams(device_num);
}
@@ -0,0 +1,364 @@
/*
Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip_test_common.hh>
#include <hip/hip_cooperative_groups.h>
namespace cg = cooperative_groups;
// NOTE(review): presumably the element count of the test_gws input buffer;
// its use lies outside the visible part of this file - confirm.
static constexpr size_t kBufferLen = 1024 * 1024;
/* Grid-wide sum reduction using a cooperative grid sync.
 *
 * buf      - input values
 * buf_size - number of elements in buf
 * tmp_buf  - scratch, one slot per block
 * result   - receives the sum of all elements of buf
 *
 * Uses dynamic shared memory with one long per thread of the block.
 */
__global__ void test_gws(int* buf, size_t buf_size, long* tmp_buf, long* result) {
  extern __shared__ long tmp[];

  const uint first = blockIdx.x * blockDim.x + threadIdx.x;
  const uint grid_threads = gridDim.x * blockDim.x;
  cg::grid_group grid = cg::this_grid();

  // Each thread accumulates the elements it owns, stepping by the grid width.
  long sum = 0;
  uint idx = first;
  while (idx < buf_size) {
    sum += buf[idx];
    idx += grid_threads;
  }
  tmp[threadIdx.x] = sum;
  __syncthreads();

  // The first thread of each block folds the block's partial sums.
  if (threadIdx.x == 0) {
    sum = 0;
    for (uint t = 0; t < blockDim.x; ++t) {
      sum += tmp[t];
    }
    tmp_buf[blockIdx.x] = sum;
  }

  // Wait until every block has published its total.
  grid.sync();

  // Thread 0 of the grid still holds block 0's total in sum and adds the rest.
  if (first == 0) {
    for (uint b = 1; b < gridDim.x; ++b) {
      sum += tmp_buf[b];
    }
    *result = sum;
  }
}
/* Time-wasting kernel based on clock64().
 *
 * loops      - number of busy-wait rounds
 * array      - per-thread output; the clock value is added after every round
 * totalTicks - number of clock ticks each round spins for
 */
__global__ void test_kernel(uint32_t loops, unsigned long long* array, long long totalTicks) {
  cg::thread_block block = cg::this_thread_block();
  unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;

  for (int i = 0; i < loops; i++) {
    long long elapsed = 0;
    long long previous = clock64();
    do {
      const long long now = clock64();
      // If the counter rolls over, we don't know how much to add to catch up,
      // so those slipped cycles are simply not counted.
      elapsed += (now > previous) ? (now - previous) : 0;
      previous = now;
    } while (elapsed < totalTicks);
    // All threads of the block finish the round together.
    block.sync();
    array[rank] += clock64();
  }
}
/* Variant of test_kernel that spins on wall_clock64() instead of clock64().
 * The body is only compiled when HT_AMD is set; otherwise the kernel is empty.
 *
 * loops      - number of busy-wait rounds
 * array      - per-thread output; the clock value is added after every round
 * totalTicks - number of clock ticks each round spins for
 */
__global__ void test_kernel_gfx11(uint32_t loops, unsigned long long* array, long long totalTicks) {
#if HT_AMD
  cg::thread_block block = cg::this_thread_block();
  unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x;

  for (int i = 0; i < loops; i++) {
    long long elapsed = 0;
    long long previous = wall_clock64();
    do {
      const long long now = wall_clock64();
      // If the counter rolls over, we don't know how much to add to catch up,
      // so those slipped cycles are simply not counted.
      elapsed += (now > previous) ? (now - previous) : 0;
      previous = now;
    } while (elapsed < totalTicks);
    // All threads of the block finish the round together.
    block.sync();
    array[rank] += wall_clock64();
  }
#endif
}
/* Timing expectations for test variant 0 (see verify()).
 * On AMD builds two cooperative kernels must take about twice as long as one;
 * on other builds about as long as one. Three kernels may take at most 1.1x
 * the time of two.
 */
template <typename T>
static void verifyLeastCapacity(T& single_kernel_time, T& double_kernel_time,
                                T& triple_kernel_time) {
  const auto one = single_kernel_time.count();
  const auto two = double_kernel_time.count();
  const auto three = triple_kernel_time.count();

#if HT_AMD
  // hipLaunchCooperativeKernel() follows serialization policy on AMD devices,
  // so two cooperative kernels take roughly twice as long as one.
  REQUIRE(two >= 1.8 * one);
  REQUIRE(two <= 2.2 * one);
#else
  // hipLaunchCooperativeKernel() doesn't follow serialization policy on NV
  // devices, so two cooperative kernels take roughly as long as one.
  REQUIRE(two >= 0.8 * one);
  REQUIRE(two <= 1.2 * one);
#endif

  // The third kernel must add almost nothing on top of the two cooperative ones.
  REQUIRE(three <= 1.1 * two);
}
// Timing checks for the scenario selected by verify() with tests == 1.
// T must provide count(). The two-kernel run must take 1.8x-2.2x the single-kernel time
// and the three-kernel run at most 1.1x the two-kernel time.
template <typename T>
static void verifyHalfCapacity(T& single_kernel_time, T& double_kernel_time,
T& triple_kernel_time) {
// Test that the two cooperative kernels took roughly twice as long as the one
REQUIRE(double_kernel_time.count() >= 1.8 * single_kernel_time.count());
REQUIRE(double_kernel_time.count() <= 2.2 * single_kernel_time.count());
// Test that the three kernels together took roughly as long as the two
// cooperative kernels.
REQUIRE(triple_kernel_time.count() <= 1.1 * double_kernel_time.count());
}
// Timing checks for the scenario selected by verify() with tests == 2.
// T must provide count(). The two-kernel run must take 1.8x-2.2x the single-kernel time.
// The bound on the three-kernel run is looser here: at most 1.7x the two-kernel time.
template <typename T>
static void verifyFullCapacity(T& single_kernel_time, T& double_kernel_time,
T& triple_kernel_time) {
// Test that the two cooperative kernels took roughly twice as long as the one
REQUIRE(double_kernel_time.count() >= 1.8 * single_kernel_time.count());
REQUIRE(double_kernel_time.count() <= 2.2 * single_kernel_time.count());
// Test that the three kernels together took roughly 1.6 times as long as the two
// cooperative kernels. If the first 2 kernels run very fast, the third
// won't share much time with the second kernel.
REQUIRE(triple_kernel_time.count() <= 1.7 * double_kernel_time.count());
}
// Dispatches the measured durations to the checker that belongs to the scenario id:
// 0 -> verifyLeastCapacity, 1 -> verifyHalfCapacity, 2 -> verifyFullCapacity.
// Any other id performs no verification.
template <typename T>
static void verify(int tests, T& single_kernel_time, T& double_kernel_time, T& triple_kernel_time) {
  if (tests == 0) {
    verifyLeastCapacity(single_kernel_time, double_kernel_time, triple_kernel_time);
  } else if (tests == 1) {
    verifyHalfCapacity(single_kernel_time, double_kernel_time, triple_kernel_time);
  } else if (tests == 2) {
    verifyFullCapacity(single_kernel_time, double_kernel_time, triple_kernel_time);
  }
}
/**
 * Measures how long one cooperative kernel, two cooperative kernels on different streams,
 * and two cooperative kernels plus one regular kernel take on device `dev`, then checks the
 * timing relations for the selected scenario through verify().
 *
 * @param dev      index of the device to run on; the function returns early (after printing
 *                 a message) when the device reports no cooperative launch support.
 * @param p_tests  scenario id: 0 = one block per kernel, 1 = roughly half of the device's
 *                 active-block capacity per cooperative kernel, 2 = full capacity.
 */
static void test_cooperative_streams(int dev, int p_tests) {
  hipStream_t streams[3];
  unsigned long long* dev_array[3];
  int loops = 1000;
  HIP_CHECK(hipSetDevice(dev));
  hipDeviceProp_t device_properties;
  HIP_CHECK(hipGetDeviceProperties(&device_properties, dev));
  // Test whether target device supports cooperative groups
  if (device_properties.cooperativeLaunch == 0) {
    std::cout << "Cooperative group support not available in device " << dev << std::endl;
    return;
  }

  // We will launch enough waves to fill up all of the GPU
  int warp_size = device_properties.warpSize;
  int num_sms = device_properties.multiProcessorCount;
  long long totalTicks = device_properties.clockRate;
  int max_blocks_per_sm = 0;
  // Calculate the device occupancy to know how many blocks can be run.
  auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
  HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, test_kernel_used,
                                                         warp_size, 0));
  int max_active_blocks = max_blocks_per_sm * num_sms;
  int coop_blocks = 0;
  int reg_blocks = 0;
  switch (p_tests) {
    case 0:
      // 1 block
      coop_blocks = 1;
      reg_blocks = 1;
      break;
    case 1:
      // Half capacity
      // To make sure the second kernel launched by hipLaunchCooperativeKernel
      // is invoked after the first kernel finished
      coop_blocks = max_active_blocks / 2 + 1;
      // To make sure the third kernel launched by hipLaunchKernelGGL is invoked
      // concurrently with the second kernel
      reg_blocks = max_active_blocks - coop_blocks;
      break;
    case 2:
      // Full capacity
      coop_blocks = max_active_blocks;
      reg_blocks = max_active_blocks;
      break;
    default:
      break;
  }

  for (int i = 0; i < 3; i++) {
    HIP_CHECK(hipStreamCreate(&streams[i]));
  }

  // Set up data to pass into the kernel.
  // Every thread writes array[global rank], so each buffer needs one element per thread of
  // the largest grid launched below (not just one block's worth of threads).
  const int largest_grid = std::max(max_active_blocks, std::max(coop_blocks, reg_blocks));
  const size_t array_bytes =
      static_cast<size_t>(largest_grid) * warp_size * sizeof(unsigned long long);
  for (int i = 0; i < 3; i++) {
    HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&dev_array[i]), array_bytes));
    HIP_CHECK(hipMemsetAsync(dev_array[i], 0, array_bytes, streams[i]));
  }
  HIP_CHECK(hipDeviceSynchronize());

  // Launch the kernels
  void* coop_params[3][3];
  for (int i = 0; i < 3; i++) {
    coop_params[i][0] = reinterpret_cast<void*>(&loops);
    coop_params[i][1] = reinterpret_cast<void*>(&dev_array[i]);
    coop_params[i][2] = reinterpret_cast<void*>(&totalTicks);
  }

  // Exclude the initial launch from the measurements, as it needs time to load the code
  // object.
  HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used), max_active_blocks,
                                       warp_size, coop_params[0], 0, streams[0]));
  HIP_CHECK(hipDeviceSynchronize());

  // Intervals are measured with the monotonic clock so that wall-clock adjustments cannot
  // distort them.
  using timer = std::chrono::steady_clock;

  // Launching a single cooperative kernel
  auto single_start = timer::now();
  HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used), max_active_blocks,
                                       warp_size, coop_params[0], 0, streams[0]));
  HIP_CHECK(hipDeviceSynchronize());
  auto single_end = timer::now();
  std::chrono::duration<double> single_kernel_time = (single_end - single_start);

  // Launching 2 cooperative kernels to different streams
  auto double_start = timer::now();
  HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used), coop_blocks,
                                       warp_size, coop_params[0], 0, streams[0]));
  HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used), coop_blocks,
                                       warp_size, coop_params[1], 0, streams[1]));
  HIP_CHECK(hipDeviceSynchronize());
  auto double_end = timer::now();
  std::chrono::duration<double> double_kernel_time = (double_end - double_start);

  // Launching 2 cooperative kernels and 1 normal kernel
  auto triple_start = timer::now();
  HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used), coop_blocks,
                                       warp_size, coop_params[0], 0, streams[0]));
  HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_kernel_used), coop_blocks,
                                       warp_size, coop_params[1], 0, streams[1]));
  hipLaunchKernelGGL(test_kernel_used, dim3(reg_blocks), dim3(warp_size), 0, streams[2], loops,
                     dev_array[2], totalTicks);
  HIP_CHECK(hipDeviceSynchronize());
  auto triple_end = timer::now();
  std::chrono::duration<double> triple_kernel_time = (triple_end - triple_start);

  for (int k = 0; k < 3; ++k) {
    HIP_CHECK(hipFree(dev_array[k]));
    HIP_CHECK(hipStreamDestroy(streams[k]));
  }

  INFO("A single kernel took : " << single_kernel_time.count() << " seconds");
  INFO("Two cooperative kernels took: " << double_kernel_time.count() << " seconds");
  INFO("Two coop kernels and a third regular kernel took: " << triple_kernel_time.count()
                                                            << " seconds");
  verify(p_tests, single_kernel_time, double_kernel_time, triple_kernel_time);
}
/**
 * Launches the test_gws kernel cooperatively on the current device for block sizes
 * 32, 64, 128 and 256 and checks that the value written to C_d equals the sum
 * 0 + 1 + ... + (kBufferLen - 1) of the input buffer.
 * The test is skipped when the device does not support cooperative launch.
 */
TEST_CASE("Unit_hipLaunchCooperativeKernel_Basic") {
  // Use default device for validating the test
  int device;
  int *A_h, *A_d;
  long* B_d;
  long* C_d;
  hipDeviceProp_t device_properties;
  HIP_CHECK(hipGetDevice(&device));
  HIP_CHECK(hipGetDeviceProperties(&device_properties, device));
  if (!device_properties.cooperativeLaunch) {
    HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!");
    return;
  }

  // Host input: A_h[i] == i
  size_t buffer_size = kBufferLen * sizeof(int);
  A_h = reinterpret_cast<int*>(malloc(buffer_size));
  // Fail the test instead of dereferencing a null pointer if the allocation failed.
  REQUIRE((A_h != nullptr));
  for (uint32_t i = 0; i < kBufferLen; ++i) {
    A_h[i] = static_cast<int>(i);
  }
  HIP_CHECK(hipMalloc(&A_d, buffer_size));
  HIP_CHECK(hipMemcpy(A_d, A_h, buffer_size, hipMemcpyHostToDevice));
  HIP_CHECK(hipHostMalloc(&C_d, sizeof(long)));
  hipStream_t stream;
  HIP_CHECK(hipStreamCreate(&stream));

  dim3 dimBlock = dim3(1);
  dim3 dimGrid = dim3(1);
  int numBlocks = 0;
  uint32_t workgroup = GENERATE(32, 64, 128, 256);
  dimBlock.x = workgroup;
  // Calculate the device occupancy to know how many blocks can be run concurrently
  HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
      &numBlocks, test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long)));
  dimGrid.x = device_properties.multiProcessorCount * std::min(numBlocks, 32);
  HIP_CHECK(hipMalloc(&B_d, dimGrid.x * sizeof(long)));

  // Kernel arguments are passed as an array of pointers to each argument. The launch API
  // takes void*, so constness of kBufferLen (if any) is removed explicitly here.
  void* params[4];
  params[0] = static_cast<void*>(&A_d);
  params[1] = const_cast<void*>(static_cast<const void*>(&kBufferLen));
  params[2] = static_cast<void*>(&B_d);
  params[3] = static_cast<void*>(&C_d);

  INFO("Testing with grid size = " << dimGrid.x << " and block size = " << dimBlock.x << "\n");
  HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(test_gws), dimGrid, dimBlock, params,
                                       dimBlock.x * sizeof(long), stream));
  HIP_CHECK(hipStreamSynchronize(stream));
  REQUIRE(((unsigned long long)*C_d) == (((unsigned long long)(kBufferLen) * (kBufferLen - 1)) / 2));

  HIP_CHECK(hipStreamDestroy(stream));
  HIP_CHECK(hipHostFree(C_d));
  HIP_CHECK(hipFree(B_d));
  HIP_CHECK(hipFree(A_d));
  free(A_h);
}
// Runs test_cooperative_streams() for every device index in
// [0, HipTest::getDeviceCount()) combined with each scenario id 0, 1 and 2.
TEST_CASE("Unit_hipLaunchCooperativeKernel_Streams") {
const auto device = GENERATE(range(0, HipTest::getDeviceCount()));
// Scenario id forwarded to test_cooperative_streams().
int p_tests = GENERATE(0, 1, 2);
test_cooperative_streams(device, p_tests);
}
+68
파일 보기
@@ -0,0 +1,68 @@
/*
Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include <hip_test_common.hh>
#include <hip/hip_cooperative_groups.h>
// Comparison assertion helpers that forward to the HIP test assertion macros.
// Both operands are parenthesized so that arguments containing operators of lower
// precedence than the comparison (for example `a & b`) are compared as a whole.
#define ASSERT_EQUAL(lhs, rhs) HIP_ASSERT((lhs) == (rhs))
#define ASSERT_LE(lhs, rhs) HIPASSERT((lhs) <= (rhs))
#define ASSERT_GE(lhs, rhs) HIPASSERT((lhs) >= (rhs))
// NOTE(review): presumably the upper bound on the number of GPUs the multi-device
// cooperative-group tests handle - confirm against the users of this constant.
constexpr int MaxGPUs = 8;
// Writes the first `size` elements of `ptr` to std::cout, each followed by a single
// space, and finishes the line with a newline character.
template <typename T>
void printResults(T* ptr, int size) {
  int idx = 0;
  while (idx < size) {
    std::cout << ptr[idx] << " ";
    ++idx;
  }
  std::cout << '\n';
}
// Element-wise comparison of two buffers; the first mismatch fails the test and reports
// its index. Note that `size` is divided by sizeof(T) to obtain the number of elements
// compared, i.e. it is treated as a byte count.
template <typename T>
void compareResults(T* cpu, T* gpu, int size) {
  const auto element_count = size / sizeof(T);
  for (unsigned int i = 0; i < element_count; i++) {
    if (cpu[i] == gpu[i]) {
      continue;
    }
    INFO("Results do not match at index " << i);
    REQUIRE(cpu[i] == gpu[i]);
  }
}
// Checks that every one of the first `size` values of hPtr occurs somewhere in the first
// `size` values of dPtr; the test fails on the first value that cannot be found.
template <typename T>
void verifyResults(T* hPtr, T* dPtr, int size) {
  for (int i = 0; i < size; ++i) {
    // Linear search: j stops at the first match, or reaches `size` when there is none.
    int j = 0;
    while (j < size && !(hPtr[i] == dPtr[j])) {
      ++j;
    }
    if (j == size) {
      INFO("Result verification failed!");
      REQUIRE(j != size);
    }
  }
}
+30
파일 보기
@@ -0,0 +1,30 @@
# Sources of the OpenGL interop test executable.
set(TEST_SRC
hipGLGetDevices.cc
hipGraphicsGLRegisterBuffer.cc
hipGraphicsGLRegisterImage.cc
hipGraphicsMapResources.cc
hipGraphicsSubResourceGetMappedArray.cc
hipGraphicsResourceGetMappedPointer.cc
hipGraphicsUnmapResources.cc
hipGraphicsUnregisterResource.cc
)
# The tests need OpenGL (with EGL) and GLUT; when either package is missing the rest of
# this file is skipped and the GLInteropTest target is not created.
find_package(OpenGL COMPONENTS OpenGL EGL)
message(STATUS "OpenGL_FOUND: ${OpenGL_FOUND}")
if(NOT OpenGL_FOUND)
message(STATUS "OpenGL not found, OpenGL interop tests not enabled.")
return()
endif()
find_package(GLUT)
message(STATUS "GLUT_FOUND: ${GLUT_FOUND}")
if(NOT GLUT_FOUND)
message(STATUS "GLUT not found, OpenGL interop tests not enabled.")
return()
endif()
# Built with C++17 and attached to the build_tests target.
hip_add_exe_to_target(NAME GLInteropTest
TEST_SRC ${TEST_SRC}
TEST_TARGET_NAME build_tests
COMPILE_OPTIONS -std=c++17)
target_link_libraries(GLInteropTest OpenGL::GL OpenGL::EGL GLUT::GLUT)
+219
파일 보기
@@ -0,0 +1,219 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include <variant>
#define GL_GLEXT_PROTOTYPES
#include <GL/freeglut.h>
#include <GL/freeglut_ext.h>
#include <EGL/egl.h>
#include <EGL/eglext.h>
#include <hip_test_common.hh>
/**
 * Owns one OpenGL buffer object of kSize bytes.
 *
 * The constructor generates the buffer, allocates its storage through the GL_ARRAY_BUFFER
 * binding point with GL_DYNAMIC_DRAW usage and no initial data, and requires that no GL
 * error was raised. The destructor deletes the buffer. The object converts implicitly to
 * the underlying GLuint name.
 *
 * Copying and moving are disabled: a copy would hold the same buffer name and delete it a
 * second time.
 */
class GLBufferObject {
 public:
  static constexpr size_t kSize = 512 * 512 * 4 * sizeof(float);

  GLBufferObject() {
    glGenBuffers(1, &vbo_);
    glBindBuffer(GL_ARRAY_BUFFER, vbo_);
    glBufferData(GL_ARRAY_BUFFER, kSize, nullptr, GL_DYNAMIC_DRAW);
    glBindBuffer(GL_ARRAY_BUFFER, 0);
    REQUIRE(glGetError() == GL_NO_ERROR);
  }

  ~GLBufferObject() { glDeleteBuffers(1, &vbo_); }

  GLBufferObject(const GLBufferObject&) = delete;
  GLBufferObject& operator=(const GLBufferObject&) = delete;
  GLBufferObject(GLBufferObject&&) = delete;
  GLBufferObject& operator=(GLBufferObject&&) = delete;

  operator GLuint() const { return vbo_; }

 private:
  GLuint vbo_ = 0;
};
/**
 * Owns one OpenGL 2D texture of kWidth x kHeight texels.
 *
 * The constructor generates the texture, sets clamp-to-edge wrapping and nearest
 * filtering, allocates GL_RGBA8UI_EXT storage without initial data and requires that no GL
 * error was raised. The destructor deletes the texture. The object converts implicitly to
 * the underlying GLuint name.
 *
 * Copying and moving are disabled: a copy would hold the same texture name and delete it a
 * second time.
 */
class GLImageObject {
 public:
  static constexpr size_t kWidth = 512, kHeight = 512;

  GLImageObject() {
    glGenTextures(1, &tex_);
    glBindTexture(GL_TEXTURE_2D, tex_);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8UI_EXT, kWidth, kHeight, 0, GL_RGBA_INTEGER_EXT,
                 GL_UNSIGNED_BYTE, nullptr);
    REQUIRE(glGetError() == GL_NO_ERROR);
  }

  ~GLImageObject() { glDeleteTextures(1, &tex_); }

  GLImageObject(const GLImageObject&) = delete;
  GLImageObject& operator=(const GLImageObject&) = delete;
  GLImageObject(GLImageObject&&) = delete;
  GLImageObject& operator=(GLImageObject&&) = delete;

  operator GLuint() const { return tex_; }

 private:
  GLuint tex_ = 0;
};
// Guards the one-time GLUT initialization. Declared `inline` (not `static`) so that all
// translation units including this header share one flag; with a per-translation-unit
// flag, tests from different source files linked into the same executable would each run
// glutInit() again.
inline std::once_flag glut_init_flag;

/**
 * Scope guard that provides a GLUT window for the lifetime of the object.
 *
 * The first construction in the process initializes GLUT (display mode and a 512x512
 * initial window size); every construction creates a window and the destructor destroys
 * it. The guard is neither copyable nor movable.
 */
class GLUTContextScopeGuard {
 public:
  GLUTContextScopeGuard() {
    std::call_once(glut_init_flag, &GLUTContextScopeGuard::init);
    glut_window_ = glutCreateWindow("");
  }

  ~GLUTContextScopeGuard() { glutDestroyWindow(glut_window_); }

  GLUTContextScopeGuard(const GLUTContextScopeGuard&) = delete;
  GLUTContextScopeGuard& operator=(const GLUTContextScopeGuard&) = delete;
  GLUTContextScopeGuard(GLUTContextScopeGuard&&) = delete;
  GLUTContextScopeGuard& operator=(GLUTContextScopeGuard&&) = delete;

 private:
  // Identifier returned by glutCreateWindow().
  int glut_window_ = 0;

  // Performs the process-wide GLUT setup. The argument storage is static because it is
  // handed to glutInit() by pointer.
  static void init() {
    static char proc_name[] = "";
    static std::array<char*, 2> glut_argv = {proc_name, nullptr};
    static int glut_argc = 1;
    glutInit(&glut_argc, glut_argv.data());
    glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE | GLUT_DEPTH);
    glutInitWindowSize(512, 512);
  }
};
class EGLContextScopeGuard {
public:
EGLContextScopeGuard() {
// 1. Initialize EGL
PFNEGLQUERYDEVICESEXTPROC eglQueryDevicesEXT =
(PFNEGLQUERYDEVICESEXTPROC)eglGetProcAddress("eglQueryDevicesEXT");
eglQueryDevicesEXT(egl_devices_.max_size(), egl_devices_.data(), &num_devices_);
INFO("Detected " << num_devices_ << " devices");
PFNEGLGETPLATFORMDISPLAYEXTPROC eglGetPlatformDisplayEXT =
(PFNEGLGETPLATFORMDISPLAYEXTPROC)eglGetProcAddress("eglGetPlatformDisplayEXT");
egl_display_ = eglGetPlatformDisplayEXT(EGL_PLATFORM_DEVICE_EXT, egl_devices_.at(0), 0);
REQUIRE(eglInitialize(egl_display_, &major_, &minor_));
// 2. Select an appropriate configuration
REQUIRE(eglChooseConfig(egl_display_, kConfigAttribs, &egl_config_, 1, &num_configs_));
// 3. Create a surface
egl_surface_ = eglCreatePbufferSurface(egl_display_, egl_config_, kPbufferAttribs);
// 4. Bind the API
REQUIRE(eglBindAPI(EGL_OPENGL_API));
// 5. Create a context and make it current
egl_context_ = eglCreateContext(egl_display_, egl_config_, EGL_NO_CONTEXT, NULL);
REQUIRE(eglMakeCurrent(egl_display_, egl_surface_, egl_surface_, egl_context_));
}
~EGLContextScopeGuard() {
// 6. Terminate EGL when finished
eglTerminate(egl_display_);
}
EGLContextScopeGuard(const EGLContextScopeGuard&) = delete;
EGLContextScopeGuard& operator=(const EGLContextScopeGuard&) = delete;
EGLContextScopeGuard(EGLContextScopeGuard&&) = delete;
EGLContextScopeGuard& operator=(EGLContextScopeGuard&&) = delete;
private:
// clang-format off
static constexpr EGLint kConfigAttribs[] = {
EGL_SURFACE_TYPE,
EGL_PBUFFER_BIT,
EGL_BLUE_SIZE, 8,
EGL_GREEN_SIZE, 8,
EGL_RED_SIZE, 8,
EGL_DEPTH_SIZE, 8,
EGL_RENDERABLE_TYPE,
EGL_OPENGL_BIT,
EGL_NONE
};
// clang-format on
static constexpr int kPbufferWidth = 9;
static constexpr int kPbufferHeight = 9;
static constexpr EGLint kPbufferAttribs[] = {
EGL_WIDTH, kPbufferWidth, EGL_HEIGHT, kPbufferHeight, EGL_NONE,
};
std::array<EGLDeviceEXT, 8> egl_devices_;
EGLint num_devices_;
EGLDisplay egl_display_;
EGLint major_, minor_;
EGLint num_configs_;
EGLConfig egl_config_;
EGLSurface egl_surface_;
EGLContext egl_context_;
};
/**
 * Scope guard that creates the GL context backend selected by the GL_CONTEXT_TYPE
 * environment variable: GLUT when the variable is unset, empty or "GLUT", EGL when it is
 * "EGL". Any other value fails the test. The guard is neither copyable nor movable.
 */
class GLContextScopeGuard {
 public:
  using GLUTContextScopeGuardPtr = std::unique_ptr<GLUTContextScopeGuard>;
  using EGLContextScopeGuardPtr = std::unique_ptr<EGLContextScopeGuard>;
  using GLContextScopeGuardVariant =
      std::variant<GLUTContextScopeGuardPtr, EGLContextScopeGuardPtr>;

  static constexpr char kEnvarName[] = "GL_CONTEXT_TYPE";

  GLContextScopeGuard() {
    // An unset variable is treated like an empty one.
    const char* const raw_value = std::getenv(kEnvarName);
    std::string val_str;
    if (raw_value != nullptr) {
      val_str = raw_value;
    }

    const bool use_glut = val_str.empty() || val_str == "GLUT";
    if (use_glut) {
      gl_context_ = std::make_unique<GLUTContextScopeGuard>();
      return;
    }
    if (val_str == "EGL") {
      gl_context_ = std::make_unique<EGLContextScopeGuard>();
      return;
    }

    // Unknown backend name: report the accepted values and fail.
    INFO("Unsupported " << kEnvarName << " value '" << val_str << "'");
    INFO("Supported values are ['GLUT', 'EGL']");
    REQUIRE(false);
  }

  GLContextScopeGuard(const GLContextScopeGuard&) = delete;
  GLContextScopeGuard& operator=(const GLContextScopeGuard&) = delete;
  GLContextScopeGuard(GLContextScopeGuard&&) = delete;
  GLContextScopeGuard& operator=(GLContextScopeGuard&&) = delete;

 private:
  GLContextScopeGuardVariant gl_context_;
};
+90
파일 보기
@@ -0,0 +1,90 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip_test_common.hh>
#include <hip/hip_runtime_api.h>
#include <hip/hip_gl_interop.h>
#include "gl_interop_common.hh"
namespace {
// The hipGLDeviceList values the positive test iterates over.
constexpr std::array<hipGLDeviceList, 3> kDeviceLists{
hipGLDeviceListAll, hipGLDeviceListCurrentFrame, hipGLDeviceListNextFrame};
} // anonymous namespace
// For every entry of kDeviceLists: queries the devices associated with the current GL
// context and expects exactly one device to be reported, with device id 0.
TEST_CASE("Unit_hipGLGetDevices_Positive_Basic") {
GLContextScopeGuard gl_context;
const auto device_list = GENERATE(from_range(begin(kDeviceLists), end(kDeviceLists)));
const int device_count = HipTest::getDeviceCount();
unsigned int gl_device_count = 0;
// Pre-filled with -1 so that entries not written by the call can be recognized.
std::vector<int> gl_devices(device_count, -1);
HIP_CHECK(hipGLGetDevices(&gl_device_count, gl_devices.data(), device_count, device_list));
REQUIRE(gl_device_count == 1);
REQUIRE(gl_devices.at(0) == 0);
}
// Checks that hipGLGetDevices succeeds when one of its optional outputs is omitted or the
// requested device count is zero.
TEST_CASE("Unit_hipGLGetDevices_Positive_Parameters") {
GLContextScopeGuard gl_context;
const int device_count = HipTest::getDeviceCount();
unsigned int gl_device_count = 0;
// Pre-filled with -1 so that entries not written by the call can be recognized.
std::vector<int> gl_devices(device_count, -1);
// Without a count output the device array is still expected to be filled.
SECTION("pHipDeviceCount == nullptr") {
HIP_CHECK(hipGLGetDevices(nullptr, gl_devices.data(), device_count, hipGLDeviceListAll));
REQUIRE(gl_devices.at(0) == 0);
}
// Without a device array the count is still expected to be reported.
SECTION("pHipDevices == nullptr") {
HIP_CHECK(hipGLGetDevices(&gl_device_count, nullptr, device_count, hipGLDeviceListAll));
REQUIRE(gl_device_count == 1);
}
// With a capacity of zero the count is expected but the array must stay untouched.
SECTION("hipDeviceCount == 0") {
HIP_CHECK(hipGLGetDevices(&gl_device_count, gl_devices.data(), 0, hipGLDeviceListAll));
REQUIRE(gl_device_count == 1);
REQUIRE(gl_devices.at(0) == -1);
}
}
// Checks that an out-of-range device list value is rejected with hipErrorInvalidValue and
// that neither output is modified.
TEST_CASE("Unit_hipGLGetDevices_Negative_Parameters") {
GLContextScopeGuard gl_context;
const int device_count = HipTest::getDeviceCount();
unsigned int gl_device_count = 0;
// Pre-filled with -1 so that entries not written by the call can be recognized.
std::vector<int> gl_devices(device_count, -1);
SECTION("invalid deviceList") {
HIP_CHECK_ERROR(hipGLGetDevices(&gl_device_count, gl_devices.data(), device_count,
static_cast<hipGLDeviceList>(-1)),
hipErrorInvalidValue);
REQUIRE(gl_device_count == 0);
REQUIRE(gl_devices.at(0) == -1);
}
}
+98
파일 보기
@@ -0,0 +1,98 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip_test_common.hh>
#include <hip/hip_runtime_api.h>
#include <hip/hip_gl_interop.h>
#include "gl_interop_common.hh"
namespace {
// Register flags the positive buffer-registration test iterates over.
constexpr std::array<unsigned int, 3> kFlags{hipGraphicsRegisterFlagsNone,
hipGraphicsRegisterFlagsReadOnly,
hipGraphicsRegisterFlagsWriteDiscard};
} // anonymous namespace
// For every entry of kFlags: registers a GL buffer object with HIP and unregisters it
// again, expecting both calls to succeed.
TEST_CASE("Unit_hipGraphicsGLRegisterBuffer_Positive_Basic") {
GLContextScopeGuard gl_context;
const auto flags = GENERATE(from_range(begin(kFlags), end(kFlags)));
GLBufferObject vbo;
hipGraphicsResource* vbo_resource;
HIP_CHECK(hipGraphicsGLRegisterBuffer(&vbo_resource, vbo, flags));
HIP_CHECK(hipGraphicsUnregisterResource(vbo_resource));
}
// Registers the same GL buffer object twice and expects both registrations, and both
// subsequent unregistrations, to succeed.
TEST_CASE("Unit_hipGraphicsGLRegisterBuffer_Positive_Register_Twice") {
GLContextScopeGuard gl_context;
GLBufferObject vbo;
hipGraphicsResource *vbo_resource_1, *vbo_resource_2;
HIP_CHECK(hipGraphicsGLRegisterBuffer(&vbo_resource_1, vbo, hipGraphicsRegisterFlagsNone));
HIP_CHECK(hipGraphicsGLRegisterBuffer(&vbo_resource_2, vbo, hipGraphicsRegisterFlagsNone));
HIP_CHECK(hipGraphicsUnregisterResource(vbo_resource_1));
HIP_CHECK(hipGraphicsUnregisterResource(vbo_resource_2));
}
// Checks that hipGraphicsGLRegisterBuffer rejects invalid arguments; every section
// expects hipErrorInvalidValue.
TEST_CASE("Unit_hipGraphicsGLRegisterBuffer_Negative_Parameters") {
GLContextScopeGuard gl_context;
GLBufferObject vbo;
hipGraphicsResource* vbo_resource;
SECTION("resource == nullptr") {
HIP_CHECK_ERROR(hipGraphicsGLRegisterBuffer(nullptr, vbo, hipGraphicsRegisterFlagsNone),
hipErrorInvalidValue);
}
// GLuint{} is the value 0, used here as a buffer name that was never generated.
SECTION("invalid buffer") {
HIP_CHECK_ERROR(
hipGraphicsGLRegisterBuffer(&vbo_resource, GLuint{}, hipGraphicsRegisterFlagsNone),
hipErrorInvalidValue);
}
SECTION("invalid flags") {
HIP_CHECK_ERROR(
hipGraphicsGLRegisterBuffer(&vbo_resource, vbo, std::numeric_limits<unsigned int>::max()),
hipErrorInvalidValue);
}
// The surface and texture flags below are expected to be rejected for buffer objects.
SECTION("flags == hipGraphicsRegisterFlagsSurfaceLoadStore") {
HIP_CHECK_ERROR(
hipGraphicsGLRegisterBuffer(&vbo_resource, vbo, hipGraphicsRegisterFlagsSurfaceLoadStore),
hipErrorInvalidValue);
}
SECTION("flags == hipGraphicsRegisterFlagsTextureGather") {
HIP_CHECK_ERROR(
hipGraphicsGLRegisterBuffer(&vbo_resource, vbo, hipGraphicsRegisterFlagsTextureGather),
hipErrorInvalidValue);
}
}
+102
파일 보기
@@ -0,0 +1,102 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip_test_common.hh>
#include <hip/hip_runtime_api.h>
#include <hip/hip_gl_interop.h>
#include "gl_interop_common.hh"
namespace {
// Register flags the positive image-registration test iterates over.
constexpr std::array<unsigned int, 5> kFlags{
hipGraphicsRegisterFlagsNone, hipGraphicsRegisterFlagsReadOnly,
hipGraphicsRegisterFlagsWriteDiscard, hipGraphicsRegisterFlagsSurfaceLoadStore,
hipGraphicsRegisterFlagsTextureGather};
} // anonymous namespace
// For every entry of kFlags: registers a GL 2D texture with HIP and unregisters it again,
// expecting both calls to succeed.
TEST_CASE("Unit_hipGraphicsGLRegisterImage_Positive_Basic") {
GLContextScopeGuard gl_context;
const auto flags = GENERATE(from_range(begin(kFlags), end(kFlags)));
GLImageObject tex;
hipGraphicsResource* tex_resource;
HIP_CHECK(hipGraphicsGLRegisterImage(&tex_resource, tex, GL_TEXTURE_2D, flags));
HIP_CHECK(hipGraphicsUnregisterResource(tex_resource));
}
// Registers the same GL texture twice and expects both registrations, and both
// subsequent unregistrations, to succeed.
TEST_CASE("Unit_hipGraphicsGLRegisterImage_Positive_Register_Twice") {
GLContextScopeGuard gl_context;
GLImageObject tex;
hipGraphicsResource *tex_resource_1, *tex_resource_2;
HIP_CHECK(hipGraphicsGLRegisterImage(&tex_resource_1, tex, GL_TEXTURE_2D,
hipGraphicsRegisterFlagsNone));
HIP_CHECK(hipGraphicsGLRegisterImage(&tex_resource_2, tex, GL_TEXTURE_2D,
hipGraphicsRegisterFlagsNone));
HIP_CHECK(hipGraphicsUnregisterResource(tex_resource_1));
HIP_CHECK(hipGraphicsUnregisterResource(tex_resource_2));
}
// Checks that hipGraphicsGLRegisterImage rejects invalid arguments; every section expects
// hipErrorInvalidValue.
TEST_CASE("Unit_hipGraphicsGLRegisterImage_Negative_Parameters") {
GLContextScopeGuard gl_context;
GLImageObject tex;
hipGraphicsResource* tex_resource;
SECTION("resource == nullptr") {
HIP_CHECK_ERROR(
hipGraphicsGLRegisterImage(nullptr, tex, GL_TEXTURE_2D, hipGraphicsRegisterFlagsNone),
hipErrorInvalidValue);
}
// GLuint{} is the value 0, used here as a texture name that was never generated.
SECTION("invalid image") {
HIP_CHECK_ERROR(hipGraphicsGLRegisterImage(&tex_resource, GLuint{}, GL_TEXTURE_2D,
hipGraphicsRegisterFlagsNone),
hipErrorInvalidValue);
}
SECTION("invalid target") {
HIP_CHECK_ERROR(
hipGraphicsGLRegisterImage(&tex_resource, tex, GL_BUFFER, hipGraphicsRegisterFlagsNone),
hipErrorInvalidValue);
}
// `tex` is created as a GL_TEXTURE_2D object, so GL_RENDERBUFFER does not describe it.
SECTION("target does not match the object") {
HIP_CHECK_ERROR(hipGraphicsGLRegisterImage(&tex_resource, tex, GL_RENDERBUFFER,
hipGraphicsRegisterFlagsNone),
hipErrorInvalidValue);
}
SECTION("invalid flags") {
HIP_CHECK_ERROR(hipGraphicsGLRegisterImage(&tex_resource, tex, GL_TEXTURE_2D,
std::numeric_limits<unsigned int>::max()),
hipErrorInvalidValue);
}
}
+93
파일 보기
@@ -0,0 +1,93 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip_test_common.hh>
#include <hip/hip_runtime_api.h>
#include <hip/hip_gl_interop.h>
#include "gl_interop_common.hh"
// Registers one GL buffer and one GL texture, maps and unmaps both in a single call each
// on a newly created stream, and expects every call to succeed.
TEST_CASE("Unit_hipGraphicsMapResources_Positive_Basic") {
GLContextScopeGuard gl_context;
GLBufferObject vbo;
GLImageObject tex;
// resources[0] is the buffer, resources[1] the texture.
std::array<hipGraphicsResource_t, 2> resources;
HIP_CHECK(hipGraphicsGLRegisterBuffer(&resources.at(0), vbo, hipGraphicsRegisterFlagsNone));
HIP_CHECK(hipGraphicsGLRegisterImage(&resources.at(1), tex, GL_TEXTURE_2D,
hipGraphicsRegisterFlagsNone));
hipStream_t stream;
HIP_CHECK(hipStreamCreate(&stream));
HIP_CHECK(hipGraphicsMapResources(resources.size(), resources.data(), stream));
HIP_CHECK(hipGraphicsUnmapResources(resources.size(), resources.data(), stream));
HIP_CHECK(hipStreamDestroy(stream));
HIP_CHECK(hipGraphicsUnregisterResource(resources.at(0)));
HIP_CHECK(hipGraphicsUnregisterResource(resources.at(1)));
}
// Checks the error codes hipGraphicsMapResources returns for invalid arguments. A buffer
// resource is registered before the sections and unregistered after them.
TEST_CASE("Unit_hipGraphicsMapResources_Negative_Parameters") {
GLContextScopeGuard gl_context;
GLBufferObject vbo;
hipGraphicsResource* vbo_resource;
HIP_CHECK(hipGraphicsGLRegisterBuffer(&vbo_resource, vbo, hipGraphicsRegisterFlagsNone));
SECTION("count == 0") {
HIP_CHECK_ERROR(hipGraphicsMapResources(0, &vbo_resource, 0), hipErrorInvalidValue);
}
SECTION("resources == nullptr") {
HIP_CHECK_ERROR(hipGraphicsMapResources(1, nullptr, 0), hipErrorInvalidValue);
}
// A handle that was registered and then unregistered must be rejected as invalid.
SECTION("unregistered resource") {
hipGraphicsResource* unregistered_resource;
HIP_CHECK(
hipGraphicsGLRegisterBuffer(&unregistered_resource, vbo, hipGraphicsRegisterFlagsNone));
HIP_CHECK(hipGraphicsUnregisterResource(unregistered_resource));
HIP_CHECK_ERROR(hipGraphicsMapResources(1, &unregistered_resource, 0), hipErrorInvalidHandle);
}
// Mapping twice without unmapping in between must fail; the resource is unmapped
// afterwards so that it can be unregistered below.
SECTION("already mapped resource") {
HIP_CHECK(hipGraphicsMapResources(1, &vbo_resource, 0));
HIP_CHECK_ERROR(hipGraphicsMapResources(1, &vbo_resource, 0), hipErrorAlreadyMapped);
HIP_CHECK(hipGraphicsUnmapResources(1, &vbo_resource, 0));
}
// The stream handle is used after the stream has been destroyed.
SECTION("invalid stream") {
hipStream_t stream;
HIP_CHECK(hipStreamCreate(&stream));
HIP_CHECK(hipStreamDestroy(stream));
HIP_CHECK_ERROR(hipGraphicsMapResources(1, &vbo_resource, stream), hipErrorContextIsDestroyed);
}
HIP_CHECK(hipGraphicsUnregisterResource(vbo_resource));
}
@@ -0,0 +1,151 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip_test_common.hh>
#include <hip/hip_runtime_api.h>
#include <hip/hip_gl_interop.h>
#include "gl_interop_common.hh"
/**
 * Test Description
 * ------------------------
 *  - Registers a GL buffer object with HIP, maps it and queries its mapped pointer.
 *  - Expects a non-null device pointer and a reported size equal to the buffer's kSize.
 */
TEST_CASE("Unit_hipGraphicsResourceGetMappedPointer_Positive_Basic") {
  GLContextScopeGuard gl_context;
  GLBufferObject gl_buffer;

  // Query outputs start from known values so the checks below cannot pass by accident.
  size_t mapped_size = 0;
  float* mapped_ptr = nullptr;
  void** const mapped_ptr_out = reinterpret_cast<void**>(&mapped_ptr);

  hipGraphicsResource* resource;
  HIP_CHECK(hipGraphicsGLRegisterBuffer(&resource, gl_buffer, hipGraphicsRegisterFlagsNone));
  HIP_CHECK(hipGraphicsMapResources(1, &resource, 0));

  HIP_CHECK(hipGraphicsResourceGetMappedPointer(mapped_ptr_out, &mapped_size, resource));
  REQUIRE(mapped_ptr != nullptr);
  REQUIRE(mapped_size == gl_buffer.kSize);

  // Release in reverse order of acquisition: unmap first, then unregister.
  HIP_CHECK(hipGraphicsUnmapResources(1, &resource, 0));
  HIP_CHECK(hipGraphicsUnregisterResource(resource));
}
/**
 * Test Description
 * ------------------------
 *  - Queries the mapped pointer of a mapped GL buffer while passing nullptr for one of the two
 *    output arguments.
 *  - Expects the call to succeed and the remaining output argument to be filled in.
 */
TEST_CASE("Unit_hipGraphicsResourceGetMappedPointer_Positive_Parameters") {
  GLContextScopeGuard gl_context;
  GLBufferObject gl_buffer;

  size_t mapped_size = 0;
  float* mapped_ptr = nullptr;

  hipGraphicsResource* resource;
  HIP_CHECK(hipGraphicsGLRegisterBuffer(&resource, gl_buffer, hipGraphicsRegisterFlagsNone));
  HIP_CHECK(hipGraphicsMapResources(1, &resource, 0));

  SECTION("devPtr == nullptr") {
    // Only the size is requested.
    HIP_CHECK(hipGraphicsResourceGetMappedPointer(nullptr, &mapped_size, resource));
    REQUIRE(mapped_size == gl_buffer.kSize);
  }

  SECTION("size == nullptr") {
    // Only the device pointer is requested.
    void** const mapped_ptr_out = reinterpret_cast<void**>(&mapped_ptr);
    HIP_CHECK(hipGraphicsResourceGetMappedPointer(mapped_ptr_out, nullptr, resource));
    REQUIRE(mapped_ptr != nullptr);
  }

  HIP_CHECK(hipGraphicsUnmapResources(1, &resource, 0));
  HIP_CHECK(hipGraphicsUnregisterResource(resource));
}
/**
 * Test Description
 * ------------------------
 *  - Checks the error codes returned by hipGraphicsResourceGetMappedPointer for:
 *    -# a mapped image (texture) resource: hipErrorNotMappedAsPointer
 *    -# a resource that has been unregistered: hipErrorContextIsDestroyed
 *    -# a registered resource that was never mapped: hipErrorNotMapped
 *    -# a resource that was mapped and then unmapped: hipErrorNotMapped
 */
TEST_CASE("Unit_hipGraphicsResourceGetMappedPointer_Negative_Parameters") {
  GLContextScopeGuard gl_context;
  GLBufferObject gl_buffer;

  size_t mapped_size = 0;
  float* mapped_ptr = nullptr;
  void** const mapped_ptr_out = reinterpret_cast<void**>(&mapped_ptr);

  // A valid, mapped buffer resource is kept alive around every section.
  hipGraphicsResource* base_resource;
  HIP_CHECK(hipGraphicsGLRegisterBuffer(&base_resource, gl_buffer, hipGraphicsRegisterFlagsNone));
  HIP_CHECK(hipGraphicsMapResources(1, &base_resource, 0));

  SECTION("non-pointer resource") {
    GLImageObject gl_image;
    hipGraphicsResource* image_resource;
    HIP_CHECK(hipGraphicsGLRegisterImage(&image_resource, gl_image, GL_TEXTURE_2D,
                                         hipGraphicsRegisterFlagsNone));
    HIP_CHECK(hipGraphicsMapResources(1, &image_resource, 0));

    HIP_CHECK_ERROR(
        hipGraphicsResourceGetMappedPointer(mapped_ptr_out, &mapped_size, image_resource),
        hipErrorNotMappedAsPointer);

    HIP_CHECK(hipGraphicsUnmapResources(1, &image_resource, 0));
    HIP_CHECK(hipGraphicsUnregisterResource(image_resource));
  }

  SECTION("unregistered resource") {
    // Register and immediately unregister so the handle is stale when it is queried.
    hipGraphicsResource* stale_resource;
    HIP_CHECK(
        hipGraphicsGLRegisterBuffer(&stale_resource, gl_buffer, hipGraphicsRegisterFlagsNone));
    HIP_CHECK(hipGraphicsUnregisterResource(stale_resource));

    HIP_CHECK_ERROR(
        hipGraphicsResourceGetMappedPointer(mapped_ptr_out, &mapped_size, stale_resource),
        hipErrorContextIsDestroyed);
  }

  SECTION("not mapped resource") {
    hipGraphicsResource* idle_resource;
    HIP_CHECK(
        hipGraphicsGLRegisterBuffer(&idle_resource, gl_buffer, hipGraphicsRegisterFlagsNone));

    HIP_CHECK_ERROR(
        hipGraphicsResourceGetMappedPointer(mapped_ptr_out, &mapped_size, idle_resource),
        hipErrorNotMapped);

    HIP_CHECK(hipGraphicsUnregisterResource(idle_resource));
  }

  SECTION("unmapped resource") {
    hipGraphicsResource* released_resource;
    HIP_CHECK(
        hipGraphicsGLRegisterBuffer(&released_resource, gl_buffer, hipGraphicsRegisterFlagsNone));
    HIP_CHECK(hipGraphicsMapResources(1, &released_resource, 0));
    HIP_CHECK(hipGraphicsUnmapResources(1, &released_resource, 0));

    HIP_CHECK_ERROR(
        hipGraphicsResourceGetMappedPointer(mapped_ptr_out, &mapped_size, released_resource),
        hipErrorNotMapped);

    HIP_CHECK(hipGraphicsUnregisterResource(released_resource));
  }

  HIP_CHECK(hipGraphicsUnmapResources(1, &base_resource, 0));
  HIP_CHECK(hipGraphicsUnregisterResource(base_resource));
}
@@ -0,0 +1,132 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <limits>
#include <hip_test_common.hh>
#include <hip/hip_runtime_api.h>
#include <hip/hip_gl_interop.h>
#include "gl_interop_common.hh"
/**
 * Test Description
 * ------------------------
 *  - Registers a GL_TEXTURE_2D image with HIP, maps it and queries the mapped array for
 *    arrayIndex 0 and mipLevel 0.
 *  - Expects a non-null array handle.
 */
TEST_CASE("Unit_hipGraphicsSubResourceGetMappedArray_Positive_Basic") {
  GLContextScopeGuard gl_context;
  GLImageObject gl_image;

  hipArray* mapped_array = nullptr;

  hipGraphicsResource* resource;
  HIP_CHECK(
      hipGraphicsGLRegisterImage(&resource, gl_image, GL_TEXTURE_2D, hipGraphicsRegisterFlagsNone));
  HIP_CHECK(hipGraphicsMapResources(1, &resource, 0));

  HIP_CHECK(hipGraphicsSubResourceGetMappedArray(&mapped_array, resource, 0, 0));
  REQUIRE(mapped_array != nullptr);

  // Release in reverse order of acquisition: unmap first, then unregister.
  HIP_CHECK(hipGraphicsUnmapResources(1, &resource, 0));
  HIP_CHECK(hipGraphicsUnregisterResource(resource));
}
/**
 * Test Description
 * ------------------------
 *  - Exercises hipGraphicsSubResourceGetMappedArray with unusual arguments:
 *    -# array output is nullptr (success is expected here)
 *    -# a mapped buffer resource: hipErrorNotMappedAsArray
 *    -# a resource that has been unregistered: hipErrorContextIsDestroyed
 *    -# a registered resource that was never mapped: hipErrorNotMapped
 *    -# a resource that was mapped and then unmapped: hipErrorNotMapped
 *    -# arrayIndex set to the maximum int value: hipErrorInvalidValue
 *    -# mipLevel set to the maximum int value: hipErrorInvalidValue
 */
TEST_CASE("Unit_hipGraphicsSubResourceGetMappedArray_Negative_Parameters") {
  GLContextScopeGuard gl_context;
  GLImageObject gl_image;

  hipArray* mapped_array = nullptr;

  // A valid, mapped image resource is kept alive around every section.
  hipGraphicsResource* base_resource;
  HIP_CHECK(hipGraphicsGLRegisterImage(&base_resource, gl_image, GL_TEXTURE_2D,
                                       hipGraphicsRegisterFlagsNone));
  HIP_CHECK(hipGraphicsMapResources(1, &base_resource, 0));

  SECTION("array == nullptr") {
    // NOTE(review): a nullptr output is expected to succeed even though this is a negative
    // test - confirm that this is the intended contract.
    HIP_CHECK(hipGraphicsSubResourceGetMappedArray(nullptr, base_resource, 0, 0));
  }

  SECTION("non-texture resource") {
    GLBufferObject gl_buffer;
    hipGraphicsResource* buffer_resource;
    HIP_CHECK(
        hipGraphicsGLRegisterBuffer(&buffer_resource, gl_buffer, hipGraphicsRegisterFlagsNone));
    HIP_CHECK(hipGraphicsMapResources(1, &buffer_resource, 0));

    HIP_CHECK_ERROR(hipGraphicsSubResourceGetMappedArray(&mapped_array, buffer_resource, 0, 0),
                    hipErrorNotMappedAsArray);

    HIP_CHECK(hipGraphicsUnmapResources(1, &buffer_resource, 0));
    HIP_CHECK(hipGraphicsUnregisterResource(buffer_resource));
  }

  SECTION("unregistered resource") {
    // Register and immediately unregister so the handle is stale when it is queried.
    hipGraphicsResource* stale_resource;
    HIP_CHECK(hipGraphicsGLRegisterImage(&stale_resource, gl_image, GL_TEXTURE_2D,
                                         hipGraphicsRegisterFlagsNone));
    HIP_CHECK(hipGraphicsUnregisterResource(stale_resource));

    HIP_CHECK_ERROR(hipGraphicsSubResourceGetMappedArray(&mapped_array, stale_resource, 0, 0),
                    hipErrorContextIsDestroyed);
  }

  SECTION("not mapped resource") {
    hipGraphicsResource* idle_resource;
    HIP_CHECK(hipGraphicsGLRegisterImage(&idle_resource, gl_image, GL_TEXTURE_2D,
                                         hipGraphicsRegisterFlagsNone));

    HIP_CHECK_ERROR(hipGraphicsSubResourceGetMappedArray(&mapped_array, idle_resource, 0, 0),
                    hipErrorNotMapped);

    HIP_CHECK(hipGraphicsUnregisterResource(idle_resource));
  }

  SECTION("unmapped resource") {
    hipGraphicsResource* released_resource;
    HIP_CHECK(hipGraphicsGLRegisterImage(&released_resource, gl_image, GL_TEXTURE_2D,
                                         hipGraphicsRegisterFlagsNone));
    HIP_CHECK(hipGraphicsMapResources(1, &released_resource, 0));
    HIP_CHECK(hipGraphicsUnmapResources(1, &released_resource, 0));

    HIP_CHECK_ERROR(hipGraphicsSubResourceGetMappedArray(&mapped_array, released_resource, 0, 0),
                    hipErrorNotMapped);

    HIP_CHECK(hipGraphicsUnregisterResource(released_resource));
  }

  SECTION("invalid arrayIndex") {
    constexpr int kOutOfRangeIndex = std::numeric_limits<int>::max();
    HIP_CHECK_ERROR(
        hipGraphicsSubResourceGetMappedArray(&mapped_array, base_resource, kOutOfRangeIndex, 0),
        hipErrorInvalidValue);
  }

  SECTION("invalid mipLevel") {
    constexpr int kOutOfRangeLevel = std::numeric_limits<int>::max();
    HIP_CHECK_ERROR(
        hipGraphicsSubResourceGetMappedArray(&mapped_array, base_resource, 0, kOutOfRangeLevel),
        hipErrorInvalidValue);
  }

  HIP_CHECK(hipGraphicsUnmapResources(1, &base_resource, 0));
  HIP_CHECK(hipGraphicsUnregisterResource(base_resource));
}
+66
파일 보기
@@ -0,0 +1,66 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip_test_common.hh>
#include <hip/hip_runtime_api.h>
#include <hip/hip_gl_interop.h>
#include "gl_interop_common.hh"
/**
 * Test Description
 * ------------------------
 *  - Checks the error codes returned by hipGraphicsUnmapResources for:
 *    -# a resource count of zero: hipErrorInvalidValue
 *    -# a nullptr resource list: hipErrorInvalidValue
 *    -# a registered resource that was never mapped: hipErrorNotMapped
 *    -# a stream that has already been destroyed: hipErrorContextIsDestroyed
 */
TEST_CASE("Unit_hipGraphicsUnmapResources_Negative_Parameters") {
  GLContextScopeGuard gl_context;
  GLBufferObject gl_buffer;

  // A valid, mapped buffer resource is kept alive around every section.
  hipGraphicsResource* base_resource;
  HIP_CHECK(hipGraphicsGLRegisterBuffer(&base_resource, gl_buffer, hipGraphicsRegisterFlagsNone));
  HIP_CHECK(hipGraphicsMapResources(1, &base_resource, 0));

  SECTION("count == 0") {
    HIP_CHECK_ERROR(hipGraphicsUnmapResources(0, &base_resource, 0), hipErrorInvalidValue);
  }

  SECTION("resources == nullptr") {
    HIP_CHECK_ERROR(hipGraphicsUnmapResources(1, nullptr, 0), hipErrorInvalidValue);
  }

  SECTION("not mapped resource") {
    hipGraphicsResource* idle_resource;
    HIP_CHECK(
        hipGraphicsGLRegisterBuffer(&idle_resource, gl_buffer, hipGraphicsRegisterFlagsNone));

    HIP_CHECK_ERROR(hipGraphicsUnmapResources(1, &idle_resource, 0), hipErrorNotMapped);

    HIP_CHECK(hipGraphicsUnregisterResource(idle_resource));
  }

  SECTION("invalid stream") {
    // Create and immediately destroy a stream so the handle is stale when it is used.
    hipStream_t destroyed_stream;
    HIP_CHECK(hipStreamCreate(&destroyed_stream));
    HIP_CHECK(hipStreamDestroy(destroyed_stream));

    HIP_CHECK_ERROR(hipGraphicsUnmapResources(1, &base_resource, destroyed_stream),
                    hipErrorContextIsDestroyed);
  }

  HIP_CHECK(hipGraphicsUnmapResources(1, &base_resource, 0));
  HIP_CHECK(hipGraphicsUnregisterResource(base_resource));
}
+48
파일 보기
@@ -0,0 +1,48 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip_test_common.hh>
#include <hip/hip_runtime_api.h>
#include <hip/hip_gl_interop.h>
#include "gl_interop_common.hh"
/**
 * Test Description
 * ------------------------
 *  - Checks the error codes returned by hipGraphicsUnregisterResource for:
 *    -# a resource that has already been unregistered: hipErrorInvalidContext
 *    -# a resource that is currently mapped: hipErrorAlreadyMapped
 */
TEST_CASE("Unit_hipGraphicsUnregisterResource_Negative_Parameters") {
  GLContextScopeGuard gl_context;
  GLBufferObject vbo;

  SECTION("already unregistered resource") {
    hipGraphicsResource* unregistered_resource;
    HIP_CHECK(
        hipGraphicsGLRegisterBuffer(&unregistered_resource, vbo, hipGraphicsRegisterFlagsNone));
    HIP_CHECK(hipGraphicsUnregisterResource(unregistered_resource));
    // Second unregister of the same handle must be rejected.
    HIP_CHECK_ERROR(hipGraphicsUnregisterResource(unregistered_resource), hipErrorInvalidContext);
  }

  SECTION("mapped resource") {
    hipGraphicsResource* mapped_resource;
    HIP_CHECK(hipGraphicsGLRegisterBuffer(&mapped_resource, vbo, hipGraphicsRegisterFlagsNone));
    HIP_CHECK(hipGraphicsMapResources(1, &mapped_resource, 0));
    HIP_CHECK_ERROR(hipGraphicsUnregisterResource(mapped_resource), hipErrorAlreadyMapped);

    // The rejected unregister is expected to leave the resource mapped and registered, so release
    // it explicitly instead of leaking it until the GL context is torn down.
    HIP_CHECK(hipGraphicsUnmapResources(1, &mapped_resource, 0));
    HIP_CHECK(hipGraphicsUnregisterResource(mapped_resource));
  }
}
+1
파일 보기
@@ -103,6 +103,7 @@ set(TEST_SRC
hipGraphKernelNodeSetParams.cc
hipGraphExecKernelNodeSetParams.cc
hipGraphLaunch.cc
hipGraphLaunch_old.cc
hipGraphMemcpyNodeSetParams1D.cc
hipGraphExecMemcpyNodeSetParamsToSymbol_old.cc
hipGraphExecMemcpyNodeSetParamsToSymbol.cc
+91 -66
파일 보기
@@ -40,19 +40,26 @@ end. Instantiate and Launch the Graph. Wait for the event to complete.
Verify that hipEventElapsedTime() returns error.
6) Validate scenario 2 by running the graph multiple times in a loop
(100 times) after instantiation.
7) Negative Scenarios
7) Validate that no error is reported when numDeps <= dependencies length
8) Negative Scenarios
- Output node is a nullptr.
- Input graph is a nullptr.
- Input dependencies is a nullptr.
- Node in dependency is from different graph
- Invalid numNodes
- Duplicate node in dependencies
- Input event is a nullptr.
- Input graph is uninitialized.
- Input event is uninitialized.
*/
#include <functional>
#include <hip_test_common.hh>
#include <hip_test_checkers.hh>
#include <hip_test_common.hh>
#include <hip_test_kernels.hh>
#include "graph_tests_common.hh"
/**
* Scenario 1: Create a simple graph with just one event record
* node and instantiate and launch the graph.
@@ -66,8 +73,7 @@ TEST_CASE("Unit_hipGraphAddEventRecordNode_Functional_Simple") {
hipEvent_t event;
HIP_CHECK(hipEventCreate(&event));
hipGraphNode_t eventrec;
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0,
event));
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event));
// Instantiate and launch the graph
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
@@ -82,8 +88,8 @@ TEST_CASE("Unit_hipGraphAddEventRecordNode_Functional_Simple") {
/**
* Local test function
*/
static void validateAddEventRecordNode(bool measureTime, bool withFlags,
int nstep, unsigned flag = 0) {
static void validateAddEventRecordNode(bool measureTime, bool withFlags, int nstep,
unsigned flag = 0) {
constexpr size_t N = 1024;
constexpr size_t Nbytes = N * sizeof(int);
constexpr auto blocksPerCU = 6; // to hide latency
@@ -111,8 +117,7 @@ static void validateAddEventRecordNode(bool measureTime, bool withFlags,
memsetParams.elementSize = sizeof(char);
memsetParams.width = Nbytes;
memsetParams.height = 1;
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0,
&memsetParams));
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, &memsetParams));
memset(&memsetParams, 0, sizeof(memsetParams));
memsetParams.dst = reinterpret_cast<void*>(B_d);
memsetParams.value = 0;
@@ -120,38 +125,34 @@ static void validateAddEventRecordNode(bool measureTime, bool withFlags,
memsetParams.elementSize = sizeof(char);
memsetParams.width = Nbytes;
memsetParams.height = 1;
HIP_CHECK(hipGraphAddMemsetNode(&memset_B, graph, nullptr, 0,
&memsetParams));
HIP_CHECK(hipGraphAddMemsetNode(&memset_B, graph, nullptr, 0, &memsetParams));
void* kernelArgs1[] = {&C_d, &memsetVal, reinterpret_cast<void *>(&NElem)};
kernelNodeParams.func =
reinterpret_cast<void *>(HipTest::memsetReverse<int>);
void* kernelArgs1[] = {&C_d, &memsetVal, reinterpret_cast<void*>(&NElem)};
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::memsetReverse<int>);
kernelNodeParams.gridDim = dim3(blocks);
kernelNodeParams.blockDim = dim3(threadsPerBlock);
kernelNodeParams.sharedMemBytes = 0;
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs1);
kernelNodeParams.extra = nullptr;
HIP_CHECK(hipGraphAddKernelNode(&memsetKer_C, graph, nullptr, 0,
&kernelNodeParams));
HIP_CHECK(hipGraphAddKernelNode(&memsetKer_C, graph, nullptr, 0, &kernelNodeParams));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D_A, graph, nullptr, 0, A_d,
A_h, Nbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D_A, graph, nullptr, 0, A_d, A_h, Nbytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D_B, graph, nullptr, 0, B_d,
B_h, Nbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D_B, graph, nullptr, 0, B_d, B_h, Nbytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_C, graph, nullptr, 0, C_h,
C_d, Nbytes, hipMemcpyDeviceToHost));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_C, graph, nullptr, 0, C_h, C_d, Nbytes,
hipMemcpyDeviceToHost));
void* kernelArgs2[] = {&A_d, &B_d, &C_d, reinterpret_cast<void *>(&NElem)};
kernelNodeParams.func = reinterpret_cast<void *>(HipTest::vectorADD<int>);
void* kernelArgs2[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
kernelNodeParams.gridDim = dim3(blocks);
kernelNodeParams.blockDim = dim3(threadsPerBlock);
kernelNodeParams.sharedMemBytes = 0;
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs2);
kernelNodeParams.extra = nullptr;
HIP_CHECK(hipGraphAddKernelNode(&ker_vecAdd, graph, nullptr, 0,
&kernelNodeParams));
HIP_CHECK(hipGraphAddKernelNode(&ker_vecAdd, graph, nullptr, 0, &kernelNodeParams));
hipEvent_t eventstart, eventend;
if (withFlags) {
HIP_CHECK(hipEventCreateWithFlags(&eventstart, flag));
@@ -161,10 +162,8 @@ static void validateAddEventRecordNode(bool measureTime, bool withFlags,
HIP_CHECK(hipEventCreate(&eventend));
}
hipGraphNode_t event_start, event_final;
HIP_CHECK(hipGraphAddEventRecordNode(&event_start, graph, nullptr, 0,
eventstart));
HIP_CHECK(hipGraphAddEventRecordNode(&event_final, graph, nullptr, 0,
eventend));
HIP_CHECK(hipGraphAddEventRecordNode(&event_start, graph, nullptr, 0, eventstart));
HIP_CHECK(hipGraphAddEventRecordNode(&event_final, graph, nullptr, 0, eventend));
// Create dependencies
HIP_CHECK(hipGraphAddDependencies(graph, &event_start, &memset_A, 1));
HIP_CHECK(hipGraphAddDependencies(graph, &event_start, &memset_B, 1));
@@ -260,7 +259,7 @@ TEST_CASE("Unit_hipGraphAddEventRecordNode_Functional_TimingDisabled") {
HIP_CHECK(hipEventCreateWithFlags(&event_start, hipEventDisableTiming));
HIP_CHECK(hipEventCreateWithFlags(&event_end, hipEventDisableTiming));
// memset node
char *A_d;
char* A_d;
hipGraphNode_t memset_A;
hipMemsetParams memsetParams{};
HIP_CHECK(hipMalloc(&A_d, Nbytes));
@@ -271,14 +270,11 @@ TEST_CASE("Unit_hipGraphAddEventRecordNode_Functional_TimingDisabled") {
memsetParams.elementSize = sizeof(char);
memsetParams.width = Nbytes;
memsetParams.height = 1;
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0,
&memsetParams));
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, &memsetParams));
hipGraphNode_t event_node_start, event_node_end;
HIP_CHECK(hipGraphAddEventRecordNode(&event_node_start, graph, nullptr, 0,
event_start));
HIP_CHECK(hipGraphAddEventRecordNode(&event_node_end, graph, nullptr, 0,
event_end));
HIP_CHECK(hipGraphAddEventRecordNode(&event_node_start, graph, nullptr, 0, event_start));
HIP_CHECK(hipGraphAddEventRecordNode(&event_node_end, graph, nullptr, 0, event_end));
// Add dependencies between nodes
HIP_CHECK(hipGraphAddDependencies(graph, &event_node_start, &memset_A, 1));
HIP_CHECK(hipGraphAddDependencies(graph, &memset_A, &event_node_end, 1));
@@ -290,7 +286,7 @@ TEST_CASE("Unit_hipGraphAddEventRecordNode_Functional_TimingDisabled") {
// Validate hipEventElapsedTime returns error code because timing is
// disabled for start and end event nodes.
float t;
REQUIRE(hipSuccess != hipEventElapsedTime(&t, event_start, event_end));
HIP_CHECK_ERROR(hipEventElapsedTime(&t, event_start, event_end), hipErrorInvalidHandle);
HIP_CHECK(hipGraphExecDestroy(graphExec));
HIP_CHECK(hipFree(A_d));
@@ -301,44 +297,73 @@ TEST_CASE("Unit_hipGraphAddEventRecordNode_Functional_TimingDisabled") {
}
/**
* Scenario 7: All negative tests
* Scenario 7: Positive parameter tests
*/
TEST_CASE("Unit_hipGraphAddEventRecordNode_Negative") {
TEST_CASE("Unit_hipGraphAddEventRecordNode_Positive_Parameters") {
hipGraph_t graph;
HIP_CHECK(hipGraphCreate(&graph, 0));
hipEvent_t event;
HIP_CHECK(hipEventCreate(&event));
hipGraphNode_t eventwait;
SECTION("pGraphNode = nullptr") {
REQUIRE(hipErrorInvalidValue == hipGraphAddEventRecordNode(nullptr,
graph, nullptr, 0, event));
hipGraphNode_t eventrec;
hipGraphNode_t dep_node = nullptr;
hipGraphNode_t dep_node2 = nullptr;
HIP_CHECK(hipGraphAddEmptyNode(&dep_node, graph, nullptr, 0));
HIP_CHECK(hipGraphAddEmptyNode(&dep_node2, graph, nullptr, 0));
hipGraphNode_t dep_nodes[] = {dep_node, dep_node2};
size_t numDeps = 0;
SECTION("numDependencies is zero, dependencies is not nullptr") {
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, dep_nodes, 0, event));
HIP_CHECK(hipGraphNodeGetDependencies(eventrec, nullptr, &numDeps));
REQUIRE(numDeps == 0);
}
SECTION("graph = nullptr") {
REQUIRE(hipErrorInvalidValue == hipGraphAddEventRecordNode(&eventwait,
nullptr, nullptr, 0, event));
SECTION("numDependencies < dependencies length") {
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, dep_nodes, 1, event));
HIP_CHECK(hipGraphNodeGetDependencies(eventrec, nullptr, &numDeps));
REQUIRE(numDeps == 1);
}
SECTION("pDependencies = nullptr and numDependencies != 0") {
REQUIRE(hipErrorInvalidValue == hipGraphAddEventRecordNode(&eventwait,
graph, nullptr, 1, event));
}
SECTION("event = nullptr") {
REQUIRE(hipErrorInvalidValue == hipGraphAddEventRecordNode(&eventwait,
graph, nullptr, 0, nullptr));
}
SECTION("graph is uninitialized") {
hipGraph_t graph_uninit{};
REQUIRE(hipErrorInvalidValue == hipGraphAddEventRecordNode(&eventwait,
graph_uninit, nullptr, 0, nullptr));
}
SECTION("event is uninitialized") {
hipEvent_t event_uninit{};
REQUIRE(hipErrorInvalidValue == hipGraphAddEventRecordNode(&eventwait,
graph, nullptr, 0, event_uninit));
SECTION("numDependencies == dependencies length") {
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, dep_nodes, 2, event));
HIP_CHECK(hipGraphNodeGetDependencies(eventrec, nullptr, &numDeps));
REQUIRE(numDeps == 2);
}
HIP_CHECK(hipGraphDestroy(graph));
HIP_CHECK(hipEventDestroy(event));
}
/**
* Scenario 8: All negative tests
*/
TEST_CASE("Unit_hipGraphAddEventRecordNode_Negative") {
using namespace std::placeholders;
hipGraph_t graph;
HIP_CHECK(hipGraphCreate(&graph, 0));
hipEvent_t event;
HIP_CHECK(hipEventCreate(&event));
hipGraphNode_t eventrec;
GraphAddNodeCommonNegativeTests(std::bind(hipGraphAddEventRecordNode, _1, _2, _3, _4, event),
graph);
SECTION("event = nullptr") {
HIP_CHECK_ERROR(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, nullptr),
hipErrorInvalidValue);
}
SECTION("graph is uninitialized") {
hipGraph_t graph_uninit{};
HIP_CHECK_ERROR(hipGraphAddEventRecordNode(&eventrec, graph_uninit, nullptr, 0, event),
hipErrorInvalidValue);
}
SECTION("event is uninitialized") {
hipEvent_t event_uninit{};
HIP_CHECK_ERROR(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event_uninit),
hipErrorInvalidValue);
}
HIP_CHECK(hipGraphDestroy(graph));
+99 -81
파일 보기
@@ -32,20 +32,25 @@ both graphs.
(100 times).
4) Execute scenario 2 with stream1 = stream2.
5) Repeat scenario 2 for different event flags.
6) Negative Scenarios
6) Validate that no error is reported when numDeps <= dependencies length
7) Negative Scenarios
- Pass input node parameter as nullptr.
- Pass input graph parameter as nullptr.
- Pass input dependency parameter as nullptr.
- Node in dependency is from different graph
- Invalid numNodes
- Duplicate node in dependencies
- Pass input event parameter as nullptr.
- Pass uninitialized input graph parameter.
- Pass uninitialized input event parameter.
*/
#include <functional>
#include <hip_test_common.hh>
#include <hip_test_checkers.hh>
#include <hip_test_common.hh>
#include <hip_test_kernels.hh>
#define LEN 512
#include "graph_tests_common.hh"
/**
* Scenario 1
@@ -60,13 +65,10 @@ TEST_CASE("Unit_hipGraphAddEventWaitNode_Functional_Simple") {
HIP_CHECK(hipEventCreate(&event));
hipGraphNode_t event_rec_node, event_wait_node;
// Create an event record node in graph
HIP_CHECK(hipGraphAddEventRecordNode(&event_rec_node, graph, nullptr, 0,
event));
HIP_CHECK(hipGraphAddEventRecordNode(&event_rec_node, graph, nullptr, 0, event));
// Create an event wait node in graph
HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph, nullptr, 0,
event));
HIP_CHECK(hipGraphAddDependencies(graph, &event_rec_node,
&event_wait_node, 1));
HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph, nullptr, 0, event));
HIP_CHECK(hipGraphAddDependencies(graph, &event_rec_node, &event_wait_node, 1));
// Instantiate and launch the graph
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
@@ -80,13 +82,14 @@ TEST_CASE("Unit_hipGraphAddEventWaitNode_Functional_Simple") {
/**
* Local Function
*/
static void validate_hipGraphAddEventWaitNode_internodedep(int test,
int nstep, unsigned flag = hipEventDefault) {
size_t memsize = LEN * sizeof(int);
static void validate_hipGraphAddEventWaitNode_internodedep(int test, int nstep,
unsigned flag = hipEventDefault) {
constexpr size_t N = 1024;
size_t memsize = N * sizeof(int);
constexpr auto blocksPerCU = 6; // to hide latency
constexpr auto threadsPerBlock = 256;
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, LEN);
size_t NElem{LEN};
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
size_t NElem{N};
hipGraph_t graph1, graph2;
hipStream_t streamForGraph1, streamForGraph2;
hipGraphExec_t graphExec1, graphExec2;
@@ -114,68 +117,57 @@ static void validate_hipGraphAddEventWaitNode_internodedep(int test,
HIP_CHECK(hipMalloc(&out_d_g1, memsize));
HIP_CHECK(hipMalloc(&out_d_g2, memsize));
// Initialize host buffer
for (uint32_t i = 0; i < LEN; i++) {
for (uint32_t i = 0; i < N; i++) {
inp_h[i] = i;
out_h_g1[i] = 0;
out_h_g2[i] = 0;
}
// Graph1 creation ...........
// Create event1 record node in graph1
HIP_CHECK(hipGraphAddEventRecordNode(&event_rec_node, graph1, nullptr, 0,
event1));
HIP_CHECK(hipGraphAddEventRecordNode(&event_rec_node, graph1, nullptr, 0, event1));
// Create memcpy and kernel nodes for graph1
hipGraphNode_t memcpyH2D, memcpyD2H_1, kernelnode_1;
hipKernelNodeParams kernelNodeParams1{};
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph1, nullptr, 0, inp_d,
inp_h, memsize, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_1, graph1, nullptr, 0,
out_h_g1, out_d_g1, memsize, hipMemcpyDeviceToHost));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph1, nullptr, 0, inp_d, inp_h, memsize,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_1, graph1, nullptr, 0, out_h_g1, out_d_g1, memsize,
hipMemcpyDeviceToHost));
void* kernelArgs1[] = {&inp_d, &out_d_g1, reinterpret_cast<void *>(&NElem)};
kernelNodeParams1.func =
reinterpret_cast<void *>(HipTest::vector_square<int>);
void* kernelArgs1[] = {&inp_d, &out_d_g1, reinterpret_cast<void*>(&NElem)};
kernelNodeParams1.func = reinterpret_cast<void*>(HipTest::vector_square<int>);
kernelNodeParams1.gridDim = dim3(blocks);
kernelNodeParams1.blockDim = dim3(threadsPerBlock);
kernelNodeParams1.sharedMemBytes = 0;
kernelNodeParams1.kernelParams = reinterpret_cast<void**>(kernelArgs1);
kernelNodeParams1.extra = nullptr;
HIP_CHECK(hipGraphAddKernelNode(&kernelnode_1, graph1, nullptr, 0,
&kernelNodeParams1));
HIP_CHECK(hipGraphAddKernelNode(&kernelnode_1, graph1, nullptr, 0, &kernelNodeParams1));
// Create dependencies for graph1
HIP_CHECK(hipGraphAddDependencies(graph1, &memcpyH2D,
&event_rec_node, 1));
HIP_CHECK(hipGraphAddDependencies(graph1, &event_rec_node,
&kernelnode_1, 1));
HIP_CHECK(hipGraphAddDependencies(graph1, &kernelnode_1,
&memcpyD2H_1, 1));
HIP_CHECK(hipGraphAddDependencies(graph1, &memcpyH2D, &event_rec_node, 1));
HIP_CHECK(hipGraphAddDependencies(graph1, &event_rec_node, &kernelnode_1, 1));
HIP_CHECK(hipGraphAddDependencies(graph1, &kernelnode_1, &memcpyD2H_1, 1));
// Graph2 creation ...........
// Create event1 wait node in graph2
HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph2, nullptr, 0,
event1));
HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph2, nullptr, 0, event1));
// Create memcpy and kernel nodes for graph2
hipGraphNode_t memcpyD2H_2, kernelnode_2;
hipKernelNodeParams kernelNodeParams2{};
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_2, graph2, nullptr, 0,
out_h_g2, out_d_g2, memsize, hipMemcpyDeviceToHost));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_2, graph2, nullptr, 0, out_h_g2, out_d_g2, memsize,
hipMemcpyDeviceToHost));
void* kernelArgs2[] = {&inp_d, &out_d_g2, reinterpret_cast<void *>(&NElem)};
kernelNodeParams2.func =
reinterpret_cast<void *>(HipTest::vector_cubic<int>);
void* kernelArgs2[] = {&inp_d, &out_d_g2, reinterpret_cast<void*>(&NElem)};
kernelNodeParams2.func = reinterpret_cast<void*>(HipTest::vector_cubic<int>);
kernelNodeParams2.gridDim = dim3(blocks);
kernelNodeParams2.blockDim = dim3(threadsPerBlock);
kernelNodeParams2.sharedMemBytes = 0;
kernelNodeParams2.kernelParams = reinterpret_cast<void**>(kernelArgs2);
kernelNodeParams2.extra = nullptr;
HIP_CHECK(hipGraphAddKernelNode(&kernelnode_2, graph2, nullptr, 0,
&kernelNodeParams2));
HIP_CHECK(hipGraphAddKernelNode(&kernelnode_2, graph2, nullptr, 0, &kernelNodeParams2));
// Create dependencies for graph2
HIP_CHECK(hipGraphAddDependencies(graph2, &event_wait_node,
&kernelnode_2, 1));
HIP_CHECK(hipGraphAddDependencies(graph2, &kernelnode_2,
&memcpyD2H_2, 1));
HIP_CHECK(hipGraphAddDependencies(graph2, &event_wait_node, &kernelnode_2, 1));
HIP_CHECK(hipGraphAddDependencies(graph2, &kernelnode_2, &memcpyD2H_2, 1));
// Instantiate and launch the graphs
HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0));
@@ -187,16 +179,16 @@ static void validate_hipGraphAddEventWaitNode_internodedep(int test,
HIP_CHECK(hipStreamSynchronize(streamForGraph2));
// Validate output
bool btestPassed1 = true;
for (uint32_t i = 0; i < LEN; i++) {
if (out_h_g1[i] != (inp_h[i]*inp_h[i])) {
for (uint32_t i = 0; i < N; i++) {
if (out_h_g1[i] != (inp_h[i] * inp_h[i])) {
btestPassed1 = false;
break;
}
}
REQUIRE(btestPassed1 == true);
bool btestPassed2 = true;
for (uint32_t i = 0; i < LEN; i++) {
if (out_h_g2[i] != (inp_h[i]*inp_h[i]*inp_h[i])) {
for (uint32_t i = 0; i < N; i++) {
if (out_h_g2[i] != (inp_h[i] * inp_h[i] * inp_h[i])) {
btestPassed2 = false;
break;
}
@@ -247,55 +239,81 @@ TEST_CASE("Unit_hipGraphAddEventWaitNode_MultGraphOneStrmDependency") {
*/
TEST_CASE("Unit_hipGraphAddEventWaitNode_differentFlags") {
SECTION("flag = hipEventBlockingSync") {
validate_hipGraphAddEventWaitNode_internodedep(0, 1,
hipEventBlockingSync);
validate_hipGraphAddEventWaitNode_internodedep(0, 1, hipEventBlockingSync);
}
SECTION("graph = hipEventDisableTiming") {
validate_hipGraphAddEventWaitNode_internodedep(0, 1,
hipEventDisableTiming);
validate_hipGraphAddEventWaitNode_internodedep(0, 1, hipEventDisableTiming);
}
}
/**
* Scenario 6
* Scenario 6: Positive parameter tests
*/
TEST_CASE("Unit_hipGraphAddEventWaitNode_Negative") {
TEST_CASE("Unit_hipGraphAddEventWaitNode_Positive_Parameters") {
hipGraph_t graph;
HIP_CHECK(hipGraphCreate(&graph, 0));
hipEvent_t event;
HIP_CHECK(hipEventCreate(&event));
hipGraphNode_t eventwait;
SECTION("pGraphNode = nullptr") {
REQUIRE(hipErrorInvalidValue == hipGraphAddEventWaitNode(nullptr,
graph, nullptr, 0, event));
hipGraphNode_t dep_node = nullptr;
hipGraphNode_t dep_node2 = nullptr;
HIP_CHECK(hipGraphAddEmptyNode(&dep_node, graph, nullptr, 0));
HIP_CHECK(hipGraphAddEmptyNode(&dep_node2, graph, nullptr, 0));
hipGraphNode_t dep_nodes[] = {dep_node, dep_node2};
size_t numDeps = 0;
SECTION("numDependencies is zero, dependencies is not nullptr") {
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, dep_nodes, 0, event));
HIP_CHECK(hipGraphNodeGetDependencies(eventwait, nullptr, &numDeps));
REQUIRE(numDeps == 0);
}
SECTION("graph = nullptr") {
REQUIRE(hipErrorInvalidValue == hipGraphAddEventWaitNode(&eventwait,
nullptr, nullptr, 0, event));
SECTION("numDependencies < dependencies length") {
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, dep_nodes, 1, event));
HIP_CHECK(hipGraphNodeGetDependencies(eventwait, nullptr, &numDeps));
REQUIRE(numDeps == 1);
}
SECTION("pDependencies = nullptr") {
REQUIRE(hipErrorInvalidValue == hipGraphAddEventWaitNode(&eventwait,
graph, nullptr, 1, event));
}
SECTION("event = nullptr") {
REQUIRE(hipErrorInvalidValue == hipGraphAddEventWaitNode(&eventwait,
graph, nullptr, 0, nullptr));
}
SECTION("graph is uninitialized") {
hipGraph_t graph_uninit{};
REQUIRE(hipErrorInvalidValue == hipGraphAddEventWaitNode(&eventwait,
graph_uninit, nullptr, 0, event));
}
SECTION("event is uninitialized") {
hipEvent_t event_uninit{};
REQUIRE(hipErrorInvalidValue == hipGraphAddEventWaitNode(&eventwait,
graph, nullptr, 0, event_uninit));
SECTION("numDependencies == dependencies length") {
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, dep_nodes, 2, event));
HIP_CHECK(hipGraphNodeGetDependencies(eventwait, nullptr, &numDeps));
REQUIRE(numDeps == 2);
}
HIP_CHECK(hipGraphDestroy(graph));
HIP_CHECK(hipEventDestroy(event));
}
/**
* Scenario 7
*/
TEST_CASE("Unit_hipGraphAddEventWaitNode_Negative") {
using namespace std::placeholders;
hipGraph_t graph;
HIP_CHECK(hipGraphCreate(&graph, 0));
hipEvent_t event;
HIP_CHECK(hipEventCreate(&event));
hipGraphNode_t eventwait;
GraphAddNodeCommonNegativeTests(std::bind(hipGraphAddEventWaitNode, _1, _2, _3, _4, event),
graph);
SECTION("event = nullptr") {
HIP_CHECK_ERROR(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, nullptr),
hipErrorInvalidValue);
}
SECTION("graph is uninitialized") {
hipGraph_t graph_uninit{};
HIP_CHECK_ERROR(hipGraphAddEventWaitNode(&eventwait, graph_uninit, nullptr, 0, event),
hipErrorInvalidValue);
}
SECTION("event is uninitialized") {
hipEvent_t event_uninit{};
HIP_CHECK_ERROR(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event_uninit),
hipErrorInvalidValue);
}
HIP_CHECK(hipGraphDestroy(graph));
+25 -22
파일 보기
@@ -26,11 +26,12 @@ with the event set in hipGraphAddEventRecordNode.
- Output event is a nullptr.
- Input node is an empty node.
- Input node is a memset node.
- Input node is an event wait node.
- Input node is an uninitialized node.
*/
#include <hip_test_common.hh>
#include <hip_test_checkers.hh>
#include <hip_test_common.hh>
#include <hip_test_kernels.hh>
/**
@@ -42,8 +43,7 @@ static void validateEventRecordNodeGetEvent(unsigned flag) {
hipEvent_t event, event_out;
HIP_CHECK(hipEventCreateWithFlags(&event, flag));
hipGraphNode_t eventrec;
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0,
event));
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event));
HIP_CHECK(hipGraphEventRecordNodeGetEvent(eventrec, &event_out));
// validate set event and get event are same
REQUIRE(event == event_out);
@@ -77,31 +77,32 @@ TEST_CASE("Unit_hipGraphEventRecordNodeGetEvent_Functional") {
TEST_CASE("Unit_hipGraphEventRecordNodeGetEvent_Negative") {
hipGraph_t graph;
HIP_CHECK(hipGraphCreate(&graph, 0));
hipEvent_t event, event_out;
HIP_CHECK(hipEventCreate(&event));
hipGraphNode_t eventrec;
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0,
event));
hipEvent_t event_out;
hipEvent_t event1, event2;
HIP_CHECK(hipEventCreate(&event1));
HIP_CHECK(hipEventCreate(&event2));
hipGraphNode_t eventrec, eventwait;
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1));
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event2));
SECTION("node = nullptr") {
REQUIRE(hipErrorInvalidValue == hipGraphEventRecordNodeGetEvent(nullptr,
&event_out));
HIP_CHECK_ERROR(hipGraphEventRecordNodeGetEvent(nullptr, &event_out), hipErrorInvalidValue);
}
SECTION("event_out = nullptr") {
REQUIRE(hipErrorInvalidValue == hipGraphEventRecordNodeGetEvent(eventrec,
nullptr));
HIP_CHECK_ERROR(hipGraphEventRecordNodeGetEvent(eventrec, nullptr), hipErrorInvalidValue);
}
SECTION("input node is empty node") {
hipGraphNode_t EmptyGraphNode;
HIP_CHECK(hipGraphAddEmptyNode(&EmptyGraphNode, graph, nullptr, 0));
REQUIRE(hipErrorInvalidValue ==
hipGraphEventRecordNodeGetEvent(EmptyGraphNode, &event_out));
HIP_CHECK_ERROR(hipGraphEventRecordNodeGetEvent(EmptyGraphNode, &event_out),
hipErrorInvalidValue);
}
SECTION("input node is memset node") {
constexpr size_t Nbytes = 1024;
char *A_d;
char* A_d;
hipGraphNode_t memset_A;
hipMemsetParams memsetParams{};
HIP_CHECK(hipMalloc(&A_d, Nbytes));
@@ -112,19 +113,21 @@ TEST_CASE("Unit_hipGraphEventRecordNodeGetEvent_Negative") {
memsetParams.elementSize = sizeof(char);
memsetParams.width = Nbytes;
memsetParams.height = 1;
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0,
&memsetParams));
REQUIRE(hipErrorInvalidValue ==
hipGraphEventRecordNodeGetEvent(memset_A, &event_out));
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, &memsetParams));
HIP_CHECK_ERROR(hipGraphEventRecordNodeGetEvent(memset_A, &event_out), hipErrorInvalidValue);
HIP_CHECK(hipFree(A_d));
}
SECTION("input node is event wait node") {
HIP_CHECK_ERROR(hipGraphEventRecordNodeGetEvent(eventwait, &event_out), hipErrorInvalidValue);
}
SECTION("input node is uninitialized node") {
hipGraphNode_t node_unit{};
REQUIRE(hipErrorInvalidValue ==
hipGraphEventRecordNodeGetEvent(node_unit, &event_out));
HIP_CHECK_ERROR(hipGraphEventRecordNodeGetEvent(node_unit, &event_out), hipErrorInvalidValue);
}
HIP_CHECK(hipGraphDestroy(graph));
HIP_CHECK(hipEventDestroy(event));
HIP_CHECK(hipEventDestroy(event1));
HIP_CHECK(hipEventDestroy(event2));
}
+21 -35
파일 보기
@@ -30,14 +30,16 @@ Testcase Scenarios :
- Input event parameter is nullptr.
- Empty node is passed as input node.
- Memset node is passed as input node.
- Event wait node is passed as input node.
- Input node is an uninitialized node.
- Input event is an uninitialized event.
*/
#include <hip_test_common.hh>
#include <hip_test_checkers.hh>
#include <hip_test_common.hh>
#include <hip_test_kernels.hh>
/**
* Local Function: Set Get test
*/
@@ -49,8 +51,7 @@ static void validateEventRecordNodeSetEvent(unsigned flag) {
HIP_CHECK(hipEventCreate(&event1));
HIP_CHECK(hipEventCreateWithFlags(&event2, flag));
hipGraphNode_t eventrec;
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0,
event1));
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1));
// Set a different event
HIP_CHECK(hipGraphEventRecordNodeSetEvent(eventrec, event2));
HIP_CHECK(hipGraphEventRecordNodeGetEvent(eventrec, &event_out));
@@ -73,11 +74,9 @@ static void setEventWaitNode() {
HIP_CHECK(hipEventCreate(&event1));
HIP_CHECK(hipEventCreate(&event2));
hipGraphNode_t eventwait;
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0,
event1));
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event1));
// Try to set a different event on the event wait node using hipGraphEventRecordNodeSetEvent
// (expected to be rejected with hipErrorInvalidValue)
REQUIRE(hipErrorInvalidValue ==
hipGraphEventRecordNodeSetEvent(eventwait, event2));
HIP_CHECK_ERROR(hipGraphEventRecordNodeSetEvent(eventwait, event2), hipErrorInvalidValue);
// Free resources
HIP_CHECK(hipGraphDestroy(graph));
HIP_CHECK(hipEventDestroy(event1));
@@ -98,13 +97,11 @@ TEST_CASE("Unit_hipGraphEventRecordNodeSetEvent_SetEventProperty") {
HIP_CHECK(hipEventCreateWithFlags(&event2_end, hipEventDisableTiming));
// Create nodes
hipGraphNode_t event_start_rec, event_end_rec;
HIP_CHECK(hipGraphAddEventRecordNode(&event_start_rec, graph, nullptr, 0,
event1_start));
HIP_CHECK(hipGraphAddEventRecordNode(&event_end_rec, graph, nullptr, 0,
event1_end));
HIP_CHECK(hipGraphAddEventRecordNode(&event_start_rec, graph, nullptr, 0, event1_start));
HIP_CHECK(hipGraphAddEventRecordNode(&event_end_rec, graph, nullptr, 0, event1_end));
// Create memset node
constexpr size_t Nbytes = 1024;
char *A_d;
char* A_d;
hipGraphNode_t memset_A;
hipMemsetParams memsetParams{};
HIP_CHECK(hipMalloc(&A_d, Nbytes));
@@ -115,8 +112,7 @@ TEST_CASE("Unit_hipGraphEventRecordNodeSetEvent_SetEventProperty") {
memsetParams.elementSize = sizeof(char);
memsetParams.width = Nbytes;
memsetParams.height = 1;
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0,
&memsetParams));
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, &memsetParams));
// Create dependencies
// event_start_rec --> memset_A --> event_end_rec
HIP_CHECK(hipGraphAddDependencies(graph, &event_start_rec, &memset_A, 1));
@@ -132,8 +128,7 @@ TEST_CASE("Unit_hipGraphEventRecordNodeSetEvent_SetEventProperty") {
// Validate by measuring time difference between event_end_rec &
// event_start_rec
float t = 0.0f;
REQUIRE(hipSuccess == hipEventElapsedTime(&t, event1_start,
event1_end));
REQUIRE(hipSuccess == hipEventElapsedTime(&t, event1_start, event1_end));
REQUIRE(t > 0.0f);
// Change the event property after instantiation
HIP_CHECK(hipGraphEventRecordNodeSetEvent(event_start_rec, event2_start));
@@ -145,8 +140,7 @@ TEST_CASE("Unit_hipGraphEventRecordNodeSetEvent_SetEventProperty") {
// hipErrorInvalidHandle when events are created using
// hipEventDisableTiming flag.
t = 0.0f;
REQUIRE(hipErrorInvalidHandle ==
hipEventElapsedTime(&t, event2_start, event2_end));
HIP_CHECK_ERROR(hipEventElapsedTime(&t, event2_start, event2_end), hipErrorInvalidHandle);
// Free resources
HIP_CHECK(hipGraphExecDestroy(graphExec));
HIP_CHECK(hipStreamDestroy(streamForGraph));
@@ -185,28 +179,24 @@ TEST_CASE("Unit_hipGraphEventRecordNodeSetEvent_Negative") {
HIP_CHECK(hipEventCreate(&event1));
HIP_CHECK(hipEventCreate(&event2));
hipGraphNode_t eventrec;
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0,
event1));
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1));
SECTION("node = nullptr") {
REQUIRE(hipErrorInvalidValue == hipGraphEventRecordNodeSetEvent(nullptr,
event2));
HIP_CHECK_ERROR(hipGraphEventRecordNodeSetEvent(nullptr, event2), hipErrorInvalidValue);
}
SECTION("event_out = nullptr") {
REQUIRE(hipErrorInvalidValue == hipGraphEventRecordNodeSetEvent(eventrec,
nullptr));
HIP_CHECK_ERROR(hipGraphEventRecordNodeSetEvent(eventrec, nullptr), hipErrorInvalidValue);
}
SECTION("input node is empty node") {
hipGraphNode_t EmptyGraphNode;
HIP_CHECK(hipGraphAddEmptyNode(&EmptyGraphNode, graph, nullptr, 0));
REQUIRE(hipErrorInvalidValue ==
hipGraphEventRecordNodeSetEvent(EmptyGraphNode, event2));
HIP_CHECK_ERROR(hipGraphEventRecordNodeSetEvent(EmptyGraphNode, event2), hipErrorInvalidValue);
}
SECTION("input node is memset node") {
constexpr size_t Nbytes = 1024;
char *A_d;
char* A_d;
hipGraphNode_t memset_A;
hipMemsetParams memsetParams{};
HIP_CHECK(hipMalloc(&A_d, Nbytes));
@@ -217,10 +207,8 @@ TEST_CASE("Unit_hipGraphEventRecordNodeSetEvent_Negative") {
memsetParams.elementSize = sizeof(char);
memsetParams.width = Nbytes;
memsetParams.height = 1;
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0,
&memsetParams));
REQUIRE(hipErrorInvalidValue ==
hipGraphEventRecordNodeSetEvent(memset_A, event2));
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, &memsetParams));
HIP_CHECK_ERROR(hipGraphEventRecordNodeSetEvent(memset_A, event2), hipErrorInvalidValue);
HIP_CHECK(hipFree(A_d));
}
@@ -230,14 +218,12 @@ TEST_CASE("Unit_hipGraphEventRecordNodeSetEvent_Negative") {
SECTION("input node is uninitialized node") {
hipGraphNode_t node_uninit{};
REQUIRE(hipErrorInvalidValue ==
hipGraphEventRecordNodeSetEvent(node_uninit, event2));
HIP_CHECK_ERROR(hipGraphEventRecordNodeSetEvent(node_uninit, event2), hipErrorInvalidValue);
}
SECTION("input event is uninitialized") {
hipEvent_t event_uninit{};
REQUIRE(hipErrorInvalidValue ==
hipGraphEventRecordNodeSetEvent(eventrec, event_uninit));
HIP_CHECK_ERROR(hipGraphEventRecordNodeSetEvent(eventrec, event_uninit), hipErrorInvalidValue);
}
HIP_CHECK(hipGraphDestroy(graph));
+26 -22
파일 보기
@@ -26,13 +26,15 @@ with the event set in hipGraphAddEventWaitNode.
- Output event parameter is passed as nullptr.
- Input node parameter is an empty node.
- Input node parameter is a memset node.
- Input node parameter is an event record node.
- Input node parameter is an uninitialized node.
*/
#include <hip_test_common.hh>
#include <hip_test_checkers.hh>
#include <hip_test_common.hh>
#include <hip_test_kernels.hh>
/**
* Local Function
*/
@@ -42,8 +44,7 @@ static void validateEventWaitNodeGetEvent(unsigned flag) {
hipEvent_t event, event_out;
HIP_CHECK(hipEventCreateWithFlags(&event, flag));
hipGraphNode_t eventwait;
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0,
event));
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event));
HIP_CHECK(hipGraphEventWaitNodeGetEvent(eventwait, &event_out));
// validate set event and get event are same
REQUIRE(event == event_out);
@@ -77,31 +78,32 @@ TEST_CASE("Unit_hipGraphEventWaitNodeGetEvent_Functional") {
TEST_CASE("Unit_hipGraphEventWaitNodeGetEvent_Negative") {
hipGraph_t graph;
HIP_CHECK(hipGraphCreate(&graph, 0));
hipEvent_t event, event_out;
HIP_CHECK(hipEventCreate(&event));
hipGraphNode_t eventwait;
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0,
event));
hipEvent_t event_out;
hipEvent_t event1, event2;
HIP_CHECK(hipEventCreate(&event1));
HIP_CHECK(hipEventCreate(&event2));
hipGraphNode_t eventrec, eventwait;
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1));
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event2));
SECTION("node = nullptr") {
REQUIRE(hipErrorInvalidValue == hipGraphEventWaitNodeGetEvent(nullptr,
&event_out));
HIP_CHECK_ERROR(hipGraphEventWaitNodeGetEvent(nullptr, &event_out), hipErrorInvalidValue);
}
SECTION("event_out = nullptr") {
REQUIRE(hipErrorInvalidValue == hipGraphEventWaitNodeGetEvent(eventwait,
nullptr));
HIP_CHECK_ERROR(hipGraphEventWaitNodeGetEvent(eventwait, nullptr), hipErrorInvalidValue);
}
SECTION("input node is empty node") {
hipGraphNode_t EmptyGraphNode;
HIP_CHECK(hipGraphAddEmptyNode(&EmptyGraphNode, graph, nullptr, 0));
REQUIRE(hipErrorInvalidValue ==
hipGraphEventWaitNodeGetEvent(EmptyGraphNode, &event_out));
HIP_CHECK_ERROR(hipGraphEventWaitNodeGetEvent(EmptyGraphNode, &event_out),
hipErrorInvalidValue);
}
SECTION("input node is memset node") {
constexpr size_t Nbytes = 1024;
char *A_d;
char* A_d;
hipGraphNode_t memset_A;
hipMemsetParams memsetParams{};
HIP_CHECK(hipMalloc(&A_d, Nbytes));
@@ -112,19 +114,21 @@ TEST_CASE("Unit_hipGraphEventWaitNodeGetEvent_Negative") {
memsetParams.elementSize = sizeof(char);
memsetParams.width = Nbytes;
memsetParams.height = 1;
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0,
&memsetParams));
REQUIRE(hipErrorInvalidValue ==
hipGraphEventWaitNodeGetEvent(memset_A, &event_out));
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, &memsetParams));
HIP_CHECK_ERROR(hipGraphEventWaitNodeGetEvent(memset_A, &event_out), hipErrorInvalidValue);
HIP_CHECK(hipFree(A_d));
}
SECTION("input node is event record node") {
HIP_CHECK_ERROR(hipGraphEventWaitNodeGetEvent(eventrec, &event_out), hipErrorInvalidValue);
}
SECTION("input node is uninitialized") {
hipGraphNode_t node_uninit{};
REQUIRE(hipErrorInvalidValue ==
hipGraphEventWaitNodeGetEvent(node_uninit, &event_out));
HIP_CHECK_ERROR(hipGraphEventWaitNodeGetEvent(node_uninit, &event_out), hipErrorInvalidValue);
}
HIP_CHECK(hipGraphDestroy(graph));
HIP_CHECK(hipEventDestroy(event));
HIP_CHECK(hipEventDestroy(event1));
HIP_CHECK(hipEventDestroy(event2));
}
+41 -63
파일 보기
@@ -37,11 +37,10 @@ Testcase Scenarios :
- Input event is an uninitialized event.
*/
#include <hip_test_common.hh>
#include <hip_test_checkers.hh>
#include <hip_test_common.hh>
#include <hip_test_kernels.hh>
#define LEN 512
/**
* Local Function
@@ -54,8 +53,7 @@ static void validateEventWaitNodeSetEvent(unsigned flag) {
HIP_CHECK(hipEventCreate(&event1));
HIP_CHECK(hipEventCreateWithFlags(&event2, flag));
hipGraphNode_t eventwait;
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0,
event1));
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event1));
// Set a different event
HIP_CHECK(hipGraphEventWaitNodeSetEvent(eventwait, event2));
HIP_CHECK(hipGraphEventWaitNodeGetEvent(eventwait, &event_out));
@@ -78,11 +76,9 @@ static void setEventRecordNode() {
HIP_CHECK(hipEventCreate(&event1));
HIP_CHECK(hipEventCreate(&event2));
hipGraphNode_t eventrec;
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0,
event1));
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1));
// Try to set a different event on the event record node using hipGraphEventWaitNodeSetEvent
// (expected to be rejected with hipErrorInvalidValue)
REQUIRE(hipErrorInvalidValue ==
hipGraphEventWaitNodeSetEvent(eventrec, event2));
HIP_CHECK_ERROR(hipGraphEventWaitNodeSetEvent(eventrec, event2), hipErrorInvalidValue);
// Free resources
HIP_CHECK(hipGraphDestroy(graph));
HIP_CHECK(hipEventDestroy(event1));
@@ -93,11 +89,12 @@ static void setEventRecordNode() {
* Scenario 2
*/
TEST_CASE("Unit_hipGraphEventWaitNodeSetEvent_SetProp") {
size_t memsize = LEN * sizeof(int);
constexpr size_t N = 512;
size_t memsize = N * sizeof(int);
constexpr auto blocksPerCU = 6; // to hide latency
constexpr auto threadsPerBlock = 256;
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, LEN);
size_t NElem{LEN};
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
size_t NElem{N};
hipGraph_t graph1, graph2;
hipStream_t streamForGraph1, streamForGraph2;
hipGraphExec_t graphExec1, graphExec2;
@@ -123,67 +120,56 @@ TEST_CASE("Unit_hipGraphEventWaitNodeSetEvent_SetProp") {
HIP_CHECK(hipMalloc(&out_d_g1, memsize));
HIP_CHECK(hipMalloc(&out_d_g2, memsize));
// Initialize host buffer
for (uint32_t i = 0; i < LEN; i++) {
for (uint32_t i = 0; i < N; i++) {
inp_h[i] = i;
out_h_g1[i] = 0;
out_h_g2[i] = 0;
}
// Graph1 creation ...........
// Create event1 record node in graph1
HIP_CHECK(hipGraphAddEventRecordNode(&event_rec_node, graph1, nullptr, 0,
event1));
HIP_CHECK(hipGraphAddEventRecordNode(&event_rec_node, graph1, nullptr, 0, event1));
// Create memcpy and kernel nodes for graph1
hipGraphNode_t memcpyH2D, memcpyD2H_1, kernelnode_1;
hipKernelNodeParams kernelNodeParams1{};
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph1, nullptr, 0, inp_d,
inp_h, memsize, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_1, graph1, nullptr, 0,
out_h_g1, out_d_g1, memsize, hipMemcpyDeviceToHost));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph1, nullptr, 0, inp_d, inp_h, memsize,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_1, graph1, nullptr, 0, out_h_g1, out_d_g1, memsize,
hipMemcpyDeviceToHost));
void* kernelArgs1[] = {&inp_d, &out_d_g1, reinterpret_cast<void *>(&NElem)};
kernelNodeParams1.func =
reinterpret_cast<void *>(HipTest::vector_square<int>);
void* kernelArgs1[] = {&inp_d, &out_d_g1, reinterpret_cast<void*>(&NElem)};
kernelNodeParams1.func = reinterpret_cast<void*>(HipTest::vector_square<int>);
kernelNodeParams1.gridDim = dim3(blocks);
kernelNodeParams1.blockDim = dim3(threadsPerBlock);
kernelNodeParams1.sharedMemBytes = 0;
kernelNodeParams1.kernelParams = reinterpret_cast<void**>(kernelArgs1);
kernelNodeParams1.extra = nullptr;
HIP_CHECK(hipGraphAddKernelNode(&kernelnode_1, graph1, nullptr, 0,
&kernelNodeParams1));
HIP_CHECK(hipGraphAddKernelNode(&kernelnode_1, graph1, nullptr, 0, &kernelNodeParams1));
// Create dependencies for graph1
HIP_CHECK(hipGraphAddDependencies(graph1, &memcpyH2D,
&event_rec_node, 1));
HIP_CHECK(hipGraphAddDependencies(graph1, &event_rec_node,
&kernelnode_1, 1));
HIP_CHECK(hipGraphAddDependencies(graph1, &kernelnode_1,
&memcpyD2H_1, 1));
HIP_CHECK(hipGraphAddDependencies(graph1, &memcpyH2D, &event_rec_node, 1));
HIP_CHECK(hipGraphAddDependencies(graph1, &event_rec_node, &kernelnode_1, 1));
HIP_CHECK(hipGraphAddDependencies(graph1, &kernelnode_1, &memcpyD2H_1, 1));
// Graph2 creation ...........
// Create event1 wait node in graph2
HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph2, nullptr, 0,
event1));
HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph2, nullptr, 0, event1));
// Create memcpy and kernel nodes for graph2
hipGraphNode_t memcpyD2H_2, kernelnode_2;
hipKernelNodeParams kernelNodeParams2{};
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_2, graph2, nullptr, 0,
out_h_g2, out_d_g2, memsize, hipMemcpyDeviceToHost));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_2, graph2, nullptr, 0, out_h_g2, out_d_g2, memsize,
hipMemcpyDeviceToHost));
void* kernelArgs2[] = {&inp_d, &out_d_g2, reinterpret_cast<void *>(&NElem)};
kernelNodeParams2.func =
reinterpret_cast<void *>(HipTest::vector_cubic<int>);
void* kernelArgs2[] = {&inp_d, &out_d_g2, reinterpret_cast<void*>(&NElem)};
kernelNodeParams2.func = reinterpret_cast<void*>(HipTest::vector_cubic<int>);
kernelNodeParams2.gridDim = dim3(blocks);
kernelNodeParams2.blockDim = dim3(threadsPerBlock);
kernelNodeParams2.sharedMemBytes = 0;
kernelNodeParams2.kernelParams = reinterpret_cast<void**>(kernelArgs2);
kernelNodeParams2.extra = nullptr;
HIP_CHECK(hipGraphAddKernelNode(&kernelnode_2, graph2, nullptr, 0,
&kernelNodeParams2));
HIP_CHECK(hipGraphAddKernelNode(&kernelnode_2, graph2, nullptr, 0, &kernelNodeParams2));
// Create dependencies for graph2
HIP_CHECK(hipGraphAddDependencies(graph2, &event_wait_node,
&kernelnode_2, 1));
HIP_CHECK(hipGraphAddDependencies(graph2, &kernelnode_2,
&memcpyD2H_2, 1));
HIP_CHECK(hipGraphAddDependencies(graph2, &event_wait_node, &kernelnode_2, 1));
HIP_CHECK(hipGraphAddDependencies(graph2, &kernelnode_2, &memcpyD2H_2, 1));
// Instantiate and launch the graphs
HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0));
@@ -198,16 +184,16 @@ TEST_CASE("Unit_hipGraphEventWaitNodeSetEvent_SetProp") {
HIP_CHECK(hipStreamSynchronize(streamForGraph2));
// Validate output
bool btestPassed1 = true;
for (uint32_t i = 0; i < LEN; i++) {
if (out_h_g1[i] != (inp_h[i]*inp_h[i])) {
for (uint32_t i = 0; i < N; i++) {
if (out_h_g1[i] != (inp_h[i] * inp_h[i])) {
btestPassed1 = false;
break;
}
}
REQUIRE(btestPassed1 == true);
bool btestPassed2 = true;
for (uint32_t i = 0; i < LEN; i++) {
if (out_h_g2[i] != (inp_h[i]*inp_h[i]*inp_h[i])) {
for (uint32_t i = 0; i < N; i++) {
if (out_h_g2[i] != (inp_h[i] * inp_h[i] * inp_h[i])) {
btestPassed2 = false;
break;
}
@@ -256,28 +242,24 @@ TEST_CASE("Unit_hipGraphEventWaitNodeSetEvent_Negative") {
HIP_CHECK(hipEventCreate(&event1));
HIP_CHECK(hipEventCreate(&event2));
hipGraphNode_t eventwait;
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0,
event1));
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event1));
SECTION("node = nullptr") {
REQUIRE(hipErrorInvalidValue == hipGraphEventWaitNodeSetEvent(
nullptr, event2));
HIP_CHECK_ERROR(hipGraphEventWaitNodeSetEvent(nullptr, event2), hipErrorInvalidValue);
}
SECTION("event = nullptr") {
REQUIRE(hipErrorInvalidValue == hipGraphEventWaitNodeSetEvent(
eventwait, nullptr));
HIP_CHECK_ERROR(hipGraphEventWaitNodeSetEvent(eventwait, nullptr), hipErrorInvalidValue);
}
SECTION("input node is empty node") {
hipGraphNode_t EmptyGraphNode;
HIP_CHECK(hipGraphAddEmptyNode(&EmptyGraphNode, graph, nullptr, 0));
REQUIRE(hipErrorInvalidValue ==
hipGraphEventWaitNodeSetEvent(EmptyGraphNode, event2));
HIP_CHECK_ERROR(hipGraphEventWaitNodeSetEvent(EmptyGraphNode, event2), hipErrorInvalidValue);
}
SECTION("input node is memset node") {
constexpr size_t Nbytes = 1024;
char *A_d;
char* A_d;
hipGraphNode_t memset_A;
hipMemsetParams memsetParams{};
HIP_CHECK(hipMalloc(&A_d, Nbytes));
@@ -288,10 +270,8 @@ TEST_CASE("Unit_hipGraphEventWaitNodeSetEvent_Negative") {
memsetParams.elementSize = sizeof(char);
memsetParams.width = Nbytes;
memsetParams.height = 1;
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0,
&memsetParams));
REQUIRE(hipErrorInvalidValue ==
hipGraphEventWaitNodeSetEvent(memset_A, event2));
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, &memsetParams));
HIP_CHECK_ERROR(hipGraphEventWaitNodeSetEvent(memset_A, event2), hipErrorInvalidValue);
HIP_CHECK(hipFree(A_d));
}
@@ -301,14 +281,12 @@ TEST_CASE("Unit_hipGraphEventWaitNodeSetEvent_Negative") {
SECTION("input node is uninitialized node") {
hipGraphNode_t node_uninit{};
REQUIRE(hipErrorInvalidValue ==
hipGraphEventWaitNodeSetEvent(node_uninit, event2));
HIP_CHECK_ERROR(hipGraphEventWaitNodeSetEvent(node_uninit, event2), hipErrorInvalidValue);
}
SECTION("input event is uninitialized") {
hipEvent_t event_uninit{};
REQUIRE(hipErrorInvalidValue == hipGraphEventWaitNodeSetEvent(
eventwait, event_uninit));
HIP_CHECK_ERROR(hipGraphEventWaitNodeSetEvent(eventwait, event_uninit), hipErrorInvalidValue);
}
HIP_CHECK(hipGraphDestroy(graph));
+38 -14
파일 보기
@@ -20,26 +20,51 @@ THE SOFTWARE.
#include <hip_test_common.hh>
/**
Negative Testcase Scenarios :
1) Pass hipGraphExecDestroy with nullptr.
2) Pass hipGraphExecDestroy with an uninitialized structure.
3) Destroy the graph before the executable graph is destroyed and verify that no crash occurs.
*/
* @addtogroup hipGraphExecDestroy hipGraphExecDestroy
* @{
* @ingroup GraphTest
* `hipGraphExecDestroy(hipGraphExec_t graphExec)` -
* Destroys an executable graph
*/
/**
* Test Description
* ------------------------
* - Test to verify API behavior with invalid arguments:
* -# GraphExec is nullptr
* -# GraphExec is uninitialized
* Test source
* ------------------------
* - unit/graph/hipGraphExecDestroy.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Unit_hipGraphExecDestroy_Negative_Parameters") {
TEST_CASE("Unit_hipGraphExecDestroy_Negative") {
hipError_t ret;
SECTION("Pass hipGraphExecDestroy with nullptr") {
ret = hipGraphExecDestroy(nullptr);
REQUIRE(hipErrorInvalidValue == ret);
HIP_CHECK_ERROR(hipGraphExecDestroy(nullptr), hipErrorInvalidValue);
}
SECTION("Pass hipGraphExecDestroy with un-initilze structure") {
hipGraphExec_t graphExec{};
ret = hipGraphExecDestroy(graphExec);
REQUIRE(hipErrorInvalidValue == ret);
hipGraphExec_t graph_exec{};
HIP_CHECK_ERROR(hipGraphExecDestroy(graph_exec), hipErrorInvalidValue);
}
}
TEST_CASE("Unit_hipGraphExecDestroy_Sequence") {
/**
* Test Description
* ------------------------
* - Basic positive test for hipGraphExecDestroy
* - create an executable graph and then destroy it
* Test source
* ------------------------
* - unit/graph/hipGraphExecDestroy.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Unit_hipGraphExecDestroy_Positive_Basic") {
hipGraph_t graph;
hipGraphExec_t graphExec;
hipStream_t streamForGraph;
@@ -70,4 +95,3 @@ TEST_CASE("Unit_hipGraphExecDestroy_Sequence") {
HIP_CHECK(hipGraphExecDestroy(graphExec));
HIP_CHECK(hipStreamDestroy(streamForGraph));
}
+101 -72
파일 보기
@@ -33,7 +33,12 @@ Testcase Scenarios :
the graph to create an executable graph. Change the event in the
executable graph to event2. Verify that the event record node still
contains event1.
3) Negative Scenarios
3) Scenario to verify that hipGraphExecEventRecordNodeSetEvent can set an event created on a
different device. Create an event record node with event1 and add it to the graph. Instantiate
the graph to create an executable graph. Call the API to change the event in the executable
graph to event2, which has been created on a different device. Verify that the graph can be
launched and no error is reported.
4) Negative Scenarios
- Input executable graph is a nullptr.
- Input node is a nullptr.
- Input event to set is a nullptr.
@@ -45,27 +50,26 @@ Testcase Scenarios :
- Input node is a event wait node.
*/
#include <hip_test_common.hh>
#include <hip_test_checkers.hh>
#include <hip_test_common.hh>
#include <hip_test_kernels.hh>
#define GRID_DIM 512
#define BLK_DIM 512
#define LEN (GRID_DIM * BLK_DIM)
/**
* Kernel Functions to copy.
*/
static __global__ void copy_ker_func(int* a, int* b) {
int tx = blockIdx.x*blockDim.x + threadIdx.x;
if (tx < LEN) b[tx] = a[tx];
static __global__ void copy_ker_func(int* a, int* b, size_t N) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
if (tx < N) b[tx] = a[tx];
}
/**
* Scenario 1: Functional scenario (See description Above)
*/
TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Functional") {
size_t memsize = LEN*sizeof(int);
constexpr size_t gridSize = 512;
constexpr size_t blockSize = 512;
constexpr size_t N = gridSize * blockSize;
size_t memsize = N * sizeof(int);
hipGraph_t graph;
HIP_CHECK(hipGraphCreate(&graph, 0));
// Create events
@@ -75,10 +79,8 @@ TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Functional") {
HIP_CHECK(hipEventCreate(&event2_end));
// Create nodes with event_start and event1_end
hipGraphNode_t event_start_rec, event_end_rec;
HIP_CHECK(hipGraphAddEventRecordNode(&event_start_rec, graph, nullptr, 0,
event_start));
HIP_CHECK(hipGraphAddEventRecordNode(&event_end_rec, graph, nullptr, 0,
event1_end));
HIP_CHECK(hipGraphAddEventRecordNode(&event_start_rec, graph, nullptr, 0, event_start));
HIP_CHECK(hipGraphAddEventRecordNode(&event_end_rec, graph, nullptr, 0, event1_end));
int *inp_h, *inp_d, *out_h, *out_d;
// Allocate host buffers
inp_h = reinterpret_cast<int*>(malloc(memsize));
@@ -89,7 +91,7 @@ TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Functional") {
HIP_CHECK(hipMalloc(&inp_d, memsize));
HIP_CHECK(hipMalloc(&out_d, memsize));
// Initialize host buffer
for (uint32_t i = 0; i < LEN; i++) {
for (uint32_t i = 0; i < N; i++) {
inp_h[i] = i;
out_h[i] = 0;
}
@@ -97,44 +99,39 @@ TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Functional") {
// Create memcpy and kernel nodes for graph
hipGraphNode_t memcpyH2D, memcpyD2H, kernelnode;
hipKernelNodeParams kernelNodeParams{};
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph, nullptr, 0, inp_d,
inp_h, memsize, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H, graph, nullptr, 0,
out_h, out_d, memsize, hipMemcpyDeviceToHost));
void* kernelArgs1[] = {&inp_d, &out_d};
kernelNodeParams.func = reinterpret_cast<void *>(copy_ker_func);
kernelNodeParams.gridDim = dim3(GRID_DIM);
kernelNodeParams.blockDim = dim3(BLK_DIM);
size_t NElem{N};
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph, nullptr, 0, inp_d, inp_h, memsize,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H, graph, nullptr, 0, out_h, out_d, memsize,
hipMemcpyDeviceToHost));
void* kernelArgs1[] = {&inp_d, &out_d, reinterpret_cast<void*>(&NElem)};
kernelNodeParams.func = reinterpret_cast<void*>(copy_ker_func);
kernelNodeParams.gridDim = dim3(gridSize);
kernelNodeParams.blockDim = dim3(blockSize);
kernelNodeParams.sharedMemBytes = 0;
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs1);
kernelNodeParams.extra = nullptr;
HIP_CHECK(hipGraphAddKernelNode(&kernelnode, graph, nullptr, 0,
&kernelNodeParams));
HIP_CHECK(hipGraphAddKernelNode(&kernelnode, graph, nullptr, 0, &kernelNodeParams));
// Create dependencies for graph
HIP_CHECK(hipGraphAddDependencies(graph, &event_start_rec,
&memcpyH2D, 1));
HIP_CHECK(hipGraphAddDependencies(graph, &memcpyH2D,
&kernelnode, 1));
HIP_CHECK(hipGraphAddDependencies(graph, &kernelnode,
&memcpyD2H, 1));
HIP_CHECK(hipGraphAddDependencies(graph, &memcpyD2H,
&event_end_rec, 1));
HIP_CHECK(hipGraphAddDependencies(graph, &event_start_rec, &memcpyH2D, 1));
HIP_CHECK(hipGraphAddDependencies(graph, &memcpyH2D, &kernelnode, 1));
HIP_CHECK(hipGraphAddDependencies(graph, &kernelnode, &memcpyD2H, 1));
HIP_CHECK(hipGraphAddDependencies(graph, &memcpyD2H, &event_end_rec, 1));
// Instantiate and launch the graph
hipStream_t streamForGraph;
hipGraphExec_t graphExec;
HIP_CHECK(hipStreamCreate(&streamForGraph));
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
// Change the event at event_end_rec node to event2_end
HIP_CHECK(hipGraphExecEventRecordNodeSetEvent(graphExec,
event_end_rec, event2_end));
HIP_CHECK(hipGraphExecEventRecordNodeSetEvent(graphExec, event_end_rec, event2_end));
HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
// Wait for graph to complete
HIP_CHECK(hipStreamSynchronize(streamForGraph));
// Validate output
bool btestPassed = true;
for (uint32_t i = 0; i < LEN; i++) {
for (uint32_t i = 0; i < N; i++) {
if (out_h[i] != inp_h[i]) {
btestPassed = false;
break;
@@ -147,8 +144,7 @@ TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Functional") {
REQUIRE(t > 0.0f);
// Since event1_end is never recorded, hipEventElapsedTime
// should return error code.
REQUIRE(hipErrorInvalidResourceHandle ==
hipEventElapsedTime(&t, event_start, event1_end));
HIP_CHECK_ERROR(hipEventElapsedTime(&t, event_start, event1_end), hipErrorInvalidResourceHandle);
// Free resources
HIP_CHECK(hipGraphExecDestroy(graphExec));
HIP_CHECK(hipStreamDestroy(streamForGraph));
@@ -173,12 +169,10 @@ TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_VerifyEventNotChanged") {
HIP_CHECK(hipEventCreate(&event1));
HIP_CHECK(hipEventCreate(&event2));
hipGraphNode_t eventrec;
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0,
event1));
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1));
hipGraphExec_t graphExec;
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
HIP_CHECK(hipGraphExecEventRecordNodeSetEvent(graphExec,
eventrec, event2));
HIP_CHECK(hipGraphExecEventRecordNodeSetEvent(graphExec, eventrec, event2));
HIP_CHECK(hipGraphEventRecordNodeGetEvent(eventrec, &event_out));
// Validate that setting event2 on graphExec left the node in the original graph holding event1
REQUIRE(event1 == event_out);
@@ -190,7 +184,48 @@ TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_VerifyEventNotChanged") {
}
/**
* Scenario 3: Negative Tests
* Scenario 3: This test verifies event in node of the executable graph can be changed to event on
* different device
*/
TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Positive_DifferentDevices") {
  // Checks that an event record node of an instantiated graph accepts an event that was created
  // while a different device was current, and that the updated graph still launches and completes.
  // Skipped when fewer than two devices are available.
  const auto device_count = HipTest::getDeviceCount();
  if (device_count < 2) {
    HipTest::HIP_SKIP_TEST("Skipping because devices < 2");
    return;
  }

  hipGraphExec_t graphExec;
  hipStream_t streamForGraph;
  hipGraph_t graph;
  hipEvent_t event1, event2;

  // event1 is created with device 0 current, event2 with device 1 current.
  HIP_CHECK(hipSetDevice(0));
  HIP_CHECK(hipEventCreate(&event1));
  HIP_CHECK(hipSetDevice(1));
  HIP_CHECK(hipEventCreate(&event2));
  // Make device 0 current again before the graph and the stream are created.
  HIP_CHECK(hipSetDevice(0));

  // The graph consists of a single event record node that initially records event1.
  hipGraphNode_t eventrec;
  HIP_CHECK(hipGraphCreate(&graph, 0));
  HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1));

  // Verify event on different device can be set in graphExec
  // Instantiate and launch the graph
  HIP_CHECK(hipStreamCreate(&streamForGraph));
  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
  HIP_CHECK(hipGraphExecEventRecordNodeSetEvent(graphExec, eventrec, event2));
  HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));

  // Wait for graph to complete
  HIP_CHECK(hipStreamSynchronize(streamForGraph));

  // Free resources
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipStreamDestroy(streamForGraph));
  HIP_CHECK(hipGraphDestroy(graph));
  HIP_CHECK(hipEventDestroy(event2));
  // Terminated with ';' like every other HIP_CHECK call, so the statement does not depend on
  // the macro expanding to a braced block.
  HIP_CHECK(hipEventDestroy(event1));
}
/**
* Scenario 4: Negative Parameter Tests
*/
TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Negative") {
hipGraph_t graph;
@@ -199,11 +234,10 @@ TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Negative") {
HIP_CHECK(hipEventCreate(&event1));
HIP_CHECK(hipEventCreate(&event2));
hipGraphNode_t eventrec;
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0,
event1));
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1));
// Create memset
constexpr size_t Nbytes = 1024;
char *A_d;
char* A_d;
hipGraphNode_t memset_A;
hipMemsetParams memsetParams{};
HIP_CHECK(hipMalloc(&A_d, Nbytes));
@@ -219,66 +253,61 @@ TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Negative") {
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
SECTION("hGraphExec = nullptr") {
REQUIRE(hipErrorInvalidValue ==
hipGraphExecEventRecordNodeSetEvent(nullptr, eventrec, event2));
HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(nullptr, eventrec, event2),
hipErrorInvalidValue);
}
SECTION("hNode = nullptr") {
REQUIRE(hipErrorInvalidValue ==
hipGraphExecEventRecordNodeSetEvent(graphExec, nullptr, event2));
HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(graphExec, nullptr, event2),
hipErrorInvalidValue);
}
SECTION("event = nullptr") {
REQUIRE(hipErrorInvalidValue ==
hipGraphExecEventRecordNodeSetEvent(graphExec, eventrec, nullptr));
HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(graphExec, eventrec, nullptr),
hipErrorInvalidValue);
}
SECTION("hGraphExec is uninitialized") {
hipGraphExec_t graphExec1{};
REQUIRE(hipErrorInvalidValue ==
hipGraphExecEventRecordNodeSetEvent(graphExec1, eventrec, event2));
HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(graphExec1, eventrec, event2),
hipErrorInvalidValue);
}
SECTION("hNode is uninitialized") {
hipGraphNode_t dummy{};
REQUIRE(hipErrorInvalidValue ==
hipGraphExecEventRecordNodeSetEvent(graphExec, dummy, event2));
HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(graphExec, dummy, event2),
hipErrorInvalidValue);
}
SECTION("event is uninitialized") {
hipEvent_t event_dummy{};
REQUIRE(hipErrorInvalidValue ==
hipGraphExecEventRecordNodeSetEvent(graphExec, eventrec,
event_dummy));
HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(graphExec, eventrec, event_dummy),
hipErrorInvalidValue);
}
SECTION("event record node does not exist") {
hipGraph_t graph1;
HIP_CHECK(hipGraphCreate(&graph1, 0));
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph1, nullptr, 0,
&memsetParams));
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph1, nullptr, 0, &memsetParams));
hipGraphExec_t graphExec1;
HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0));
REQUIRE(hipErrorInvalidValue ==
hipGraphExecEventRecordNodeSetEvent(graphExec1, eventrec, event2));
HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(graphExec1, eventrec, event2),
hipErrorInvalidValue);
HIP_CHECK(hipGraphExecDestroy(graphExec1));
HIP_CHECK(hipGraphDestroy(graph1));
}
SECTION("pass memset node as hNode") {
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0,
&memsetParams));
REQUIRE(hipErrorInvalidValue ==
hipGraphExecEventRecordNodeSetEvent(graphExec, memset_A, event2));
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, &memsetParams));
HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(graphExec, memset_A, event2),
hipErrorInvalidValue);
}
SECTION("pass event wait node as hNode") {
hipGraphNode_t event_wait_node;
HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph, nullptr, 0,
event1));
REQUIRE(hipErrorInvalidValue ==
hipGraphExecEventRecordNodeSetEvent(graphExec, event_wait_node,
event2));
HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph, nullptr, 0, event1));
HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(graphExec, event_wait_node, event2),
hipErrorInvalidValue);
}
HIP_CHECK(hipFree(A_d));
+68 -80
파일 보기
@@ -47,33 +47,30 @@ Testcase Scenarios :
- Pass event record node as input node.
*/
#include <hip_test_common.hh>
#include <hip_test_checkers.hh>
#include <hip_test_common.hh>
#include <hip_test_kernels.hh>
#define GRID_DIM 64
#define BLK_DIM 256
#define LEN (GRID_DIM * BLK_DIM)
#define DELAY_IN_MS 2000
/**
* Kernel Functions to perform square and introduce delay in device.
*/
static __global__ void sqr_ker_func(int* a, int* b, int clockrate) {
int tx = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x;
if (tx < LEN) b[tx] = a[tx]*a[tx];
uint64_t wait_t = DELAY_IN_MS,
start = clock64()/clockrate, cur;
do { cur = clock64()/clockrate - start;}while (cur < wait_t);
static __global__ void sqr_ker_func(int* a, int* b, size_t N, int clockrate, size_t delayMs) {
int tx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
if (tx < N) b[tx] = a[tx] * a[tx];
uint64_t wait_t = delayMs, start = clock64() / clockrate, cur;
do {
cur = clock64() / clockrate - start;
} while (cur < wait_t);
}
static __global__ void sqr_ker_func_gfx11(int* a, int* b, int clockrate) {
static __global__ void sqr_ker_func_gfx11(int* a, int* b, size_t N, int clockrate, size_t delayMs) {
#if HT_AMD
int tx = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x;
if (tx < LEN) b[tx] = a[tx]*a[tx];
uint64_t wait_t = DELAY_IN_MS,
start = wall_clock64()/clockrate, cur;
do { cur = wall_clock64()/clockrate - start;}while (cur < wait_t);
int tx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
if (tx < N) b[tx] = a[tx] * a[tx];
uint64_t wait_t = delayMs, start = wall_clock64() / clockrate, cur;
do {
cur = wall_clock64() / clockrate - start;
} while (cur < wait_t);
#endif
}
@@ -81,7 +78,10 @@ static __global__ void sqr_ker_func_gfx11(int* a, int* b, int clockrate) {
* Scenario 1: Test to validate setting different events in executable graph.
*/
TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") {
size_t memsize = LEN*sizeof(int);
constexpr size_t gridSize = 64;
constexpr size_t blockSize = 256;
constexpr size_t N = gridSize * blockSize;
size_t memsize = N * sizeof(int);
hipGraph_t graph1, graph2;
HIP_CHECK(hipGraphCreate(&graph1, 0));
HIP_CHECK(hipGraphCreate(&graph2, 0));
@@ -91,8 +91,7 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") {
HIP_CHECK(hipEventCreate(&event2));
// Create an event record node that records event1
hipGraphNode_t event_rec;
HIP_CHECK(hipGraphAddEventRecordNode(&event_rec, graph1, nullptr, 0,
event1));
HIP_CHECK(hipGraphAddEventRecordNode(&event_rec, graph1, nullptr, 0, event1));
int *inp_h, *inp_d, *out_h, *out_d;
// Allocate host buffers
inp_h = reinterpret_cast<int*>(malloc(memsize));
@@ -103,7 +102,7 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") {
HIP_CHECK(hipMalloc(&inp_d, memsize));
HIP_CHECK(hipMalloc(&out_d, memsize));
// Initialize host buffer
for (uint32_t i = 0; i < LEN; i++) {
for (uint32_t i = 0; i < N; i++) {
inp_h[i] = i;
out_h[i] = 0;
}
@@ -112,10 +111,12 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") {
// MemcpyH2D -> kernel1 -> event_rec
hipGraphNode_t memcpyH2D, kernelnode1;
hipKernelNodeParams kernelNodeParams1{};
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph1, nullptr, 0, inp_d,
inp_h, memsize, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph1, nullptr, 0, inp_d, inp_h, memsize,
hipMemcpyHostToDevice));
// Get device clock rate
int clkRate = 0;
size_t NElem{N};
size_t delayMs{2000};
if (IsGfx11()) {
HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0));
} else {
@@ -123,29 +124,25 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") {
}
// kernel1
auto sqr_ker_func_used = IsGfx11() ? sqr_ker_func_gfx11 : sqr_ker_func;
void* kernelArgs[] = {&inp_d, &out_d, reinterpret_cast<void *>(&clkRate)};
kernelNodeParams1.func = reinterpret_cast<void *>(sqr_ker_func_used);
kernelNodeParams1.gridDim = dim3(GRID_DIM);
kernelNodeParams1.blockDim = dim3(BLK_DIM);
void* kernelArgs[] = {&inp_d, &out_d, reinterpret_cast<void*>(&NElem),
reinterpret_cast<void*>(&clkRate), reinterpret_cast<void*>(&delayMs)};
kernelNodeParams1.func = reinterpret_cast<void*>(sqr_ker_func_used);
kernelNodeParams1.gridDim = dim3(gridSize);
kernelNodeParams1.blockDim = dim3(blockSize);
kernelNodeParams1.sharedMemBytes = 0;
kernelNodeParams1.kernelParams = reinterpret_cast<void**>(kernelArgs);
kernelNodeParams1.extra = nullptr;
HIP_CHECK(hipGraphAddKernelNode(&kernelnode1, graph1, nullptr, 0,
&kernelNodeParams1));
HIP_CHECK(hipGraphAddKernelNode(&kernelnode1, graph1, nullptr, 0, &kernelNodeParams1));
// Create dependencies for graph1
HIP_CHECK(hipGraphAddDependencies(graph1, &memcpyH2D,
&kernelnode1, 1));
HIP_CHECK(hipGraphAddDependencies(graph1, &kernelnode1,
&event_rec, 1));
HIP_CHECK(hipGraphAddDependencies(graph1, &memcpyH2D, &kernelnode1, 1));
HIP_CHECK(hipGraphAddDependencies(graph1, &kernelnode1, &event_rec, 1));
// graph2 creation ...........
// waitnode(event1) -> MemcpyD2H
hipGraphNode_t event_wait_node, memcpyD2H;
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H, graph2, nullptr, 0,
out_h, out_d, memsize, hipMemcpyDeviceToHost));
HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph2, nullptr, 0,
event1));
HIP_CHECK(hipGraphAddDependencies(graph2, &event_wait_node,
&memcpyD2H, 1));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H, graph2, nullptr, 0, out_h, out_d, memsize,
hipMemcpyDeviceToHost));
HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph2, nullptr, 0, event1));
HIP_CHECK(hipGraphAddDependencies(graph2, &event_wait_node, &memcpyD2H, 1));
// Instantiate graph1 and graph2
hipStream_t streamForGraph1, streamForGraph2;
hipGraphExec_t graphExec1, graphExec2;
@@ -160,8 +157,8 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") {
HIP_CHECK(hipStreamSynchronize(streamForGraph2));
// Validate output
bool btestPassed = true;
for (uint32_t i = 0; i < LEN; i++) {
if (out_h[i] != (inp_h[i]*inp_h[i])) {
for (uint32_t i = 0; i < N; i++) {
if (out_h[i] != (inp_h[i] * inp_h[i])) {
btestPassed = false;
break;
}
@@ -170,10 +167,8 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") {
// hipGraphExecEventWaitNodeSetEvent() TEST
// Change the event at event_wait_node node to event2 and
// the event at event_rec node to event2.
HIP_CHECK(hipGraphExecEventRecordNodeSetEvent(graphExec1,
event_rec, event2));
HIP_CHECK(hipGraphExecEventWaitNodeSetEvent(graphExec2,
event_wait_node, event2));
HIP_CHECK(hipGraphExecEventRecordNodeSetEvent(graphExec1, event_rec, event2));
HIP_CHECK(hipGraphExecEventWaitNodeSetEvent(graphExec2, event_wait_node, event2));
// Launch graph1 and graph2
HIP_CHECK(hipGraphLaunch(graphExec1, streamForGraph1));
HIP_CHECK(hipGraphLaunch(graphExec2, streamForGraph2));
@@ -181,8 +176,8 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") {
HIP_CHECK(hipStreamSynchronize(streamForGraph2));
// Validate output
btestPassed = true;
for (uint32_t i = 0; i < LEN; i++) {
if (out_h[i] != (inp_h[i]*inp_h[i])) {
for (uint32_t i = 0; i < N; i++) {
if (out_h[i] != (inp_h[i] * inp_h[i])) {
btestPassed = false;
break;
}
@@ -214,12 +209,10 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_VerifyEventNotChanged") {
HIP_CHECK(hipEventCreate(&event1));
HIP_CHECK(hipEventCreate(&event2));
hipGraphNode_t eventwait;
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0,
event1));
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event1));
hipGraphExec_t graphExec;
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
HIP_CHECK(hipGraphExecEventWaitNodeSetEvent(graphExec,
eventwait, event2));
HIP_CHECK(hipGraphExecEventWaitNodeSetEvent(graphExec, eventwait, event2));
HIP_CHECK(hipGraphEventWaitNodeGetEvent(eventwait, &event_out));
// Validate that setting event2 on graphExec left the node in the original graph holding event1
REQUIRE(event1 == event_out);
@@ -240,13 +233,11 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_Negative") {
HIP_CHECK(hipEventCreate(&event1));
HIP_CHECK(hipEventCreate(&event2));
hipGraphNode_t eventrec, eventwait;
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0,
event1));
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0,
event1));
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1));
HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event1));
// Create memset
constexpr size_t Nbytes = 1024;
char *A_d;
char* A_d;
hipGraphNode_t memset_A;
hipMemsetParams memsetParams{};
HIP_CHECK(hipMalloc(&A_d, Nbytes));
@@ -262,62 +253,59 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_Negative") {
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
SECTION("hGraphExec = nullptr") {
REQUIRE(hipErrorInvalidValue ==
hipGraphExecEventWaitNodeSetEvent(nullptr, eventwait, event2));
HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(nullptr, eventwait, event2),
hipErrorInvalidValue);
}
SECTION("hNode = nullptr") {
REQUIRE(hipErrorInvalidValue ==
hipGraphExecEventWaitNodeSetEvent(graphExec, nullptr, event2));
HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(graphExec, nullptr, event2),
hipErrorInvalidValue);
}
SECTION("event = nullptr") {
REQUIRE(hipErrorInvalidValue ==
hipGraphExecEventWaitNodeSetEvent(graphExec, eventwait, nullptr));
HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(graphExec, eventwait, nullptr),
hipErrorInvalidValue);
}
SECTION("hGraphExec is uninitialized") {
hipGraphExec_t graphExec1{};
REQUIRE(hipErrorInvalidValue ==
hipGraphExecEventWaitNodeSetEvent(graphExec1, eventwait, event2));
HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(graphExec1, eventwait, event2),
hipErrorInvalidValue);
}
SECTION("hNode is uninitialized") {
hipGraphNode_t dummy{};
REQUIRE(hipErrorInvalidValue ==
hipGraphExecEventWaitNodeSetEvent(graphExec, dummy, event2));
HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(graphExec, dummy, event2),
hipErrorInvalidValue);
}
SECTION("event is uninitialized") {
hipEvent_t event_dummy{};
REQUIRE(hipErrorInvalidValue ==
hipGraphExecEventWaitNodeSetEvent(graphExec, eventwait,
event_dummy));
HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(graphExec, eventwait, event_dummy),
hipErrorInvalidValue);
}
SECTION("event wait node does not exist") {
hipGraph_t graph1;
HIP_CHECK(hipGraphCreate(&graph1, 0));
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph1, nullptr, 0,
&memsetParams));
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph1, nullptr, 0, &memsetParams));
hipGraphExec_t graphExec1;
HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0));
REQUIRE(hipErrorInvalidValue ==
hipGraphExecEventWaitNodeSetEvent(graphExec1, eventwait, event2));
HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(graphExec1, eventwait, event2),
hipErrorInvalidValue);
HIP_CHECK(hipGraphExecDestroy(graphExec1));
HIP_CHECK(hipGraphDestroy(graph1));
}
SECTION("pass memset node as hNode") {
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0,
&memsetParams));
REQUIRE(hipErrorInvalidValue ==
hipGraphExecEventWaitNodeSetEvent(graphExec, memset_A, event2));
HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, &memsetParams));
HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(graphExec, memset_A, event2),
hipErrorInvalidValue);
}
SECTION("pass event record node as hNode") {
REQUIRE(hipErrorInvalidValue ==
hipGraphExecEventWaitNodeSetEvent(graphExec, eventrec, event2));
HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(graphExec, eventrec, event2),
hipErrorInvalidValue);
}
HIP_CHECK(hipFree(A_d));
+139 -186
파일 보기
@@ -27,22 +27,6 @@ THE SOFTWARE.
* and perform the update if possible.
*/
/**
Testcase Scenarios :
Functional-
1) Make a clone of the created graph and update the executable-graph from a clone or same graph again.
2) Update the executable-graph from a graph and make sure they are taking effect.
Negative-
1) When Pass hGraphExec as nullptr and verify api returns error code.
2) When Pass hGraph as nullptr and verify api returns error code.
3) When Pass hErrorNode_out as nullptr and verify api returns error code.
4) When Pass updateResult_out as nullptr and verify api returns error code.
5) When a graphExec is updated with a different type of node, verify api returns error code.
6) When a node is deleted in hGraph but not its pair from hGraphExec and verify api returns error code.
7) When a node is deleted in hGraphExec but not its pair from hGraph and verify api returns error code.
8) When graph dependencies differ but the graphs have the same nodes, verify api returns error code.
*/
#include <hip_test_common.hh>
#include <hip_test_checkers.hh>
#include <hip_test_kernels.hh>
@@ -65,13 +49,11 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_Basic") {
hipGraphNode_t hErrorNode_out{};
hipGraphExecUpdateResult updateResult_out{};
SECTION("Pass hGraphExec as nullptr") {
ret = hipGraphExecUpdate(nullptr, graph, &hErrorNode_out,
&updateResult_out);
ret = hipGraphExecUpdate(nullptr, graph, &hErrorNode_out, &updateResult_out);
REQUIRE(hipErrorInvalidValue == ret);
}
SECTION("Pass hGraph as nullptr") {
ret = hipGraphExecUpdate(graphExec, nullptr, &hErrorNode_out,
&updateResult_out);
ret = hipGraphExecUpdate(graphExec, nullptr, &hErrorNode_out, &updateResult_out);
REQUIRE(hipErrorInvalidValue == ret);
}
SECTION("Pass hErrorNode_out as nullptr") {
@@ -101,10 +83,9 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_TypeChange") {
constexpr size_t N = 1024;
constexpr size_t Nbytes = N * sizeof(char);
constexpr size_t val = 0;
char *devData;
char* devData;
int *A_d, *A_h;
HipTest::initArrays<int>(&A_d, nullptr, nullptr,
&A_h, nullptr, nullptr, N, false);
HipTest::initArrays<int>(&A_d, nullptr, nullptr, &A_h, nullptr, nullptr, N, false);
HIP_CHECK(hipMalloc(&devData, Nbytes));
hipGraph_t graph, graph2;
hipGraphExec_t graphExec;
@@ -122,18 +103,16 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_TypeChange") {
memsetParams.elementSize = sizeof(char);
memsetParams.width = Nbytes;
memsetParams.height = 1;
HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0,
&memsetParams));
HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0, &memsetParams));
std::vector<hipGraphNode_t> dependencies;
dependencies.push_back(memsetNode);
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
HIP_CHECK(hipGraphCreate(&graph2, 0));
HIP_CHECK(hipStreamCreate(&streamForGraph));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h,
Nbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h, Nbytes,
hipMemcpyHostToDevice));
// graphExec was created before memcpyTemp was added to graph.
ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out,
&updateResult_out);
ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, &updateResult_out);
REQUIRE(hipGraphExecUpdateErrorNodeTypeChanged == updateResult_out);
REQUIRE(hipErrorGraphExecUpdateFailure == ret);
HIP_CHECK(hipFree(devData));
@@ -164,7 +143,7 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_CountDiffer") {
int *A_d, *B_d, *C_d;
int *A_h, *B_h, *C_h;
size_t NElem{N};
int *hData = reinterpret_cast<int*>(malloc(Nbytes));
int* hData = reinterpret_cast<int*>(malloc(Nbytes));
REQUIRE(hData != nullptr);
memset(hData, 0, Nbytes);
hipGraphNode_t memcpy_A, memcpy_B, memcpy_C, memcpyTemp;
@@ -180,57 +159,52 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_CountDiffer") {
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
HIP_CHECK(hipGraphCreate(&graph1, 0));
HIP_CHECK(hipStreamCreate(&streamForGraph));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h,
Nbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h,
Nbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph1, nullptr, 0, C_h, C_d,
Nbytes, hipMemcpyDeviceToHost));
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void *>(&NElem)};
kernelNodeParams.func = reinterpret_cast<void *>(HipTest::vectorADD<int>);
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, Nbytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, Nbytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph1, nullptr, 0, C_h, C_d, Nbytes,
hipMemcpyDeviceToHost));
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
kernelNodeParams.gridDim = dim3(blocks);
kernelNodeParams.blockDim = dim3(threadsPerBlock);
kernelNodeParams.sharedMemBytes = 0;
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
kernelNodeParams.extra = nullptr;
HIP_CHECK(hipGraphAddKernelNode(&kernel_vecAdd, graph1, nullptr, 0,
&kernelNodeParams));
HIP_CHECK(hipGraphAddKernelNode(&kernel_vecAdd, graph1, nullptr, 0, &kernelNodeParams));
// Create dependencies
HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_A, &kernel_vecAdd, 1));
HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_B, &kernel_vecAdd, 1));
HIP_CHECK(hipGraphAddDependencies(graph1, &kernel_vecAdd, &memcpy_C, 1));
// Create a cloned graph and added extra node to it
HIP_CHECK(hipGraphClone(&graph2, graph1));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyTemp, graph2, nullptr, 0,
C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyTemp, graph2, nullptr, 0, C_h, C_d, Nbytes,
hipMemcpyDeviceToHost));
HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0));
HIP_CHECK(hipGraphInstantiate(&graphExec2, graph2, nullptr, nullptr, 0));
SECTION("When a node deleted from Graph but not from its pair GraphExec") {
ret = hipGraphExecUpdate(graphExec2, graph1, &hErrorNode_out,
&updateResult_out);
ret = hipGraphExecUpdate(graphExec2, graph1, &hErrorNode_out, &updateResult_out);
REQUIRE(hipErrorGraphExecUpdateFailure == ret);
}
SECTION("When a node deleted from GraphExec but not from its pair Graph") {
ret = hipGraphExecUpdate(graphExec1, graph2, &hErrorNode_out,
&updateResult_out);
ret = hipGraphExecUpdate(graphExec1, graph2, &hErrorNode_out, &updateResult_out);
REQUIRE(hipErrorGraphExecUpdateFailure == ret);
}
SECTION("When the dependent nodes of a pair differ") {
HIP_CHECK(hipGraphCreate(&graph3, 0));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph3, nullptr, 0, A_d, A_h,
Nbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph3, nullptr, 0, B_d, B_h,
Nbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph3, nullptr, 0, C_h, C_d,
Nbytes, hipMemcpyDeviceToHost));
HIP_CHECK(hipGraphAddKernelNode(&kernel_vecAdd, graph3, nullptr, 0,
&kernelNodeParams));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph3, nullptr, 0, A_d, A_h, Nbytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph3, nullptr, 0, B_d, B_h, Nbytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph3, nullptr, 0, C_h, C_d, Nbytes,
hipMemcpyDeviceToHost));
HIP_CHECK(hipGraphAddKernelNode(&kernel_vecAdd, graph3, nullptr, 0, &kernelNodeParams));
// Create dependencies
HIP_CHECK(hipGraphAddDependencies(graph3, &memcpy_A, &kernel_vecAdd, 1));
HIP_CHECK(hipGraphAddDependencies(graph3, &memcpy_B, &kernel_vecAdd, 1));
HIP_CHECK(hipGraphAddDependencies(graph3, &memcpy_C, &kernel_vecAdd, 1));
ret = hipGraphExecUpdate(graphExec1, graph3, &hErrorNode_out,
&updateResult_out);
ret = hipGraphExecUpdate(graphExec1, graph3, &hErrorNode_out, &updateResult_out);
REQUIRE(hipErrorGraphExecUpdateFailure == ret);
HIP_CHECK(hipGraphDestroy(graph3));
}
@@ -265,7 +239,7 @@ TEST_CASE("Unit_hipGraphExecUpdate_Functional") {
int *A_d, *B_d, *C_d;
int *A_h, *B_h, *C_h;
size_t NElem{N};
int *hData = reinterpret_cast<int*>(malloc(Nbytes));
int* hData = reinterpret_cast<int*>(malloc(Nbytes));
REQUIRE(hData != nullptr);
memset(hData, 0, Nbytes);
hipGraphNode_t memcpy_A, memcpy_B, memcpy_C;
@@ -280,22 +254,20 @@ TEST_CASE("Unit_hipGraphExecUpdate_Functional") {
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
HIP_CHECK(hipGraphCreate(&graph, 0));
HIP_CHECK(hipStreamCreate(&streamForGraph));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph, nullptr, 0, A_d, A_h,
Nbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph, nullptr, 0, B_d, B_h,
Nbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph, nullptr, 0, C_h, C_d,
Nbytes, hipMemcpyDeviceToHost));
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void *>(&NElem)};
kernelNodeParams.func =
reinterpret_cast<void *>(HipTest::vector_square<int>);
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph, nullptr, 0, A_d, A_h, Nbytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph, nullptr, 0, B_d, B_h, Nbytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph, nullptr, 0, C_h, C_d, Nbytes,
hipMemcpyDeviceToHost));
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vector_square<int>);
kernelNodeParams.gridDim = dim3(blocks);
kernelNodeParams.blockDim = dim3(threadsPerBlock);
kernelNodeParams.sharedMemBytes = 0;
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
kernelNodeParams.extra = nullptr;
HIP_CHECK(hipGraphAddKernelNode(&kernel_vecSquare, graph, nullptr, 0,
&kernelNodeParams));
HIP_CHECK(hipGraphAddKernelNode(&kernel_vecSquare, graph, nullptr, 0, &kernelNodeParams));
// Create dependencies
HIP_CHECK(hipGraphAddDependencies(graph, &memcpy_A, &kernel_vecSquare, 1));
HIP_CHECK(hipGraphAddDependencies(graph, &memcpy_B, &kernel_vecSquare, 1));
@@ -304,36 +276,32 @@ TEST_CASE("Unit_hipGraphExecUpdate_Functional") {
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
SECTION("Update graphExec with clone graph") {
HIP_CHECK(hipGraphClone(&clonedgraph, graph));
HIP_CHECK(hipGraphExecUpdate(graphExec, clonedgraph, &hErrorNode_out,
&updateResult_out));
HIP_CHECK(hipGraphExecUpdate(graphExec, clonedgraph, &hErrorNode_out, &updateResult_out));
}
// Code for new graph creation with similar node setup
HIP_CHECK(hipGraphCreate(&graph2, 0));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h,
Nbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h,
Nbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_h, C_d,
Nbytes, hipMemcpyDeviceToHost));
HIP_CHECK(hipGraphMemcpyNodeSetParams1D(memcpy_C, hData, C_d, Nbytes,
hipMemcpyDeviceToHost));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h, Nbytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h, Nbytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_h, C_d, Nbytes,
hipMemcpyDeviceToHost));
HIP_CHECK(hipGraphMemcpyNodeSetParams1D(memcpy_C, hData, C_d, Nbytes, hipMemcpyDeviceToHost));
memset(&kernelNodeParams, 0, sizeof(hipKernelNodeParams));
void* kernelArgs2[] = {&A_d, &B_d, &C_d, reinterpret_cast<void *>(&NElem)};
kernelNodeParams.func = reinterpret_cast<void *>(HipTest::vectorADD<int>);
void* kernelArgs2[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
kernelNodeParams.gridDim = dim3(blocks);
kernelNodeParams.blockDim = dim3(threadsPerBlock);
kernelNodeParams.sharedMemBytes = 0;
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs2);
kernelNodeParams.extra = nullptr;
HIP_CHECK(hipGraphAddKernelNode(&kernel_vecAdd, graph2, nullptr, 0,
&kernelNodeParams));
HIP_CHECK(hipGraphAddKernelNode(&kernel_vecAdd, graph2, nullptr, 0, &kernelNodeParams));
// Create dependencies
HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_A, &kernel_vecAdd, 1));
HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_B, &kernel_vecAdd, 1));
HIP_CHECK(hipGraphAddDependencies(graph2, &kernel_vecAdd, &memcpy_C, 1));
// Update the graphExec graph from graph -> graph2
HIP_CHECK(hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out,
&updateResult_out));
HIP_CHECK(hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, &updateResult_out));
REQUIRE(updateResult_out == hipGraphExecUpdateSuccess);
HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
HIP_CHECK(hipStreamSynchronize(streamForGraph));
@@ -380,24 +348,22 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_Functional_ParametersChanged") {
hipGraphExecUpdateResult updateResult_out;
HipTest::initArrays<int>(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
HIP_CHECK(hipGraphCreate(&graph1, 0));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h,
Nbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, Nbytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0));
SECTION("Update graphExec with similar graph and verify") {
HIP_CHECK(hipGraphCreate(&graph2, 0));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h,
Nbytes, hipMemcpyHostToDevice));
ret = hipGraphExecUpdate(graphExec1, graph2, &hErrorNode_out,
&updateResult_out);
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h, Nbytes,
hipMemcpyHostToDevice));
ret = hipGraphExecUpdate(graphExec1, graph2, &hErrorNode_out, &updateResult_out);
REQUIRE(hipSuccess == ret);
HIP_CHECK(hipGraphDestroy(graph2));
}
SECTION("Update graphExec with similar graph and verify") {
HIP_CHECK(hipGraphCreate(&graph3, 0));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph3, nullptr, 0, B_h, B_d,
Nbytes, hipMemcpyDeviceToHost));
ret = hipGraphExecUpdate(graphExec1, graph3, &hErrorNode_out,
&updateResult_out);
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph3, nullptr, 0, B_h, B_d, Nbytes,
hipMemcpyDeviceToHost));
ret = hipGraphExecUpdate(graphExec1, graph3, &hErrorNode_out, &updateResult_out);
REQUIRE(hipErrorGraphExecUpdateFailure == ret);
REQUIRE(hipGraphExecUpdateErrorParametersChanged == updateResult_out);
@@ -437,16 +403,15 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_Functional_CountDiffer_1") {
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
HIP_CHECK(hipGraphCreate(&graph1, 0));
HIP_CHECK(hipGraphCreate(&graph2, 0));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h,
Nbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h,
Nbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, Nbytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, Nbytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0));
// When count of nodes directly differ in graphExec1 and graph2
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_h, C_d,
Nbytes, hipMemcpyDeviceToHost));
ret = hipGraphExecUpdate(graphExec1, graph2, &hErrorNode_out,
&updateResult_out);
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_h, C_d, Nbytes,
hipMemcpyDeviceToHost));
ret = hipGraphExecUpdate(graphExec1, graph2, &hErrorNode_out, &updateResult_out);
REQUIRE(hipErrorGraphExecUpdateFailure == ret);
#if HT_NVIDIA
@@ -495,16 +460,15 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_Functional_CountDiffer_2") {
hipGraphExecUpdateResult updateResult_out;
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
HIP_CHECK(hipGraphCreate(&graph1, 0));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h,
Nbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h,
Nbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, Nbytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, Nbytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0));
// Delete a node from the graph
HIP_CHECK(hipGraphDestroyNode(memcpy_B));
SECTION("When a node deleted from Graph but not from its pair GraphExec") {
ret = hipGraphExecUpdate(graphExec1, graph1, &hErrorNode_out,
&updateResult_out);
ret = hipGraphExecUpdate(graphExec1, graph1, &hErrorNode_out, &updateResult_out);
REQUIRE(hipErrorGraphExecUpdateFailure == ret);
REQUIRE(hipGraphExecUpdateErrorTopologyChanged == updateResult_out);
#if HT_NVIDIA
@@ -513,11 +477,10 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_Functional_CountDiffer_2") {
}
SECTION("Update the GraphExec with similar graph where a node get deleted") {
HIP_CHECK(hipGraphCreate(&graph2, 0));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_d, C_h,
Nbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_d, C_h, Nbytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphInstantiate(&graphExec2, graph2, nullptr, nullptr, 0));
ret = hipGraphExecUpdate(graphExec2, graph1, &hErrorNode_out,
&updateResult_out);
ret = hipGraphExecUpdate(graphExec2, graph1, &hErrorNode_out, &updateResult_out);
#if HT_NVIDIA
REQUIRE(hipErrorGraphExecUpdateFailure == ret);
REQUIRE(hipGraphExecUpdateErrorNotSupported == updateResult_out);
@@ -529,13 +492,12 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_Functional_CountDiffer_2") {
}
SECTION("When A node is deleted in GraphExec but not its pair from Graph") {
HIP_CHECK(hipGraphCreate(&graph3, 0));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph3, nullptr, 0, A_d, A_h,
Nbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph3, nullptr, 0, A_d, A_h, Nbytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphInstantiate(&graphExec3, graph3, nullptr, nullptr, 0));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph3, nullptr, 0, B_d, B_h,
Nbytes, hipMemcpyHostToDevice));
ret = hipGraphExecUpdate(graphExec3, graph3, &hErrorNode_out,
&updateResult_out);
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph3, nullptr, 0, B_d, B_h, Nbytes,
hipMemcpyHostToDevice));
ret = hipGraphExecUpdate(graphExec3, graph3, &hErrorNode_out, &updateResult_out);
REQUIRE(hipErrorGraphExecUpdateFailure == ret);
#if HT_NVIDIA
REQUIRE(hipGraphExecUpdateErrorNotSupported == updateResult_out);
@@ -581,27 +543,26 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_Dependent_NodesDiffer") {
hipGraphExecUpdateResult updateResult_out;
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
HIP_CHECK(hipGraphCreate(&graph1, 0));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h,
Nbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h,
Nbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph1, nullptr, 0, C_d, C_h,
Nbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, Nbytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, Nbytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph1, nullptr, 0, C_d, C_h, Nbytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_A, &memcpy_B, 1));
HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_B, &memcpy_C, 1));
HIP_CHECK(hipGraphInstantiate(&graphExec, graph1, nullptr, nullptr, 0));
HIP_CHECK(hipGraphCreate(&graph2, 0));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h,
Nbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h,
Nbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_d, C_h,
Nbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h, Nbytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h, Nbytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_d, C_h, Nbytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_A, &memcpy_C, 1));
HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_B, &memcpy_C, 1));
ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out,
&updateResult_out);
ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, &updateResult_out);
REQUIRE(hipErrorGraphExecUpdateFailure == ret);
REQUIRE(hipGraphExecUpdateErrorTopologyChanged == updateResult_out);
@@ -642,10 +603,10 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_NodeType_Changed") {
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
HIP_CHECK(hipGraphCreate(&graph1, 0));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h,
Nbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h,
Nbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, Nbytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, Nbytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_A, &memcpy_B, 1));
HIP_CHECK(hipGraphInstantiate(&graphExec, graph1, nullptr, nullptr, 0));
@@ -658,13 +619,11 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_NodeType_Changed") {
memsetParams.elementSize = sizeof(char);
memsetParams.width = Nbytes;
memsetParams.height = 1;
HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph2, nullptr, 0,
&memsetParams));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h,
Nbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph2, nullptr, 0, &memsetParams));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h, Nbytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_A, &memsetNode, 1));
ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out,
&updateResult_out);
ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, &updateResult_out);
REQUIRE(hipErrorGraphExecUpdateFailure == ret);
#if HT_NVIDIA
REQUIRE(hipGraphExecUpdateErrorTopologyChanged == updateResult_out);
@@ -726,22 +685,21 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_MultiDevice_Context_Changed") {
hipStream_t stream;
HIP_CHECK(hipStreamCreate(&stream));
HIP_CHECK(hipGraphCreate(&graph1, 0));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h,
Nbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h,
Nbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph1, nullptr, 0, C_h, C_d,
Nbytes, hipMemcpyDeviceToHost));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, Nbytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, Nbytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph1, nullptr, 0, C_h, C_d, Nbytes,
hipMemcpyDeviceToHost));
hipKernelNodeParams kernelNodeParams{};
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void *>(&NElem)};
kernelNodeParams.func = reinterpret_cast<void *>(HipTest::vectorADD<int>);
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
kernelNodeParams.gridDim = dim3(blocks);
kernelNodeParams.blockDim = dim3(threadsPerBlock);
kernelNodeParams.sharedMemBytes = 0;
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
kernelNodeParams.extra = nullptr;
HIP_CHECK(hipGraphAddKernelNode(&kernel_vecADD, graph1, nullptr, 0,
&kernelNodeParams));
HIP_CHECK(hipGraphAddKernelNode(&kernel_vecADD, graph1, nullptr, 0, &kernelNodeParams));
HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_A, &kernel_vecADD, 1));
HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_B, &kernel_vecADD, 1));
HIP_CHECK(hipGraphAddDependencies(graph1, &kernel_vecADD, &memcpy_C, 1));
@@ -750,27 +708,25 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_MultiDevice_Context_Changed") {
HIP_CHECK(hipSetDevice(1));
HIP_CHECK(hipGraphCreate(&graph2, 0));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h,
Nbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h,
Nbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_h, C_d,
Nbytes, hipMemcpyDeviceToHost));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h, Nbytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h, Nbytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_h, C_d, Nbytes,
hipMemcpyDeviceToHost));
memset(&kernelNodeParams, 0x00, sizeof(hipKernelNodeParams));
void* kernelArgs1[] = {&A_d, &B_d, &C_d, reinterpret_cast<void *>(&NElem)};
kernelNodeParams.func = reinterpret_cast<void *>(HipTest::vectorSUB<int>);
void* kernelArgs1[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorSUB<int>);
kernelNodeParams.gridDim = dim3(blocks);
kernelNodeParams.blockDim = dim3(threadsPerBlock);
kernelNodeParams.sharedMemBytes = 0;
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs1);
kernelNodeParams.extra = nullptr;
HIP_CHECK(hipGraphAddKernelNode(&kernel_vecSUB, graph2, nullptr, 0,
&kernelNodeParams));
HIP_CHECK(hipGraphAddKernelNode(&kernel_vecSUB, graph2, nullptr, 0, &kernelNodeParams));
HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_A, &kernel_vecSUB, 1));
HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_B, &kernel_vecSUB, 1));
HIP_CHECK(hipGraphAddDependencies(graph2, &kernel_vecSUB, &memcpy_C, 1));
ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out,
&updateResult_out);
ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, &updateResult_out);
REQUIRE(hipErrorGraphExecUpdateFailure == ret);
REQUIRE(hipGraphExecUpdateErrorUnsupportedFunctionChange == updateResult_out);
@@ -819,49 +775,46 @@ TEST_CASE("Unit_hipGraphExecUpdate_Functional_KernelFunction_Changed") {
hipStream_t stream;
HIP_CHECK(hipStreamCreate(&stream));
HIP_CHECK(hipGraphCreate(&graph1, 0));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h,
Nbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h,
Nbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph1, nullptr, 0, C_h, C_d,
Nbytes, hipMemcpyDeviceToHost));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, Nbytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, Nbytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph1, nullptr, 0, C_h, C_d, Nbytes,
hipMemcpyDeviceToHost));
hipKernelNodeParams kernelNodeParams{};
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void *>(&NElem)};
kernelNodeParams.func = reinterpret_cast<void *>(HipTest::vectorADD<int>);
void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorADD<int>);
kernelNodeParams.gridDim = dim3(blocks);
kernelNodeParams.blockDim = dim3(threadsPerBlock);
kernelNodeParams.sharedMemBytes = 0;
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
kernelNodeParams.extra = nullptr;
HIP_CHECK(hipGraphAddKernelNode(&kernel_vecADD, graph1, nullptr, 0,
&kernelNodeParams));
HIP_CHECK(hipGraphAddKernelNode(&kernel_vecADD, graph1, nullptr, 0, &kernelNodeParams));
HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_A, &kernel_vecADD, 1));
HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_B, &kernel_vecADD, 1));
HIP_CHECK(hipGraphAddDependencies(graph1, &kernel_vecADD, &memcpy_C, 1));
HIP_CHECK(hipGraphInstantiate(&graphExec, graph1, nullptr, nullptr, 0));
HIP_CHECK(hipGraphCreate(&graph2, 0));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h,
Nbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h,
Nbytes, hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_h, C_d,
Nbytes, hipMemcpyDeviceToHost));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h, Nbytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h, Nbytes,
hipMemcpyHostToDevice));
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_h, C_d, Nbytes,
hipMemcpyDeviceToHost));
memset(&kernelNodeParams, 0x00, sizeof(hipKernelNodeParams));
void* kernelArgs1[] = {&A_d, &B_d, &C_d, reinterpret_cast<void *>(&NElem)};
kernelNodeParams.func = reinterpret_cast<void *>(HipTest::vectorSUB<int>);
void* kernelArgs1[] = {&A_d, &B_d, &C_d, reinterpret_cast<void*>(&NElem)};
kernelNodeParams.func = reinterpret_cast<void*>(HipTest::vectorSUB<int>);
kernelNodeParams.gridDim = dim3(blocks);
kernelNodeParams.blockDim = dim3(threadsPerBlock);
kernelNodeParams.sharedMemBytes = 0;
kernelNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs1);
kernelNodeParams.extra = nullptr;
HIP_CHECK(hipGraphAddKernelNode(&kernel_vecSUB, graph2, nullptr, 0,
&kernelNodeParams));
HIP_CHECK(hipGraphAddKernelNode(&kernel_vecSUB, graph2, nullptr, 0, &kernelNodeParams));
HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_A, &kernel_vecSUB, 1));
HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_B, &kernel_vecSUB, 1));
HIP_CHECK(hipGraphAddDependencies(graph2, &kernel_vecSUB, &memcpy_C, 1));
ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out,
&updateResult_out);
ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, &updateResult_out);
REQUIRE(hipSuccess == ret);
HIP_CHECK(hipGraphLaunch(graphExec, stream));
HIP_CHECK(hipStreamSynchronize(stream));
+113 -380
파일 보기
@@ -19,394 +19,127 @@ THE SOFTWARE.
#include <hip_test_common.hh>
#include <hip_test_checkers.hh>
#include <hip_test_kernels.hh>
/* Test verifies hipGraphLaunch API
Negative scenarios -
1) Pass graphExec as nullptr and verify api returns error code.
2) Pass pGraphExec as nullptr and stream as hipStreamPerThread and verify api returns error code.
3) Pass pGraphExec as empty object and verify api returns error code.
4) Destroy the executable graph and try to launch it. The API must not crash and must return an error code.
5) Destroy the stream and try to launch the respective executable graph. The API must not crash and must return an error code.
6) Destroy the original graph and try to launch the respective executable graph.
The API must execute properly, without a crash or an error code.
Functional Scenario -
1) Check basic functionality with stream as hipStreamPerThread
2) Test hipGraphLaunch call on multiple devices.
3) Create a graph with multiple nodes. Create an executable graph.
Launch the executable graph 3 times in stream simultaneously.
Wait for stream. Validate the output. No issues should be observed
4) Create a graph with multiple nodes. Create an executable graph.
Verify that the executable graph can be launched on the null stream.
*/
#define SIZE 1024
#define TEST_LOOP_SIZE 3
/**
* @addtogroup hipGraphLaunch hipGraphLaunch
* @{
* @ingroup GraphTest
* `hipGraphLaunch(hipGraphExec_t graphExec, hipStream_t stream)` -
* Launches an executable graph in a stream
*/
TEST_CASE("Unit_hipGraphLaunch_Negative") {
hipError_t ret;
SECTION("Pass pGraphExec as nullptr") {
hipStream_t stream{};
// Host callback: stores 0 in the int that `arg` points to.
// `arg` must point to a live int; the pointer is not validated.
static void HostFunctionSetToZero(void* arg) {
  // void* -> int* is a plain static_cast; a named cast keeps the conversion greppable.
  int* test_number = static_cast<int*>(arg);
  *test_number = 0;
}
// Host callback: increments by one the int that `arg` points to.
// `arg` must point to a live int; the pointer is not validated.
static void HostFunctionAddOne(void* arg) {
  // void* -> int* is a plain static_cast; a named cast keeps the conversion greppable.
  int* test_number = static_cast<int*>(arg);
  *test_number += 1;
}
/*
 * Builds and instantiates a two-node host graph operating on `number`:
 * a node running HostFunctionSetToZero, followed by a dependent node running
 * HostFunctionAddOne, so that one launch leaves *number == 1.
 * The instantiated graph is returned through `graph_exec`; the source graph is
 * destroyed before returning.
 */
static void CreateTestExecutableGraph(hipGraphExec_t* graph_exec, int* number) {
  hipGraph_t graph;
  HIP_CHECK(hipGraphCreate(&graph, 0));

  // Root node: reset the target integer.
  hipHostNodeParams params_set_to_zero = {HostFunctionSetToZero, number};
  hipGraphNode_t node_set_zero;
  HIP_CHECK(hipGraphAddHostNode(&node_set_zero, graph, nullptr, 0, &params_set_to_zero));

  // Second node: increment, with the reset node as its single dependency.
  hipHostNodeParams params_set_add_one = {HostFunctionAddOne, number};
  hipGraphNode_t node_add_one;
  HIP_CHECK(hipGraphAddHostNode(&node_add_one, graph, &node_set_zero, 1, &params_set_add_one));

  // node_error only receives the instantiate call's error-node output; it is not read here.
  hipGraphNode_t node_error;
  HIP_CHECK(hipGraphInstantiate(graph_exec, graph, &node_error, nullptr, 0));

  HIP_CHECK(hipGraphDestroy(graph));
}
// Launches the graph built by CreateTestExecutableGraph on `stream`, waits for
// the stream to finish, and requires that the target integer ended up as 1.
static void HipGraphLaunch_Positive_Simple(hipStream_t stream) {
  // Start from a value other than 1 so the final check cannot pass without a launch.
  int number = 5;

  hipGraphExec_t graph_exec;
  CreateTestExecutableGraph(&graph_exec, &number);

  HIP_CHECK(hipGraphLaunch(graph_exec, stream));
  HIP_CHECK(hipStreamSynchronize(stream));

  REQUIRE(number == 1);

  HIP_CHECK(hipGraphExecDestroy(graph_exec));
}
/**
* Test Description
* ------------------------
* - Basic positive test for hipGraphLaunch
* -# stream as a created stream
* -# with stream as hipStreamPerThread
* Test source
* ------------------------
* - unit/graph/hipGraphLaunch.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Unit_hipGraphLaunch_Positive") {
// Both sections run the same HipGraphLaunch_Positive_Simple check; only the stream differs.
SECTION("stream as a created stream") {
// Create a dedicated stream for this section and destroy it after the check.
hipStream_t stream;
HIP_CHECK(hipStreamCreate(&stream));
HipGraphLaunch_Positive_Simple(stream);
HIP_CHECK(hipStreamDestroy(stream));
}
SECTION("with stream as hipStreamPerThread") {
// No stream is created here; the hipStreamPerThread handle is passed directly.
HipGraphLaunch_Positive_Simple(hipStreamPerThread);
}
}
/**
* Test Description
* ------------------------
* - Negative parameter test for hipGraphLaunch
* -# graphExec is nullptr and stream is a created stream
* -# graphExec is nullptr and stream is hipStreamPerThread
* -# graphExec is an empty object
* -# graphExec is destroyed before calling hipGraphLaunch
* Test source
* ------------------------
* - unit/graph/hipGraphLaunch.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
*/
TEST_CASE("Unit_hipGraphLaunch_Negative_Parameters") {
SECTION("graphExec is nullptr and stream is a created stream") {
hipStream_t stream;
hipError_t ret;
HIP_CHECK(hipStreamCreate(&stream));
ret = hipGraphLaunch(nullptr, stream);
REQUIRE(hipErrorInvalidValue == ret);
}
SECTION("Pass pGraphExec as nullptr and stream as hipStreamPerThread") {
ret = hipGraphLaunch(nullptr, hipStreamPerThread);
REQUIRE(hipErrorInvalidValue == ret);
}
SECTION("Pass pGraphExec as empty object") {
hipGraphExec_t graphExec{};
hipStream_t stream{};
ret = hipGraphLaunch(graphExec, stream);
REQUIRE(hipErrorInvalidValue == ret);
}
SECTION("Destroy executable graph and try to launch it") {
constexpr size_t Nbytes = 1024;
hipGraph_t graph;
hipGraphExec_t graphExec;
hipStream_t stream;
hipGraphNode_t memsetNode;
char *devData;
HIP_CHECK(hipMalloc(&devData, Nbytes));
HIP_CHECK(hipGraphCreate(&graph, 0));
HIP_CHECK(hipStreamCreate(&stream));
hipMemsetParams memsetParams{};
memset(&memsetParams, 0, sizeof(memsetParams));
memsetParams.dst = reinterpret_cast<void*>(devData);
memsetParams.value = 0;
memsetParams.pitch = 0;
memsetParams.elementSize = sizeof(char);
memsetParams.width = Nbytes;
memsetParams.height = 1;
HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0,
&memsetParams));
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
HIP_CHECK(hipGraphLaunch(graphExec, stream));
HIP_CHECK(hipStreamSynchronize(stream));
HIP_CHECK(hipGraphExecDestroy(graphExec));
// Launch again after destroy graph exec object.
ret = hipGraphLaunch(graphExec, stream);
REQUIRE(hipErrorInvalidValue == ret);
HIP_CHECK(hipFree(devData));
HIP_CHECK(hipGraphDestroy(graph));
HIP_CHECK(hipStreamDestroy(stream));
REQUIRE(ret == hipErrorInvalidValue);
}
/* In this case in CUDA setup this api call is giving - unknown error (999)
So enabling this test for both AMD and CUDA by checking with hipSuccess */
SECTION("Destroy stream and try to launch respective executable graph") {
constexpr size_t Nbytes = 1024;
hipGraph_t graph;
hipGraphExec_t graphExec;
hipStream_t stream;
hipGraphNode_t memsetNode;
char *devData;
HIP_CHECK(hipMalloc(&devData, Nbytes));
HIP_CHECK(hipGraphCreate(&graph, 0));
HIP_CHECK(hipStreamCreate(&stream));
hipMemsetParams memsetParams{};
memset(&memsetParams, 0, sizeof(memsetParams));
memsetParams.dst = reinterpret_cast<void*>(devData);
memsetParams.value = 0;
memsetParams.pitch = 0;
memsetParams.elementSize = sizeof(char);
memsetParams.width = Nbytes;
memsetParams.height = 1;
HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0,
&memsetParams));
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
HIP_CHECK(hipGraphLaunch(graphExec, stream));
HIP_CHECK(hipStreamSynchronize(stream));
HIP_CHECK(hipStreamDestroy(stream));
// Launch again after destroy stream
ret = hipGraphLaunch(graphExec, stream);
REQUIRE(hipSuccess != ret);
HIP_CHECK(hipFree(devData));
HIP_CHECK(hipGraphExecDestroy(graphExec));
HIP_CHECK(hipGraphDestroy(graph));
SECTION("graphExec is nullptr and stream is hipStreamPerThread") {
HIP_CHECK_ERROR(hipGraphLaunch(nullptr, hipStreamPerThread), hipErrorInvalidValue);
}
SECTION("Destroy graph and try to launch respective executable graph") {
constexpr size_t Nbytes = 1024;
hipGraph_t graph;
hipGraphExec_t graphExec;
hipStream_t stream;
hipGraphNode_t memsetNode;
char *devData;
HIP_CHECK(hipMalloc(&devData, Nbytes));
SECTION("graphExec is an empty object") {
hipGraphExec_t graph_exec{};
HIP_CHECK_ERROR(hipGraphLaunch(graph_exec, hipStreamPerThread), hipErrorInvalidValue);
}
HIP_CHECK(hipGraphCreate(&graph, 0));
HIP_CHECK(hipStreamCreate(&stream));
hipMemsetParams memsetParams{};
memset(&memsetParams, 0, sizeof(memsetParams));
memsetParams.dst = reinterpret_cast<void*>(devData);
memsetParams.value = 0;
memsetParams.pitch = 0;
memsetParams.elementSize = sizeof(char);
memsetParams.width = Nbytes;
memsetParams.height = 1;
HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0,
&memsetParams));
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
HIP_CHECK(hipGraphLaunch(graphExec, stream));
HIP_CHECK(hipStreamSynchronize(stream));
HIP_CHECK(hipGraphDestroy(graph));
// Launch again after destroy graph
ret = hipGraphLaunch(graphExec, stream);
REQUIRE(hipSuccess == ret);
HIP_CHECK(hipFree(devData));
HIP_CHECK(hipGraphExecDestroy(graphExec));
HIP_CHECK(hipStreamDestroy(stream));
SECTION("graphExec is destroyed") {
int number = 5;
hipGraphExec_t graph_exec;
CreateTestExecutableGraph(&graph_exec, &number);
HIP_CHECK(hipGraphLaunch(graph_exec, hipStreamPerThread));
HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));
REQUIRE(number == 1);
HIP_CHECK(hipGraphExecDestroy(graph_exec));
HIP_CHECK_ERROR(hipGraphLaunch(graph_exec, hipStreamPerThread), hipErrorInvalidValue);
}
}
// Builds a graph of two chained memset nodes (C_d filled with `val`, then A_d
// filled with `updateVal`), launches it on hipStreamPerThread, and checks
// every byte copied back from A_d.
TEST_CASE("Unit_hipGraphLaunch_Functional_hipStreamPerThread") {
  constexpr size_t N = 1024;
  constexpr size_t Nbytes = N * sizeof(char);
  constexpr size_t val = 0;
  constexpr size_t updateVal = 2;

  // NOTE(review): initArrays presumably allocates the device/host buffers — confirm in HipTest.
  char *A_d{nullptr}, *B_d{nullptr}, *C_d{nullptr};
  char *A_h{nullptr}, *B_h{nullptr};
  HipTest::initArrays<char>(&A_d, &B_d, &C_d, &A_h, &B_h, nullptr, N, false);

  hipGraph_t graph;
  HIP_CHECK(hipGraphCreate(&graph, 0));

  // Root node: memset over C_d.
  hipGraphNode_t memsetNode;
  hipMemsetParams memsetParams{};
  memset(&memsetParams, 0, sizeof(memsetParams));
  memsetParams.dst = reinterpret_cast<void*>(C_d);
  memsetParams.elementSize = sizeof(char);
  memsetParams.width = Nbytes;
  memsetParams.height = 1;
  memsetParams.pitch = 0;
  memsetParams.value = val;
  HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0,
                                  &memsetParams));

  std::vector<hipGraphNode_t> dependencies{memsetNode};

  // Dependent node: memset over A_d. The params struct and node handle are reused.
  memset(&memsetParams, 0, sizeof(memsetParams));
  memsetParams.dst = reinterpret_cast<void*>(A_d);
  memsetParams.elementSize = sizeof(char);
  memsetParams.width = Nbytes;
  memsetParams.height = 1;
  memsetParams.pitch = 0;
  memsetParams.value = updateVal;
  HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, dependencies.data(),
                                  dependencies.size(), &memsetParams));
  // Re-apply the same parameters through the setter as well.
  HIP_CHECK(hipGraphMemsetNodeSetParams(memsetNode, &memsetParams));
  dependencies.push_back(memsetNode);

  hipGraphExec_t graphExec;
  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
  HIP_CHECK(hipGraphLaunch(graphExec, hipStreamPerThread));
  HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));

  // Copy A_d back and require every byte to hold updateVal.
  HIP_CHECK(hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost));
  for (size_t i = 0; i < Nbytes; i++) {
    if (A_h[i] == updateVal) {
      continue;
    }
    WARN("Validation failed at- " << i << " A_h[i] " << A_h[i]);
    REQUIRE(false);
  }

  HipTest::freeArrays<char>(A_d, B_d, C_d, A_h, B_h, nullptr, false);
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipGraphDestroy(graph));
}
// Builds a graph of two chained memset nodes (C_d filled with `val`, then A_d
// filled with `updateVal`), launches it on a freshly created stream, and
// checks every byte copied back from A_d. Uses whichever device is current
// when it is called.
static void hipGraphLaunch_test() {
  constexpr size_t N = 1024;
  constexpr size_t Nbytes = N * sizeof(char);
  constexpr size_t val = 0;
  constexpr size_t updateVal = 1;

  // NOTE(review): initArrays presumably allocates the device/host buffers — confirm in HipTest.
  char *A_d{nullptr}, *B_d{nullptr}, *C_d{nullptr};
  char *A_h{nullptr}, *B_h{nullptr};
  HipTest::initArrays<char>(&A_d, &B_d, &C_d, &A_h, &B_h, nullptr, N, false);

  hipGraph_t graph;
  hipStream_t streamForGraph;
  HIP_CHECK(hipGraphCreate(&graph, 0));
  HIP_CHECK(hipStreamCreate(&streamForGraph));

  // Root node: memset over C_d.
  hipGraphNode_t memsetNode;
  hipMemsetParams memsetParams{};
  memset(&memsetParams, 0, sizeof(memsetParams));
  memsetParams.dst = reinterpret_cast<void*>(C_d);
  memsetParams.elementSize = sizeof(char);
  memsetParams.width = Nbytes;
  memsetParams.height = 1;
  memsetParams.pitch = 0;
  memsetParams.value = val;
  HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0,
                                  &memsetParams));

  std::vector<hipGraphNode_t> dependencies{memsetNode};

  // Dependent node: memset over A_d. The params struct and node handle are reused.
  memset(&memsetParams, 0, sizeof(memsetParams));
  memsetParams.dst = reinterpret_cast<void*>(A_d);
  memsetParams.elementSize = sizeof(char);
  memsetParams.width = Nbytes;
  memsetParams.height = 1;
  memsetParams.pitch = 0;
  memsetParams.value = updateVal;
  HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, dependencies.data(),
                                  dependencies.size(), &memsetParams));
  // Re-apply the same parameters through the setter as well.
  HIP_CHECK(hipGraphMemsetNodeSetParams(memsetNode, &memsetParams));
  dependencies.push_back(memsetNode);

  hipGraphExec_t graphExec;
  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
  HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
  HIP_CHECK(hipStreamSynchronize(streamForGraph));

  // Copy A_d back and require every byte to hold updateVal.
  HIP_CHECK(hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost));
  for (size_t i = 0; i < Nbytes; i++) {
    if (A_h[i] == updateVal) {
      continue;
    }
    WARN("Validation failed at- " << i << " A_h[i] " << A_h[i]);
    REQUIRE(false);
  }

  HipTest::freeArrays<char>(A_d, B_d, C_d, A_h, B_h, nullptr, false);
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipGraphDestroy(graph));
  HIP_CHECK(hipStreamDestroy(streamForGraph));
}
// Runs hipGraphLaunch_test once per device reported by hipGetDeviceCount,
// making each device current before the run.
TEST_CASE("Unit_hipGraphLaunch_Functional_multidevice_test") {
  int numDevices = 0;
  HIP_CHECK(hipGetDeviceCount(&numDevices));

  // Nothing to exercise without a device: record a pass and leave.
  if (numDevices <= 0) {
    SUCCEED("Skipped the testcase as there is no device to test.");
    return;
  }

  for (int i = 0; i < numDevices; i++) {
    HIP_CHECK(hipSetDevice(i));
    hipGraphLaunch_test();
  }
}
// Fills the first N elements of A1_h and A2_h with values in [0, 255].
// The generator state is seeded from the current time, so contents differ
// between runs. NOTE(review): HipTest::RAND_R is presumably a rand_r-style
// generator that advances the seed it is given — confirm in HipTest.
static void fillRandInpData(int *A1_h, int *A2_h, size_t N) {
  unsigned int rngState = time(nullptr);
  size_t idx = 0;
  while (idx < N) {
    // Draw order matters for the generated sequence: A1_h first, then A2_h.
    A1_h[idx] = (HipTest::RAND_R(&rngState) & 0xFF);
    A2_h[idx] = (HipTest::RAND_R(&rngState) & 0xFF);
    ++idx;
  }
}
// Requires, for each of the first N elements, that A2_h holds the square of
// the matching A1_h element.
static void validateOutData(int *A1_h, int *A2_h, size_t N) {
  size_t i = 0;
  while (i < N) {
    const int result = A1_h[i] * A1_h[i];
    REQUIRE(result == A2_h[i]);
    ++i;
  }
}
/*
* 1.Create a graph with multiple nodes. Create an executable graph.
* Launch the executable graph 3 times in stream simultaneously.
* Wait for stream. Validate the output. No issues should be observed
* 2.Create a graph with multiple nodes. Create an executable graph.
* Verify that the executable graph can be launched on the null stream.
*/
TEST_CASE("Unit_hipGraphLaunch_Functional_MultipleLaunch") {
size_t memSize = SIZE;
constexpr auto blocksPerCU = 6; // to hide latency
constexpr auto threadsPerBlock = 256;
unsigned blocks = HipTest::setNumBlocks(blocksPerCU,
threadsPerBlock, SIZE);
hipGraph_t graph;
std::vector<hipGraphNode_t> nodeDependencies;
HIP_CHECK(hipGraphCreate(&graph, 0));
int *A_h{nullptr}, *A_d{nullptr}, *C_d{nullptr}, *C_h{nullptr};
HipTest::initArrays<int>(&A_d, &C_d, nullptr,
&A_h, &C_h, nullptr, SIZE, false);
hipGraphNode_t memcpyH2D, memcpyD2H, kernelNode;
// Create memcpy H2D nodes
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph, nullptr,
0, A_d, A_h, (sizeof(int)*SIZE), hipMemcpyHostToDevice));
nodeDependencies.push_back(memcpyH2D);
// Creating kernel node
hipKernelNodeParams kerNodeParams;
void* kernelArgs[] = {reinterpret_cast<void*>(&A_d),
reinterpret_cast<void*>(&C_d),
reinterpret_cast<void*>(&memSize)};
kerNodeParams.func = reinterpret_cast<void*>(HipTest::vector_square<int>);
kerNodeParams.gridDim = dim3(blocks);
kerNodeParams.blockDim = dim3(threadsPerBlock);
kerNodeParams.sharedMemBytes = 0;
kerNodeParams.kernelParams = reinterpret_cast<void**>(kernelArgs);
kerNodeParams.extra = nullptr;
HIP_CHECK(hipGraphAddKernelNode(&kernelNode, graph, nodeDependencies.data(),
nodeDependencies.size(), &kerNodeParams));
nodeDependencies.clear();
nodeDependencies.push_back(kernelNode);
// Create memcpy D2H nodes
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H, graph, nodeDependencies.data(),
nodeDependencies.size(), C_h, C_d, (sizeof(int)*SIZE),
hipMemcpyDeviceToHost));
nodeDependencies.clear();
// Create executable graph
hipStream_t streamForGraph;
hipGraphExec_t graphExec{nullptr};
HIP_CHECK(hipStreamCreate(&streamForGraph));
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr,
nullptr, 0));
// Execute graph
SECTION("Multiple Graph Launch") {
for (int iter = 0; iter < TEST_LOOP_SIZE; iter++) {
fillRandInpData(A_h, C_h, SIZE);
HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
HIP_CHECK(hipStreamSynchronize(streamForGraph));
validateOutData(A_h, C_h, SIZE);
}
}
SECTION("Graph launch on Null stream") {
for (int iter = 0; iter < TEST_LOOP_SIZE; iter++) {
fillRandInpData(A_h, C_h, SIZE);
HIP_CHECK(hipGraphLaunch(graphExec, 0));
HIP_CHECK(hipStreamSynchronize(0));
validateOutData(A_h, C_h, SIZE);
}
}
HIP_CHECK(hipGraphDestroy(graph));
HIP_CHECK(hipGraphExecDestroy(graphExec));
HIP_CHECK(hipStreamDestroy(streamForGraph));
// Free
HipTest::freeArrays<int>(A_d, C_d, nullptr, A_h, C_h, nullptr, false);
}
+412
파일 보기
@@ -0,0 +1,412 @@
/*
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip_test_common.hh>
#include <hip_test_checkers.hh>
#include <hip_test_kernels.hh>
/* Test verifies hipGraphLaunch API
Negative scenarios -
1) Pass pGraphExec as nullptr and verify the api returns an error code.
2) Pass pGraphExec as nullptr and stream as hipStreamPerThread and verify the api returns an error code.
3) Pass pGraphExec as an empty object and verify the api returns an error code.
4) Destroy the executable graph and try to launch it. The api should not crash and should return an error code.
5) Destroy the stream and try to launch the respective executable graph. The api should not crash and should return an error code.
6) Destroy the graph the executable graph was instantiated from and try to launch the executable graph.
The api should execute properly, without a crash or an error code.
Functional Scenario -
1) Check basic functionality with stream as hipStreamPerThread.
2) Test the hipGraphLaunch call on multiple devices.
3) Create a graph with multiple nodes. Create an executable graph.
Launch the executable graph 3 times in a stream, waiting for the stream
and validating the output after each launch. No issues should be observed.
4) Create a graph with multiple nodes. Create an executable graph.
Verify that the executable graph can be launched on the null stream.
*/
#define SIZE 1024
#define TEST_LOOP_SIZE 3
// Negative / lifetime scenarios for hipGraphLaunch. Each SECTION below checks
// the error code returned by one launch attempt made with an invalid or
// already-destroyed handle.
TEST_CASE("Unit_hipGraphLaunch_Negative") {
hipError_t ret;
// A null executable graph on a value-initialized stream handle is expected
// to be rejected with hipErrorInvalidValue.
SECTION("Pass pGraphExec as nullptr") {
hipStream_t stream{};
ret = hipGraphLaunch(nullptr, stream);
REQUIRE(hipErrorInvalidValue == ret);
}
// Same as above, but the stream argument is hipStreamPerThread.
SECTION("Pass pGraphExec as nullptr and stream as hipStreamPerThread") {
ret = hipGraphLaunch(nullptr, hipStreamPerThread);
REQUIRE(hipErrorInvalidValue == ret);
}
// A value-initialized hipGraphExec_t that was never instantiated is expected
// to be rejected with hipErrorInvalidValue.
SECTION("Pass pGraphExec as empty object") {
hipGraphExec_t graphExec{};
hipStream_t stream{};
ret = hipGraphLaunch(graphExec, stream);
REQUIRE(hipErrorInvalidValue == ret);
}
// Build a one-node (memset) graph, launch it once successfully, destroy the
// executable graph and then launch it again through the stale handle.
SECTION("Destroy executable graph and try to launch it") {
constexpr size_t Nbytes = 1024;
hipGraph_t graph;
hipGraphExec_t graphExec;
hipStream_t stream;
hipGraphNode_t memsetNode;
char *devData;
HIP_CHECK(hipMalloc(&devData, Nbytes));
HIP_CHECK(hipGraphCreate(&graph, 0));
HIP_CHECK(hipStreamCreate(&stream));
// Memset node parameters: zero-fill Nbytes single-byte elements at devData.
hipMemsetParams memsetParams{};
memset(&memsetParams, 0, sizeof(memsetParams));
memsetParams.dst = reinterpret_cast<void*>(devData);
memsetParams.value = 0;
memsetParams.pitch = 0;
memsetParams.elementSize = sizeof(char);
memsetParams.width = Nbytes;
memsetParams.height = 1;
HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0,
&memsetParams));
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
// First launch uses valid handles and must succeed.
HIP_CHECK(hipGraphLaunch(graphExec, stream));
HIP_CHECK(hipStreamSynchronize(stream));
HIP_CHECK(hipGraphExecDestroy(graphExec));
// Launch again through the destroyed executable graph handle; the test
// expects hipErrorInvalidValue.
ret = hipGraphLaunch(graphExec, stream);
REQUIRE(hipErrorInvalidValue == ret);
HIP_CHECK(hipFree(devData));
HIP_CHECK(hipGraphDestroy(graph));
HIP_CHECK(hipStreamDestroy(stream));
}
/* On a CUDA setup this api call returns unknown error (999), so the section
only requires that the result is not hipSuccess; that lets the same check
run on both AMD and CUDA. */
SECTION("Destroy stream and try to launch respective executable graph") {
constexpr size_t Nbytes = 1024;
hipGraph_t graph;
hipGraphExec_t graphExec;
hipStream_t stream;
hipGraphNode_t memsetNode;
char *devData;
HIP_CHECK(hipMalloc(&devData, Nbytes));
HIP_CHECK(hipGraphCreate(&graph, 0));
HIP_CHECK(hipStreamCreate(&stream));
// Memset node parameters: zero-fill Nbytes single-byte elements at devData.
hipMemsetParams memsetParams{};
memset(&memsetParams, 0, sizeof(memsetParams));
memsetParams.dst = reinterpret_cast<void*>(devData);
memsetParams.value = 0;
memsetParams.pitch = 0;
memsetParams.elementSize = sizeof(char);
memsetParams.width = Nbytes;
memsetParams.height = 1;
HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0,
&memsetParams));
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
// First launch uses valid handles and must succeed.
HIP_CHECK(hipGraphLaunch(graphExec, stream));
HIP_CHECK(hipStreamSynchronize(stream));
HIP_CHECK(hipStreamDestroy(stream));
// Launch again on the destroyed stream handle; any error code is accepted.
ret = hipGraphLaunch(graphExec, stream);
REQUIRE(hipSuccess != ret);
// The stream was already destroyed above, so only the remaining objects are
// released here.
HIP_CHECK(hipFree(devData));
HIP_CHECK(hipGraphExecDestroy(graphExec));
HIP_CHECK(hipGraphDestroy(graph));
}
// Destroying the source graph after instantiation is expected to leave the
// executable graph launchable: the second launch must return hipSuccess.
SECTION("Destroy graph and try to launch respective executable graph") {
constexpr size_t Nbytes = 1024;
hipGraph_t graph;
hipGraphExec_t graphExec;
hipStream_t stream;
hipGraphNode_t memsetNode;
char *devData;
HIP_CHECK(hipMalloc(&devData, Nbytes));
HIP_CHECK(hipGraphCreate(&graph, 0));
HIP_CHECK(hipStreamCreate(&stream));
// Memset node parameters: zero-fill Nbytes single-byte elements at devData.
hipMemsetParams memsetParams{};
memset(&memsetParams, 0, sizeof(memsetParams));
memsetParams.dst = reinterpret_cast<void*>(devData);
memsetParams.value = 0;
memsetParams.pitch = 0;
memsetParams.elementSize = sizeof(char);
memsetParams.width = Nbytes;
memsetParams.height = 1;
HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0,
&memsetParams));
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
// First launch uses valid handles and must succeed.
HIP_CHECK(hipGraphLaunch(graphExec, stream));
HIP_CHECK(hipStreamSynchronize(stream));
HIP_CHECK(hipGraphDestroy(graph));
// Launch again after destroying the source graph; success is required.
ret = hipGraphLaunch(graphExec, stream);
REQUIRE(hipSuccess == ret);
// The graph was already destroyed above, so only the remaining objects are
// released here.
HIP_CHECK(hipFree(devData));
HIP_CHECK(hipGraphExecDestroy(graphExec));
HIP_CHECK(hipStreamDestroy(stream));
}
}
/*
 * Functional test: launches an executable graph on hipStreamPerThread.
 *
 * The graph holds two memset nodes. The first zero-fills C_d; the second
 * depends on the first and fills A_d with updateVal. After the launch and a
 * synchronize on hipStreamPerThread, A_d is copied back to the host and every
 * byte is compared against updateVal.
 */
TEST_CASE("Unit_hipGraphLaunch_Functional_hipStreamPerThread") {
  constexpr size_t N = 1024;
  constexpr size_t Nbytes = N * sizeof(char);
  constexpr size_t val = 0;
  constexpr size_t updateVal = 2;
  char *A_d{nullptr}, *B_d{nullptr}, *C_d{nullptr};
  char *A_h{nullptr}, *B_h{nullptr};
  HipTest::initArrays<char>(&A_d, &B_d, &C_d,
                            &A_h, &B_h, nullptr, N, false);

  hipGraph_t graph;
  hipGraphExec_t graphExec;
  hipGraphNode_t memsetNode;
  HIP_CHECK(hipGraphCreate(&graph, 0));

  // First node: fill C_d with `val`. Value-initialization already zeroes the
  // parameter struct, so no separate memset of it is needed.
  hipMemsetParams memsetParams{};
  memsetParams.dst = reinterpret_cast<void*>(C_d);
  memsetParams.value = val;
  memsetParams.pitch = 0;
  memsetParams.elementSize = sizeof(char);
  memsetParams.width = Nbytes;
  memsetParams.height = 1;
  HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0,
                                  &memsetParams));
  std::vector<hipGraphNode_t> dependencies;
  dependencies.push_back(memsetNode);

  // Second node: fill A_d with `updateVal`, ordered after the first node.
  memsetParams = hipMemsetParams{};
  memsetParams.dst = reinterpret_cast<void*>(A_d);
  memsetParams.value = updateVal;
  memsetParams.pitch = 0;
  memsetParams.elementSize = sizeof(char);
  memsetParams.width = Nbytes;
  memsetParams.height = 1;
  HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, dependencies.data(),
                                  dependencies.size(), &memsetParams));
  // Re-applies the same parameters to the node just added; kept from the
  // original test (presumably to exercise hipGraphMemsetNodeSetParams too).
  HIP_CHECK(hipGraphMemsetNodeSetParams(memsetNode, &memsetParams));

  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
  HIP_CHECK(hipGraphLaunch(graphExec, hipStreamPerThread));
  HIP_CHECK(hipStreamSynchronize(hipStreamPerThread));
  HIP_CHECK(hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost));

  // Validating the result
  for (size_t i = 0; i < Nbytes; i++) {
    if (A_h[i] != updateVal) {
      // Cast to int so the byte is reported as a number; streaming the raw
      // char would emit an unprintable control character.
      WARN("Validation failed at- " << i << " A_h[i] "
                                    << static_cast<int>(A_h[i]));
      REQUIRE(false);
    }
  }

  HipTest::freeArrays<char>(A_d, B_d, C_d,
                            A_h, B_h, nullptr, false);
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipGraphDestroy(graph));
}
/*
 * Basic graph-launch scenario, run on whichever device is current when the
 * function is called.
 *
 * Builds a graph with two memset nodes (zero-fill C_d, then fill A_d with
 * updateVal), launches it on a stream created here, waits for the stream and
 * checks that every byte copied back from A_d equals updateVal. All objects
 * created by the function are released before it returns.
 */
static void hipGraphLaunch_test() {
  constexpr size_t N = 1024;
  constexpr size_t Nbytes = N * sizeof(char);
  constexpr size_t val = 0;
  constexpr size_t updateVal = 1;
  char *A_d{nullptr}, *B_d{nullptr}, *C_d{nullptr};
  char *A_h{nullptr}, *B_h{nullptr};
  HipTest::initArrays<char>(&A_d, &B_d, &C_d,
                            &A_h, &B_h, nullptr, N, false);

  hipGraph_t graph;
  hipGraphExec_t graphExec;
  hipStream_t streamForGraph;
  hipGraphNode_t memsetNode;
  HIP_CHECK(hipGraphCreate(&graph, 0));
  HIP_CHECK(hipStreamCreate(&streamForGraph));

  // First node: fill C_d with `val`. Value-initialization already zeroes the
  // parameter struct, so no separate memset of it is needed.
  hipMemsetParams memsetParams{};
  memsetParams.dst = reinterpret_cast<void*>(C_d);
  memsetParams.value = val;
  memsetParams.pitch = 0;
  memsetParams.elementSize = sizeof(char);
  memsetParams.width = Nbytes;
  memsetParams.height = 1;
  HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0,
                                  &memsetParams));
  std::vector<hipGraphNode_t> dependencies;
  dependencies.push_back(memsetNode);

  // Second node: fill A_d with `updateVal`, ordered after the first node.
  memsetParams = hipMemsetParams{};
  memsetParams.dst = reinterpret_cast<void*>(A_d);
  memsetParams.value = updateVal;
  memsetParams.pitch = 0;
  memsetParams.elementSize = sizeof(char);
  memsetParams.width = Nbytes;
  memsetParams.height = 1;
  HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, dependencies.data(),
                                  dependencies.size(), &memsetParams));
  // Re-applies the same parameters to the node just added; kept from the
  // original test (presumably to exercise hipGraphMemsetNodeSetParams too).
  HIP_CHECK(hipGraphMemsetNodeSetParams(memsetNode, &memsetParams));

  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
  HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph));
  HIP_CHECK(hipStreamSynchronize(streamForGraph));
  HIP_CHECK(hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost));

  // Validating the result
  for (size_t i = 0; i < Nbytes; i++) {
    if (A_h[i] != updateVal) {
      // Cast to int so the byte is reported as a number; streaming the raw
      // char would emit an unprintable control character.
      WARN("Validation failed at- " << i << " A_h[i] "
                                    << static_cast<int>(A_h[i]));
      REQUIRE(false);
    }
  }

  HipTest::freeArrays<char>(A_d, B_d, C_d,
                            A_h, B_h, nullptr, false);
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipGraphDestroy(graph));
  HIP_CHECK(hipStreamDestroy(streamForGraph));
}
/*
 * Runs the basic graph-launch scenario (hipGraphLaunch_test) once per device:
 * every device reported by hipGetDeviceCount is made current in turn before
 * the scenario is executed. When no device is reported the test case is
 * marked as succeeded without running anything.
 */
TEST_CASE("Unit_hipGraphLaunch_Functional_multidevice_test") {
  int deviceCount = 0;
  HIP_CHECK(hipGetDeviceCount(&deviceCount));

  // Nothing to exercise without at least one device.
  if (deviceCount <= 0) {
    SUCCEED("Skipped the testcase as there is no device to test.");
    return;
  }

  int deviceId = 0;
  while (deviceId < deviceCount) {
    HIP_CHECK(hipSetDevice(deviceId));
    hipGraphLaunch_test();
    ++deviceId;
  }
}
// Fills the first N elements of both host buffers with pseudo-random values
// masked to the range [0, 255]. The generator is seeded from the current
// time; values are drawn alternately for A1_h[i] and A2_h[i].
static void fillRandInpData(int *A1_h, int *A2_h, size_t N) {
  unsigned int seed = time(nullptr);
  size_t idx = 0;
  while (idx < N) {
    A1_h[idx] = (HipTest::RAND_R(&seed) & 0xFF);
    A2_h[idx] = (HipTest::RAND_R(&seed) & 0xFF);
    ++idx;
  }
}
// Checks that every one of the first N elements of A2_h equals the square of
// the matching element of A1_h. Each element is verified with its own REQUIRE,
// so the first mismatch fails the enclosing test case.
static void validateOutData(int *A1_h, int *A2_h, size_t N) {
  size_t i = 0;
  while (i < N) {
    const int result = A1_h[i] * A1_h[i];
    REQUIRE(result == A2_h[i]);
    ++i;
  }
}
/*
 * Builds a three-node graph (host-to-device copy -> vector_square kernel ->
 * device-to-host copy), instantiates it once and then launches the same
 * executable graph TEST_LOOP_SIZE times. Every launch is followed by a stream
 * synchronize and a validation of the output.
 * 1. "Multiple Graph Launch": the launches go to a stream created by the test.
 * 2. "Graph launch on Null stream": the launches go to the null stream.
 */
TEST_CASE("Unit_hipGraphLaunch_Functional_MultipleLaunch") {
  constexpr auto blocksPerCU = 6;  // to hide latency
  constexpr auto threadsPerBlock = 256;
  constexpr size_t copyBytes = sizeof(int) * SIZE;
  size_t elemCount = SIZE;  // handed to the kernel by address
  const unsigned gridBlocks =
      HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, SIZE);

  hipGraph_t graph;
  HIP_CHECK(hipGraphCreate(&graph, 0));

  int *A_h{nullptr}, *A_d{nullptr}, *C_d{nullptr}, *C_h{nullptr};
  HipTest::initArrays<int>(&A_d, &C_d, nullptr, &A_h, &C_h, nullptr, SIZE,
                           false);

  hipGraphNode_t memcpyH2D, kernelNode, memcpyD2H;

  // Node 1: copy the input from host to device (no dependencies).
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph, nullptr, 0, A_d, A_h,
                                    copyBytes, hipMemcpyHostToDevice));

  // Node 2: square the input on the device; runs after the H2D copy.
  void* kernelArgs[] = {&A_d, &C_d, &elemCount};
  hipKernelNodeParams kerNodeParams;
  kerNodeParams.func = reinterpret_cast<void*>(HipTest::vector_square<int>);
  kerNodeParams.gridDim = dim3(gridBlocks);
  kerNodeParams.blockDim = dim3(threadsPerBlock);
  kerNodeParams.sharedMemBytes = 0;
  kerNodeParams.kernelParams = kernelArgs;
  kerNodeParams.extra = nullptr;
  HIP_CHECK(hipGraphAddKernelNode(&kernelNode, graph, &memcpyH2D, 1,
                                  &kerNodeParams));

  // Node 3: copy the result back to the host; runs after the kernel.
  HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H, graph, &kernelNode, 1, C_h,
                                    C_d, copyBytes, hipMemcpyDeviceToHost));

  // Create executable graph
  hipStream_t streamForGraph;
  hipGraphExec_t graphExec{nullptr};
  HIP_CHECK(hipStreamCreate(&streamForGraph));
  HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));

  // Launches the executable graph TEST_LOOP_SIZE times on the given stream,
  // refreshing the host buffers before and validating them after each launch.
  const auto launchAndValidate = [&](hipStream_t launchStream) {
    for (int iter = 0; iter < TEST_LOOP_SIZE; iter++) {
      fillRandInpData(A_h, C_h, SIZE);
      HIP_CHECK(hipGraphLaunch(graphExec, launchStream));
      HIP_CHECK(hipStreamSynchronize(launchStream));
      validateOutData(A_h, C_h, SIZE);
    }
  };

  SECTION("Multiple Graph Launch") { launchAndValidate(streamForGraph); }
  SECTION("Graph launch on Null stream") { launchAndValidate(nullptr); }

  // Release graph objects, the stream and the host/device buffers.
  HIP_CHECK(hipGraphDestroy(graph));
  HIP_CHECK(hipGraphExecDestroy(graphExec));
  HIP_CHECK(hipStreamDestroy(streamForGraph));
  HipTest::freeArrays<int>(A_d, C_d, nullptr, A_h, C_h, nullptr, false);
}
+16 -6
파일 보기
@@ -261,9 +261,10 @@ TEST_CASE("Unit_hipGraphUpload_Functional_With_Priority_Stream") {
1) Pass graphExec node as nullptr.
2) Pass graphExec node as uninitialize object
3) Pass stream as uninitialize object
4) Graphexec is destroyed before upload
*/
TEST_CASE("Unit_hipGraphUpload_Negative_Argument_Check") {
TEST_CASE("Unit_hipGraphUpload_Negative_Parameters") {
hipGraphExec_t graphExec{};
hipError_t ret;
@@ -271,21 +272,30 @@ TEST_CASE("Unit_hipGraphUpload_Negative_Argument_Check") {
HIP_CHECK(hipStreamCreate(&stream));
SECTION("Pass graphExec node as nullptr") {
ret = hipGraphUpload(nullptr, stream);
REQUIRE(hipErrorInvalidValue == ret);
HIP_CHECK_ERROR(hipGraphUpload(nullptr, stream), hipErrorInvalidValue);
}
SECTION("Pass graphExec node as uninitialize object") {
ret = hipGraphUpload(graphExec, stream);
REQUIRE(hipErrorInvalidValue == ret);
HIP_CHECK_ERROR(hipGraphUpload(graphExec, stream), hipErrorInvalidValue);
}
SECTION("Pass stream as uninitialize object") {
hipStream_t stream1{};
hipGraph_t graph;
HIP_CHECK(hipGraphCreate(&graph, 0));
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0));
ret = hipGraphUpload(graphExec, stream1);
REQUIRE(hipSuccess == ret);
}
SECTION("graphExec is destroyed"){
hipGraphExec_t graph_exec;
hipGraph_t graph;
HIP_CHECK(hipGraphCreate(&graph, 0));
HIP_CHECK(hipGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0));
HIP_CHECK(hipGraphUpload(graph_exec, hipStreamPerThread));
HIP_CHECK(hipGraphExecDestroy(graph_exec));
HIP_CHECK_ERROR(hipGraphUpload(graph_exec, hipStreamPerThread), hipErrorInvalidValue);
}
HIP_CHECK(hipStreamDestroy(stream));
}
+14
파일 보기
@@ -4,9 +4,23 @@ set(TEST_SRC
hipOccupancyMaxActiveBlocksPerMultiprocessor_old.cc
hipOccupancyMaxPotentialBlockSize.cc
hipOccupancyMaxPotentialBlockSize_old.cc
hipModuleOccupancyMaxPotentialBlockSize.cc
hipModuleOccupancyMaxPotentialBlockSizeWithFlags.cc
hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.cc
hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.cc
hipOccupancyMaxPotentialBlockSizeVariableSMemWithFlags.cc
)
# Build rule for simple_kernel.code: runs the configured C++ compiler with
# --genco on simple_kernel.cc and writes the result to simple_kernel.code.
# The rule is re-run whenever simple_kernel.cc changes.
# NOTE(review): the -o path is relative while OUTPUT is under
# CMAKE_CURRENT_BINARY_DIR - this presumably relies on the command's default
# working directory being the current binary dir; confirm.
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/simple_kernel.code
COMMAND ${CMAKE_CXX_COMPILER} --genco --std=c++17
${CMAKE_CURRENT_SOURCE_DIR}/simple_kernel.cc
-o simple_kernel.code --rocm-path=${ROCM_PATH}
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/simple_kernel.cc)
# Target that drives the rule above; ALL makes it part of the default build.
add_custom_target(simple_kernel ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/simple_kernel.code)
# Test executable built from the sources listed in TEST_SRC.
hip_add_exe_to_target(NAME OccupancyTest
TEST_SRC ${TEST_SRC}
TEST_TARGET_NAME build_tests)
# The module tests load "simple_kernel.code" at run time, so make sure it is
# generated whenever OccupancyTest is built.
add_dependencies(OccupancyTest simple_kernel)
@@ -0,0 +1,92 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/*
Testcase Scenarios :
Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_Positive_RangeValidation - Tests correct
execution of hipModuleOccupancyMaxActiveBlocksPerMultiprocessor for different parameter values
Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_Negative_Parameters - Tests that
hipModuleOccupancyMaxActiveBlocksPerMultiprocessor fails when its parameters are invalid
*/
#include "occupancy_common.hh"
/*
 * Negative-parameter test for hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.
 *
 * Loads "SimpleKernel" from simple_kernel.code, queries a valid block size for
 * it and then hands the API, wrapped in a lambda, to the shared
 * MaxActiveBlocksPerMultiprocessorNegative helper, which performs the
 * invalid-argument checks.
 */
TEST_CASE("Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_Negative_Parameters") {
  hipModule_t module;
  hipFunction_t function;
  int blockSize = 0;
  int gridSize = 0;

  // Issued before any module call - presumably to force runtime
  // initialization; confirm against the other module tests.
  HIP_CHECK(hipFree(nullptr));
  HIP_CHECK(hipModuleLoad(&module, "simple_kernel.code"));
  // Same HIP_CHECK macro as every other call in this test.
  HIP_CHECK(hipModuleGetFunction(&function, module, "SimpleKernel"));

  // Get potential blocksize
  HIP_CHECK(hipModuleOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, function, 0, 0));

  // Common negative tests. The lambda parameters are named differently from
  // the locals above so that nothing is shadowed.
  MaxActiveBlocksPerMultiprocessorNegative(
      [&function](int* numBlocksOut, int blockSizeArg, size_t dynSharedMemPerBlk) {
        return hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(numBlocksOut, function,
                                                                  blockSizeArg,
                                                                  dynSharedMemPerBlk);
      },
      blockSize);

  HIP_CHECK(hipModuleUnload(module));
}
/*
 * Positive test for hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.
 *
 * Loads "SimpleKernel" from simple_kernel.code and, for two dynamic
 * shared-memory sizes (0 and the device's sharedMemPerBlock), queries a block
 * size and passes the API, wrapped in a lambda, to the shared
 * MaxActiveBlocksPerMultiprocessor helper together with the device's
 * maxThreadsPerMultiProcessor.
 */
TEST_CASE("Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_Positive_RangeValidation") {
  hipDeviceProp_t devProp;
  hipModule_t module;
  hipFunction_t function;
  int blockSize = 0;
  int gridSize = 0;

  // Issued before any module call - presumably to force runtime
  // initialization; confirm against the other module tests.
  HIP_CHECK(hipFree(nullptr));
  HIP_CHECK(hipModuleLoad(&module, "simple_kernel.code"));
  // Same HIP_CHECK macro as every other call in this test.
  HIP_CHECK(hipModuleGetFunction(&function, module, "SimpleKernel"));
  HIP_CHECK(hipGetDeviceProperties(&devProp, 0));

  SECTION("dynSharedMemPerBlk = 0") {
    // Get potential blocksize
    HIP_CHECK(hipModuleOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, function, 0, 0));

    MaxActiveBlocksPerMultiprocessor(
        [blockSize, &function](int* numBlocks) {
          return hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, function,
                                                                    blockSize, 0);
        },
        blockSize, devProp.maxThreadsPerMultiProcessor);
  }

  SECTION("dynSharedMemPerBlk = sharedMemPerBlock") {
    // Capture only the field that is needed instead of copying the whole
    // hipDeviceProp_t into the closure.
    const auto sharedMemPerBlock = devProp.sharedMemPerBlock;

    // Get potential blocksize
    HIP_CHECK(hipModuleOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, function,
                                                      sharedMemPerBlock, 0));

    MaxActiveBlocksPerMultiprocessor(
        [blockSize, sharedMemPerBlock, &function](int* numBlocks) {
          return hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, function,
                                                                    blockSize,
                                                                    sharedMemPerBlock);
        },
        blockSize, devProp.maxThreadsPerMultiProcessor);
  }

  HIP_CHECK(hipModuleUnload(module));
}
@@ -0,0 +1,103 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/*
Testcase Scenarios :
Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_Positive_RangeValidation - Tests
correct execution of hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags for different
parameter values
Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_Negative_Parameters - Tests
that hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags fails when its
parameters are invalid
*/
#include "occupancy_common.hh"
/*
 * Negative-parameter test for
 * hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.
 *
 * Runs the shared MaxActiveBlocksPerMultiprocessorNegative checks with the
 * API called using hipOccupancyDefault, and additionally verifies that a flag
 * value of 2 is rejected with hipErrorInvalidValue.
 */
TEST_CASE("Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_Negative_Parameters") {
  hipModule_t module;
  hipFunction_t function;
  int numBlocks = 0;
  int blockSize = 0;
  int gridSize = 0;

  // Issued before any module call - presumably to force runtime
  // initialization; confirm against the other module tests.
  HIP_CHECK(hipFree(nullptr));
  HIP_CHECK(hipModuleLoad(&module, "simple_kernel.code"));
  // Same HIP_CHECK macro as every other call in this test.
  HIP_CHECK(hipModuleGetFunction(&function, module, "SimpleKernel"));

  // Get potential blocksize
  HIP_CHECK(hipModuleOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, function, 0, 0));

  // Common negative tests. The lambda parameters are named differently from
  // the locals above so that nothing is shadowed.
  MaxActiveBlocksPerMultiprocessorNegative(
      [&function](int* numBlocksOut, int blockSizeArg, size_t dynSharedMemPerBlk) {
        return hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
            numBlocksOut, function, blockSizeArg, dynSharedMemPerBlk, hipOccupancyDefault);
      },
      blockSize);

  SECTION("Flag is invalid") {
    // Only default flag is supported
    HIP_CHECK_ERROR(hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
                        &numBlocks, function, blockSize, 0, 2),
                    hipErrorInvalidValue);
  }

  HIP_CHECK(hipModuleUnload(module));
}
/*
 * Positive test for hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.
 *
 * Loads "SimpleKernel" from simple_kernel.code and, for two dynamic
 * shared-memory sizes (0 and the device's sharedMemPerBlock), queries a block
 * size and passes the API (called with hipOccupancyDefault), wrapped in a
 * lambda, to the shared MaxActiveBlocksPerMultiprocessor helper together with
 * the device's maxThreadsPerMultiProcessor.
 */
TEST_CASE(
    "Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_Positive_RangeValidation") {
  hipDeviceProp_t devProp;
  hipModule_t module;
  hipFunction_t function;
  int blockSize = 0;
  int gridSize = 0;

  // Issued before any module call - presumably to force runtime
  // initialization; confirm against the other module tests.
  HIP_CHECK(hipFree(nullptr));
  HIP_CHECK(hipModuleLoad(&module, "simple_kernel.code"));
  // Same HIP_CHECK macro as every other call in this test.
  HIP_CHECK(hipModuleGetFunction(&function, module, "SimpleKernel"));
  HIP_CHECK(hipGetDeviceProperties(&devProp, 0));

  SECTION("dynSharedMemPerBlk = 0") {
    // Get potential blocksize
    HIP_CHECK(hipModuleOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, function, 0, 0));

    MaxActiveBlocksPerMultiprocessor(
        [blockSize, &function](int* numBlocks) {
          return hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
              numBlocks, function, blockSize, 0, hipOccupancyDefault);
        },
        blockSize, devProp.maxThreadsPerMultiProcessor);
  }

  SECTION("dynSharedMemPerBlk = sharedMemPerBlock") {
    // Capture only the field that is needed instead of copying the whole
    // hipDeviceProp_t into the closure.
    const auto sharedMemPerBlock = devProp.sharedMemPerBlock;

    // Get potential blocksize
    HIP_CHECK(hipModuleOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, function,
                                                      sharedMemPerBlock, 0));

    MaxActiveBlocksPerMultiprocessor(
        [blockSize, sharedMemPerBlock, &function](int* numBlocks) {
          return hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
              numBlocks, function, blockSize, sharedMemPerBlock, hipOccupancyDefault);
        },
        blockSize, devProp.maxThreadsPerMultiProcessor);
  }

  HIP_CHECK(hipModuleUnload(module));
}
@@ -0,0 +1,75 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/*
Testcase Scenarios :
Unit_hipModuleOccupancyMaxPotentialBlockSize_Positive_RangeValidation - Tests correct execution of
hipModuleOccupancyMaxPotentialBlockSize for different parameter values
Unit_hipModuleOccupancyMaxPotentialBlockSize_Negative_Parameters - Tests that
hipModuleOccupancyMaxPotentialBlockSize fails when its parameters are invalid
*/
#include "occupancy_common.hh"
/*
 * Negative-parameter test for hipModuleOccupancyMaxPotentialBlockSize.
 *
 * Loads "SimpleKernel" from simple_kernel.code and hands the API, wrapped in
 * a lambda, to the shared MaxPotentialBlockSizeNegative helper, which performs
 * the invalid-argument checks.
 */
TEST_CASE("Unit_hipModuleOccupancyMaxPotentialBlockSize_Negative_Parameters") {
  hipModule_t module;
  hipFunction_t function;

  // Issued before any module call - presumably to force runtime
  // initialization; confirm against the other module tests.
  HIP_CHECK(hipFree(nullptr));
  HIP_CHECK(hipModuleLoad(&module, "simple_kernel.code"));
  // Same HIP_CHECK macro as every other call in this test.
  HIP_CHECK(hipModuleGetFunction(&function, module, "SimpleKernel"));

  // Common negative tests
  MaxPotentialBlockSizeNegative([&function](int* gridSize, int* blockSize) {
    return hipModuleOccupancyMaxPotentialBlockSize(gridSize, blockSize, function, 0, 0);
  });

  HIP_CHECK(hipModuleUnload(module));
}
/*
 * Positive test for hipModuleOccupancyMaxPotentialBlockSize.
 *
 * Loads "SimpleKernel" from simple_kernel.code and passes the API, wrapped in
 * a lambda, to the shared MaxPotentialBlockSize helper together with the
 * device's maxThreadsPerBlock. Two argument combinations are covered: no
 * dynamic shared memory with no block-size limit, and sharedMemPerBlock with a
 * limit of maxThreadsPerBlock.
 */
TEST_CASE("Unit_hipModuleOccupancyMaxPotentialBlockSize_Positive_RangeValidation") {
  hipDeviceProp_t devProp;
  hipModule_t module;
  hipFunction_t function;

  // Issued before any module call - presumably to force runtime
  // initialization; confirm against the other module tests.
  HIP_CHECK(hipFree(nullptr));
  HIP_CHECK(hipModuleLoad(&module, "simple_kernel.code"));
  // Same HIP_CHECK macro as every other call in this test.
  HIP_CHECK(hipModuleGetFunction(&function, module, "SimpleKernel"));
  HIP_CHECK(hipGetDeviceProperties(&devProp, 0));

  SECTION("dynSharedMemPerBlk = 0, blockSizeLimit = 0") {
    MaxPotentialBlockSize(
        [&function](int* gridSize, int* blockSize) {
          return hipModuleOccupancyMaxPotentialBlockSize(gridSize, blockSize, function, 0, 0);
        },
        devProp.maxThreadsPerBlock);
  }

  SECTION("dynSharedMemPerBlk = sharedMemPerBlock, blockSizeLimit = maxThreadsPerBlock") {
    // Capture only the two fields that are needed instead of copying the
    // whole hipDeviceProp_t into the closure.
    const auto sharedMemPerBlock = devProp.sharedMemPerBlock;
    const auto maxThreadsPerBlock = devProp.maxThreadsPerBlock;

    MaxPotentialBlockSize(
        [&function, sharedMemPerBlock, maxThreadsPerBlock](int* gridSize, int* blockSize) {
          return hipModuleOccupancyMaxPotentialBlockSize(gridSize, blockSize, function,
                                                         sharedMemPerBlock, maxThreadsPerBlock);
        },
        devProp.maxThreadsPerBlock);
  }

  HIP_CHECK(hipModuleUnload(module));
}
@@ -0,0 +1,87 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/*
Testcase Scenarios :
Unit_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_Positive_RangeValidation - Tests correct
execution of hipModuleOccupancyMaxPotentialBlockSizeWithFlags for different parameter values
Unit_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_Negative_Parameters - Tests that
hipModuleOccupancyMaxPotentialBlockSizeWithFlags fails when its parameters are invalid
*/
#include "occupancy_common.hh"
/*
 * Negative-parameter test for hipModuleOccupancyMaxPotentialBlockSizeWithFlags.
 *
 * Runs the shared MaxPotentialBlockSizeNegative checks with the API called
 * using hipOccupancyDefault, and additionally verifies that a flag value of 2
 * is rejected with hipErrorInvalidValue.
 */
TEST_CASE("Unit_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_Negative_Parameters") {
  hipModule_t module;
  hipFunction_t function;
  int blockSize = 0;
  int gridSize = 0;

  // Issued before any module call - presumably to force runtime
  // initialization; confirm against the other module tests.
  HIP_CHECK(hipFree(nullptr));
  HIP_CHECK(hipModuleLoad(&module, "simple_kernel.code"));
  // Same HIP_CHECK macro as every other call in this test.
  HIP_CHECK(hipModuleGetFunction(&function, module, "SimpleKernel"));

  // Common negative tests. The lambda parameters are named differently from
  // the locals above so that nothing is shadowed.
  MaxPotentialBlockSizeNegative([&function](int* gridSizeOut, int* blockSizeOut) {
    return hipModuleOccupancyMaxPotentialBlockSizeWithFlags(gridSizeOut, blockSizeOut, function,
                                                            0, 0, hipOccupancyDefault);
  });

  SECTION("Flag is invalid") {
    // Only default flag is supported
    HIP_CHECK_ERROR(
        hipModuleOccupancyMaxPotentialBlockSizeWithFlags(&gridSize, &blockSize, function, 0, 0, 2),
        hipErrorInvalidValue);
  }

  HIP_CHECK(hipModuleUnload(module));
}
TEST_CASE("Unit_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_Positive_RangeValidation") {
hipDeviceProp_t devProp;
hipModule_t module;
hipFunction_t function;
HIP_CHECK(hipFree(nullptr));
HIP_CHECK(hipModuleLoad(&module, "simple_kernel.code"));
HIPCHECK(hipModuleGetFunction(&function, module, "SimpleKernel"));
HIP_CHECK(hipGetDeviceProperties(&devProp, 0));
SECTION("dynSharedMemPerBlk = 0, blockSizeLimit = 0") {
MaxPotentialBlockSize(
[&function](int* gridSize, int* blockSize) {
return hipModuleOccupancyMaxPotentialBlockSizeWithFlags(gridSize, blockSize, function, 0,
0, hipOccupancyDefault);
},
devProp.maxThreadsPerBlock);
}
SECTION("dynSharedMemPerBlk = sharedMemPerBlock, blockSizeLimit = maxThreadsPerBlock") {
MaxPotentialBlockSize(
[&function, devProp](int* gridSize, int* blockSize) {
return hipModuleOccupancyMaxPotentialBlockSizeWithFlags(
gridSize, blockSize, function, devProp.sharedMemPerBlock, devProp.maxThreadsPerBlock,
hipOccupancyDefault);
},
devProp.maxThreadsPerBlock);
}
HIP_CHECK(hipModuleUnload(module));
}
+1 -3
파일 보기
@@ -66,7 +66,5 @@ template <typename F> void MaxActiveBlocksPerMultiprocessorNegative(F func, int
SECTION("numBlocks is nullptr") {
HIP_CHECK_ERROR(func(nullptr, blockSize, 0), hipErrorInvalidValue);
}
SECTION("Block size is 0") {
HIP_CHECK_ERROR(func(&numBlocks, 0, 0), hipErrorInvalidValue);
}
SECTION("Block size is 0") { HIP_CHECK_ERROR(func(&numBlocks, 0, 0), hipErrorInvalidValue); }
}
+25
파일 보기
@@ -0,0 +1,25 @@
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "hip/hip_runtime.h"
extern "C" __global__ void SimpleKernel(int* a, int* b) {
int tx = threadIdx.x;
b[tx] = a[tx];
}
+9
파일 보기
@@ -0,0 +1,9 @@
# Common tests - sources that do not depend on a particular platform.
# The list holds the warp shuffle (warp_shfl.cc) and warp shuffle-xor (warp_shfl_xor.cc) tests.
set(TEST_SRC
warp_shfl_xor.cc
warp_shfl.cc
)
# Hand the sources to the project's hip_add_exe_to_target helper under the name WarpTest,
# with build_tests passed as TEST_TARGET_NAME.
hip_add_exe_to_target(NAME WarpTest
TEST_SRC ${TEST_SRC}
TEST_TARGET_NAME build_tests)
+84
파일 보기
@@ -0,0 +1,84 @@
/*
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include <hip_test_common.hh>
#include <hip/hip_cooperative_groups.h>
static __device__ bool deactivate_thread(const uint64_t* const active_masks) {
const auto warp =
cooperative_groups::tiled_partition(cooperative_groups::this_thread_block(), warpSize);
const auto block = cooperative_groups::this_thread_block();
const auto warps_per_block = (block.size() + warpSize - 1) / warpSize;
const auto block_rank = (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
const auto idx = block_rank * warps_per_block + block.thread_rank() / warpSize;
return !(active_masks[idx] & (static_cast<uint64_t>(1) << warp.thread_rank()));
}
static inline std::mt19937& GetRandomGenerator() {
static std::mt19937 mt(std::random_device{}());
return mt;
}
template <typename T> static inline T GenerateRandomInteger(const T min, const T max) {
std::uniform_int_distribution<T> dist(min, max);
return dist(GetRandomGenerator());
}
template <typename T> static inline T GenerateRandomReal(const T min, const T max) {
std::uniform_real_distribution<T> dist(min, max);
return dist(GetRandomGenerator());
}
inline int generate_width(int warp_size) {
int exponent = 0;
while (warp_size >>= 1) {
++exponent;
}
return GENERATE_COPY(map([](int e) { return 1 << e; }, range(1, exponent + 1)));
}
inline uint64_t get_active_mask(unsigned int warp_id, unsigned int warp_size) {
uint64_t active_mask = 0;
switch (warp_id % 5) {
case 0: // even threads in the warp
active_mask = 0xAAAAAAAAAAAAAAAA;
break;
case 1: // odd threads in the warp
active_mask = 0x5555555555555555;
break;
case 2: // first half of the warp
for (int i = 0; i < warp_size / 2; i++) {
active_mask = active_mask | (static_cast<uint64_t>(1) << i);
}
break;
case 3: // second half of the warp
for (int i = warp_size / 2; i < warp_size; i++) {
active_mask = active_mask | (static_cast<uint64_t>(1) << i);
}
break;
case 4: // all threads
active_mask = 0xFFFFFFFFFFFFFFFF;
break;
}
return active_mask;
}
+121
파일 보기
@@ -0,0 +1,121 @@
/*
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "warp_shfl_common.hh"
#include <bitset>
/**
* @addtogroup shfl shfl
* @{
* @ingroup DeviceLanguageTest
* `T __shfl(T var, int src_lane, int width = warpSize)` -
* Contains unit test for warp shfl function
*/
namespace cg = cooperative_groups;
template <typename T>
__global__ void shfl(T* const out, const T* const in, const uint64_t* const active_masks,
const uint8_t* const src_lanes, const int width) {
if (deactivate_thread(active_masks)) {
return;
}
const auto grid = cg::this_grid();
const auto block = cg::this_thread_block();
T var = in[grid.thread_rank()];
out[grid.thread_rank()] = __shfl(var, src_lanes[block.thread_rank() % width], width);
}
template <typename T> class WarpShfl : public WarpShflTest<WarpShfl<T>, T> {
public:
void launch_kernel(T* const arr_dev, T* const input_dev, const uint64_t* const active_masks) {
width_ = generate_width(this->warp_size_);
INFO("Width: " << width_);
const auto alloc_size = width_ * sizeof(uint8_t);
LinearAllocGuard<uint8_t> src_lanes_dev(LinearAllocs::hipMalloc, alloc_size);
src_lanes_.resize(width_);
std::generate(src_lanes_.begin(), src_lanes_.end(),
[this] { return GenerateRandomInteger(0, static_cast<int>(2 * width_)); });
HIP_CHECK(hipMemcpy(src_lanes_dev.ptr(), src_lanes_.data(), alloc_size, hipMemcpyHostToDevice));
shfl<<<this->grid_.grid_dim_, this->grid_.block_dim_>>>(arr_dev, input_dev, active_masks,
src_lanes_dev.ptr(), width_);
}
void validate(const T* const arr, const T* const input) {
ArrayAllOf(arr, this->grid_.thread_count_, [this, &input](unsigned int i) -> std::optional<T> {
const auto rank_in_block = this->grid_.thread_rank_in_block(i).value();
const auto rank_in_warp = rank_in_block % this->warp_size_;
const auto rank_in_partition = rank_in_block % width_;
const int src_lane = src_lanes_[rank_in_partition] % width_;
const int src_offset = src_lane - rank_in_partition;
const auto mask_idx = this->warps_in_block_ * (i / this->grid_.threads_in_block_count_) +
rank_in_block / this->warp_size_;
const std::bitset<sizeof(uint64_t) * 8> active_mask(this->active_masks_[mask_idx]);
if (!active_mask.test(rank_in_warp) || (!active_mask.test((rank_in_warp + src_offset))) ||
(rank_in_block + src_offset >= this->grid_.threads_in_block_count_)) {
return std::nullopt;
}
return input[i + src_offset];
});
};
private:
std::vector<uint8_t> src_lanes_;
int width_;
};
/**
* Test Description
* ------------------------
* - Validates the warp shuffle behavior for all valid width sizes {2, 4, 8, 16, 32,
* 64(if supported)} for generated shuffle target lanes. The threads are deactivated based on the
* passed active mask. The test is run for all overloads of shfl.
* Test source
* ------------------------
* - unit/warp/warp_shfl.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
* - Device supports warp shuffle
*/
TEMPLATE_TEST_CASE("Unit_Warp_Shfl_Positive_Basic", "", int, unsigned int, long, unsigned long,
long long, unsigned long long, float, double) {
int device;
hipDeviceProp_t device_properties;
HIP_CHECK(hipGetDevice(&device));
HIP_CHECK(hipGetDeviceProperties(&device_properties, device));
if (!device_properties.arch.hasWarpShuffle) {
HipTest::HIP_SKIP_TEST("Device doesn't support Warp Shuffle!");
return;
}
SECTION("Shfl with specified active mask and input values") {
WarpShfl<TestType>().run(false);
}
SECTION("Shfl with random active mask and input values") {
WarpShfl<TestType>().run(true);
}
}
+114
파일 보기
@@ -0,0 +1,114 @@
/*
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include "warp_common.hh"
#include <cpu_grid.h>
#include <resource_guards.hh>
#include <utils.hh>
template <typename Derived, typename T> class WarpShflTest {
public:
WarpShflTest() : warp_size_{get_warp_size()} {}
void run(bool random = false) {
const auto blocks = GenerateBlockDimensionsForShuffle();
INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
const auto threads = GenerateThreadDimensionsForShuffle();
INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
grid_ = CPUGrid(blocks, threads);
const auto alloc_size = grid_.thread_count_ * sizeof(T);
LinearAllocGuard<T> input_dev(LinearAllocs::hipMalloc, alloc_size);
LinearAllocGuard<T> input(LinearAllocs::hipHostMalloc, alloc_size);
LinearAllocGuard<T> arr_dev(LinearAllocs::hipMalloc, alloc_size);
LinearAllocGuard<T> arr(LinearAllocs::hipHostMalloc, alloc_size);
HIP_CHECK(hipMemset(arr_dev.ptr(), 0, alloc_size));
warps_in_block_ = (grid_.threads_in_block_count_ + warp_size_ - 1) / warp_size_;
const auto warps_in_grid = warps_in_block_ * grid_.block_count_;
LinearAllocGuard<uint64_t> active_masks_dev(LinearAllocs::hipMalloc,
warps_in_grid * sizeof(uint64_t));
active_masks_.resize(warps_in_grid);
generate_input(input.ptr(), random);
HIP_CHECK(hipMemcpy(active_masks_dev.ptr(), active_masks_.data(),
warps_in_grid * sizeof(uint64_t), hipMemcpyHostToDevice));
HIP_CHECK(hipMemcpy(input_dev.ptr(), input.ptr(), alloc_size, hipMemcpyHostToDevice));
cast_to_derived().launch_kernel(arr_dev.ptr(), input_dev.ptr(), active_masks_dev.ptr());
HIP_CHECK(hipGetLastError());
HIP_CHECK(hipMemcpy(arr.ptr(), arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost));
HIP_CHECK(hipDeviceSynchronize());
cast_to_derived().validate(arr.ptr(), input.ptr());
}
private:
int get_warp_size() const {
int current_dev = -1;
HIP_CHECK(hipGetDevice(&current_dev));
int warp_size = 0u;
HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
return warp_size;
}
void generate_input(T* input, bool random) {
if (random) {
std::generate(active_masks_.begin(), active_masks_.end(), [] {
return GenerateRandomInteger<unsigned long long>(0ul, std::numeric_limits<uint64_t>().max());
});
if constexpr (std::is_same_v<float, T> || std::is_same_v<double, T>) {
std::generate_n(input, grid_.thread_count_, [] {
return static_cast<T>(
GenerateRandomReal(std::numeric_limits<T>().min(), std::numeric_limits<T>().max()));
});
} else {
std::generate_n(input, grid_.thread_count_, [] {
return static_cast<T>(GenerateRandomInteger(std::numeric_limits<T>().min(),
std::numeric_limits<T>().max()));
});
}
} else {
unsigned long long int i = 0;
std::generate(active_masks_.begin(), active_masks_.end(),
[this, &i]() { return get_active_mask(i++, warp_size_); });
i = 0;
std::generate_n(input, grid_.thread_count_, [&i]() {
if (static_cast<T>(i) > std::numeric_limits<T>().max())
i = 0;
else
i++;
return static_cast<T>(i);
});
}
}
Derived& cast_to_derived() { return reinterpret_cast<Derived&>(*this); }
protected:
const int warp_size_;
CPUGrid grid_;
unsigned int warps_in_block_;
std::vector<uint64_t> active_masks_;
};
+118
파일 보기
@@ -0,0 +1,118 @@
/*
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "warp_shfl_common.hh"
#include <bitset>
/**
* @addtogroup shfl_xor shfl_xor
* @{
* @ingroup DeviceLanguageTest
* `T __shfl_xor(T var, int lane_mask, int width = warpSize)` -
* Contains unit test for warp shfl_xor function
*/
namespace cg = cooperative_groups;
template <typename T>
__global__ void shfl_xor(T* const out, const T* const in, const uint64_t* const active_masks,
const int lane_mask, const int width) {
if (deactivate_thread(active_masks)) {
return;
}
const auto grid = cg::this_grid();
T var = in[grid.thread_rank()];
out[grid.thread_rank()] = __shfl_xor(var, lane_mask, width);
}
template <typename T> class WarpShflXOR : public WarpShflTest<WarpShflXOR<T>, T> {
public:
void launch_kernel(T* const arr_dev, T* const input_dev, const uint64_t* const active_masks) {
width_ = generate_width(this->warp_size_);
INFO("Width: " << width_);
lane_mask_ = GENERATE_COPY(range(0, this->warp_size_));
INFO("Lane mask: " << lane_mask_);
shfl_xor<<<this->grid_.grid_dim_, this->grid_.block_dim_>>>(arr_dev, input_dev, active_masks,
lane_mask_, width_);
}
void validate(const T* const arr, const T* const input) {
ArrayAllOf(arr, this->grid_.thread_count_, [this, &input](unsigned int i) -> std::optional<T> {
const auto rank_in_block = this->grid_.thread_rank_in_block(i).value();
const auto rank_in_warp = rank_in_block % this->warp_size_;
const int warp_target = rank_in_warp ^ this->lane_mask_;
const int target_offset = warp_target - rank_in_warp;
const auto mask_idx = this->warps_in_block_ * (i / this->grid_.threads_in_block_count_) +
rank_in_block / this->warp_size_;
const std::bitset<sizeof(uint64_t) * 8> active_mask(this->active_masks_[mask_idx]);
const auto target_partition = warp_target / width_;
const auto partition_rank = rank_in_warp / width_;
if (!active_mask.test(rank_in_warp) ||
(target_partition <= partition_rank && !active_mask.test(rank_in_warp + target_offset)) ||
(target_partition <= partition_rank &&
rank_in_block + target_offset >= this->grid_.threads_in_block_count_)) {
return std::nullopt;
}
return target_partition > partition_rank ? input[i] : input[i + target_offset];
});
};
private:
int lane_mask_;
int width_;
};
/**
* Test Description
* ------------------------
* - Validates the warp shuffle xor behavior for all valid width sizes {2, 4, 8, 16, 32,
* 64(if supported)} for mask values of [0, warpSize). The threads are deactivated based on the
* passed active mask. The test is run for all overloads of shfl_xor.
* Test source
* ------------------------
* - unit/warp/warp_shfl_xor.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
* - Device supports warp shuffle
*/
TEMPLATE_TEST_CASE("Unit_Warp_Shfl_XOR_Positive_Basic", "", int, unsigned int, long, unsigned long,
long long, unsigned long long, float, double) {
int device;
hipDeviceProp_t device_properties;
HIP_CHECK(hipGetDevice(&device));
HIP_CHECK(hipGetDeviceProperties(&device_properties, device));
if (!device_properties.arch.hasWarpShuffle) {
HipTest::HIP_SKIP_TEST("Device doesn't support Warp Shuffle!");
return;
}
SECTION("Shfl Xor with specified active mask and input values") {
WarpShflXOR<TestType>().run(false);
}
SECTION("Shfl Xor with random active mask and input values") {
WarpShflXOR<TestType>().run(true);
}
}