diff --git a/projects/hip-tests/catch/hipTestMain/config/config_amd_linux b/projects/hip-tests/catch/hipTestMain/config/config_amd_linux index 6de829e0f4..0ac0b58e4d 100644 --- a/projects/hip-tests/catch/hipTestMain/config/config_amd_linux +++ b/projects/hip-tests/catch/hipTestMain/config/config_amd_linux @@ -54,6 +54,8 @@ "Unit_hipFuncSetAttribute_Positive_MaxDynamicSharedMemorySize_Not_Supported", "Unit_hipFuncSetAttribute_Positive_PreferredSharedMemoryCarveout_Not_Supported", "Unit_hipOccupancyMaxActiveBlocksPerMultiprocessor_Negative_Parameters", + "Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_Negative_Parameters", + "Unit_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_Negative_Parameters", "Unit_hipGraphMemcpyNodeSetParamsToSymbol_Positive_Basic", "Unit_hipGraphExecMemcpyNodeSetParamsToSymbol_Positive_Basic", "Unit_hipGraphExecMemcpyNodeSetParamsFromSymbol_Positive_Basic", @@ -183,6 +185,62 @@ "Unit_hipMemUnmap_negative", "=== SWDEV-432556,SWDEV-434211:Below test randomly failing in stress test ===", "Unit_hipDeviceGetUuid_From_RocmInfo", + "=== SWDEV-434171: Below tests took long time to complete in stress test on 17/11/23 ===", + "Unit_Warp_Shfl_Positive_Basic - int", + "Unit_Warp_Shfl_Positive_Basic - unsigned int", + "Unit_Warp_Shfl_Positive_Basic - long", + "Unit_Warp_Shfl_Positive_Basic - unsigned long", + "Unit_Warp_Shfl_Positive_Basic - long long", + "Unit_Warp_Shfl_Positive_Basic - unsigned long long", + "Unit_Warp_Shfl_Positive_Basic - float", + "Unit_Warp_Shfl_Positive_Basic - double", + "Unit_Warp_Shfl_XOR_Positive_Basic - int", + "Unit_Warp_Shfl_XOR_Positive_Basic - unsigned int", + "Unit_Warp_Shfl_XOR_Positive_Basic - long", + "Unit_Warp_Shfl_XOR_Positive_Basic - unsigned long", + "Unit_Warp_Shfl_XOR_Positive_Basic - long long", + "Unit_Warp_Shfl_XOR_Positive_Basic - unsigned long long", + "Unit_Warp_Shfl_XOR_Positive_Basic - float", + "Unit_Warp_Shfl_XOR_Positive_Basic - double", + "=== SWDEV-434878: Below tests failed in 
stress test on 24/11/23 ===", + "Unit_hipGraphUpload_Negative_Parameters", + "Unit_hipModuleOccupancyMaxPotentialBlockSize_Negative_Parameters", + "Unit_hipModuleOccupancyMaxPotentialBlockSize_Positive_RangeValidation", + "Unit_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_Positive_RangeValidation", + "Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_Negative_Parameters", + "Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_Positive_RangeValidation", + "Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_Positive_RangeValidation", + "=== SWDEV-435667: Below tests failing randomly in stress test on 01/12/23 ===", + "Unit_atomicExch_Positive_Same_Address_Compile_Time - int", + "Unit_atomicExch_Positive_Same_Address_Compile_Time - unsigned int", + "Unit_atomicExch_Positive_Same_Address_Compile_Time - unsigned long", + "Unit_atomicExch_Positive_Same_Address_Compile_Time - unsigned long long", + "Unit_atomicExch_Positive_Same_Address_Compile_Time - float", + "Unit_atomicExch_Positive_Same_Address_Compile_Time - double", + "Unit_atomicExch_Positive_Multi_Kernel - int", + "Unit_atomicExch_Positive_Multi_Kernel - unsigned int", + "Unit_atomicExch_Positive_Multi_Kernel - unsigned long", + "Unit_atomicExch_Positive_Multi_Kernel - unsigned long long", + "Unit_atomicExch_Positive_Multi_Kernel - float", + "Unit_atomicExch_Positive_Multi_Kernel - double", + "Unit_atomicExch_system_Positive_Peer_GPUs - int", + "Unit_atomicExch_system_Positive_Peer_GPUs - unsigned int", + "Unit_atomicExch_system_Positive_Peer_GPUs - unsigned long", + "Unit_atomicExch_system_Positive_Peer_GPUs - unsigned long long", + "Unit_atomicExch_system_Positive_Peer_GPUs - float", + "Unit_atomicExch_system_Positive_Peer_GPUs - double", + "Unit_atomicExch_system_Positive_Host_And_GPU - int", + "Unit_atomicExch_system_Positive_Host_And_GPU - unsigned int", + "Unit_atomicExch_system_Positive_Host_And_GPU - unsigned long", + "Unit_atomicExch_system_Positive_Host_And_GPU - unsigned 
long long", + "Unit_atomicExch_system_Positive_Host_And_GPU - float", + "Unit_atomicExch_system_Positive_Host_And_GPU - double", + "Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - int", + "Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - unsigned int", + "Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - unsigned long", + "Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - unsigned long long", + "Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - float", + "Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - double", #endif #if defined VEGA20 "=== SWDEV-419112 Below tests fail in stress test on 29/08/23 ===", diff --git a/projects/hip-tests/catch/hipTestMain/config/config_amd_windows b/projects/hip-tests/catch/hipTestMain/config/config_amd_windows index 0938722c5e..809325943b 100644 --- a/projects/hip-tests/catch/hipTestMain/config/config_amd_windows +++ b/projects/hip-tests/catch/hipTestMain/config/config_amd_windows @@ -119,6 +119,8 @@ "Unit_hipFuncSetAttribute_Positive_MaxDynamicSharedMemorySize_Not_Supported", "Unit_hipFuncSetAttribute_Positive_PreferredSharedMemoryCarveout_Not_Supported", "Unit_hipOccupancyMaxActiveBlocksPerMultiprocessor_Negative_Parameters", + "Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_Negative_Parameters", + "Unit_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_Negative_Parameters", "Unit_hipGraphMemcpyNodeSetParamsToSymbol_Positive_Basic", "Unit_hipGraphExecMemcpyNodeSetParamsToSymbol_Positive_Basic", "Unit_hipGraphMemcpyNodeSetParamsFromSymbol_Positive_Basic", @@ -282,6 +284,62 @@ "Unit_hipMemSetAccess_MultiProc", "Unit_hipMemSetAccess_negative", "Unit_hipMemUnmap_negative", + "=== SWDEV-434171: Below tests took long time to complete in stress test on 17/11/23 ===", + "Unit_Warp_Shfl_Positive_Basic - int", + "Unit_Warp_Shfl_Positive_Basic - unsigned int", + "Unit_Warp_Shfl_Positive_Basic - long", + "Unit_Warp_Shfl_Positive_Basic - unsigned long", + "Unit_Warp_Shfl_Positive_Basic - long long", + 
"Unit_Warp_Shfl_Positive_Basic - unsigned long long", + "Unit_Warp_Shfl_Positive_Basic - float", + "Unit_Warp_Shfl_Positive_Basic - double", + "Unit_Warp_Shfl_XOR_Positive_Basic - int", + "Unit_Warp_Shfl_XOR_Positive_Basic - unsigned int", + "Unit_Warp_Shfl_XOR_Positive_Basic - long", + "Unit_Warp_Shfl_XOR_Positive_Basic - unsigned long", + "Unit_Warp_Shfl_XOR_Positive_Basic - long long", + "Unit_Warp_Shfl_XOR_Positive_Basic - unsigned long long", + "Unit_Warp_Shfl_XOR_Positive_Basic - float", + "Unit_Warp_Shfl_XOR_Positive_Basic - double", + "=== SWDEV-434878: Below tests failed in stress test on 24/11/23 ===", + "Unit_hipGraphUpload_Negative_Parameters", + "Unit_hipModuleOccupancyMaxPotentialBlockSize_Negative_Parameters", + "Unit_hipModuleOccupancyMaxPotentialBlockSize_Positive_RangeValidation", + "Unit_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_Positive_RangeValidation", + "Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_Negative_Parameters", + "Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_Positive_RangeValidation", + "Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_Positive_RangeValidation", + "=== SWDEV-435667: Below tests failing randomly in stress test on 01/12/23 ===", + "Unit_atomicExch_Positive_Same_Address_Compile_Time - int", + "Unit_atomicExch_Positive_Same_Address_Compile_Time - unsigned int", + "Unit_atomicExch_Positive_Same_Address_Compile_Time - unsigned long", + "Unit_atomicExch_Positive_Same_Address_Compile_Time - unsigned long long", + "Unit_atomicExch_Positive_Same_Address_Compile_Time - float", + "Unit_atomicExch_Positive_Same_Address_Compile_Time - double", + "Unit_atomicExch_Positive_Multi_Kernel - int", + "Unit_atomicExch_Positive_Multi_Kernel - unsigned int", + "Unit_atomicExch_Positive_Multi_Kernel - unsigned long", + "Unit_atomicExch_Positive_Multi_Kernel - unsigned long long", + "Unit_atomicExch_Positive_Multi_Kernel - float", + "Unit_atomicExch_Positive_Multi_Kernel - double", + 
"Unit_atomicExch_system_Positive_Peer_GPUs - int", + "Unit_atomicExch_system_Positive_Peer_GPUs - unsigned int", + "Unit_atomicExch_system_Positive_Peer_GPUs - unsigned long", + "Unit_atomicExch_system_Positive_Peer_GPUs - unsigned long long", + "Unit_atomicExch_system_Positive_Peer_GPUs - float", + "Unit_atomicExch_system_Positive_Peer_GPUs - double", + "Unit_atomicExch_system_Positive_Host_And_GPU - int", + "Unit_atomicExch_system_Positive_Host_And_GPU - unsigned int", + "Unit_atomicExch_system_Positive_Host_And_GPU - unsigned long", + "Unit_atomicExch_system_Positive_Host_And_GPU - unsigned long long", + "Unit_atomicExch_system_Positive_Host_And_GPU - float", + "Unit_atomicExch_system_Positive_Host_And_GPU - double", + "Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - int", + "Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - unsigned int", + "Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - unsigned long", + "Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - unsigned long long", + "Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - float", + "Unit_atomicExch_system_Positive_Host_And_Peer_GPUs - double", #endif "End of json" ] diff --git a/projects/hip-tests/catch/hipTestMain/config/config_nvidia_linux.json b/projects/hip-tests/catch/hipTestMain/config/config_nvidia_linux.json index ae93da67d6..b0bf56894d 100644 --- a/projects/hip-tests/catch/hipTestMain/config/config_nvidia_linux.json +++ b/projects/hip-tests/catch/hipTestMain/config/config_nvidia_linux.json @@ -44,6 +44,14 @@ "Grid_Group_Getters_Via_Non_Member_Functions_Positive_Basic", "Grid_Group_Sync_Positive_Basic", "dynamic_loading_device_kernels_from_library", - "Unit_tiled_partition" + "Unit_tiled_partition", + "Unit_atomicExch_Positive_Same_Address_Compile_Time - int", + "Unit_atomicExch_Positive_Same_Address_Compile_Time - unsigned int", + "Unit_atomicExch_Positive_Same_Address_Compile_Time - unsigned long long", + "Unit_atomicExch_Positive_Same_Address_Compile_Time - float", + 
"Unit_atomicExch_system_Positive_Host_And_GPU - int", + "Unit_atomicExch_system_Positive_Host_And_GPU - unsigned int", + "Unit_atomicExch_system_Positive_Host_And_GPU - unsigned long long", + "Unit_atomicExch_system_Positive_Host_And_GPU - float" ] } diff --git a/projects/hip-tests/catch/hipTestMain/main.cc b/projects/hip-tests/catch/hipTestMain/main.cc index 030267f04c..74096c4af4 100644 --- a/projects/hip-tests/catch/hipTestMain/main.cc +++ b/projects/hip-tests/catch/hipTestMain/main.cc @@ -30,9 +30,9 @@ int main(int argc, char** argv) { | Opt(cmd_options.progress) ["-P"]["--progress"] ("Show progress bar when running performance tests") - | Opt(cmd_options.extended_run) - ["-E"]["--extended-run"] - ("TODO: Description goes here") + | Opt(cmd_options.cg_extended_run, "cg_extended_run") + ["-E"]["--cg-extended-run"] + ("Number of iterations used for cooperative groups sync tests (default: 5)") ; // clang-format on diff --git a/projects/hip-tests/catch/include/cmd_options.hh b/projects/hip-tests/catch/include/cmd_options.hh index 5dbd2f300c..99e565491b 100644 --- a/projects/hip-tests/catch/include/cmd_options.hh +++ b/projects/hip-tests/catch/include/cmd_options.hh @@ -23,11 +23,11 @@ THE SOFTWARE. 
#pragma once struct CmdOptions { - int iterations = 1000; + int iterations = 10; int warmups = 100; + int cg_extended_run = 5; bool no_display = false; bool progress = false; - bool extended_run = false; }; extern CmdOptions cmd_options; \ No newline at end of file diff --git a/projects/hip-tests/catch/include/cpu_grid.h b/projects/hip-tests/catch/include/cpu_grid.h index 98e5521840..6f0b7edbca 100644 --- a/projects/hip-tests/catch/include/cpu_grid.h +++ b/projects/hip-tests/catch/include/cpu_grid.h @@ -78,6 +78,7 @@ struct CPUGrid { unsigned int thread_count_; }; +/* Generate dimensions for 1D, 2D and 3D blocks of threads */ inline dim3 GenerateThreadDimensions() { hipDeviceProp_t props; HIP_CHECK(hipGetDeviceProperties(&props, 0)); @@ -99,6 +100,7 @@ inline dim3 GenerateThreadDimensions() { dim3(props.warpSize + 1, 3, 3)); } +/* Generate dimensions for 1D, 2D and 3D grids of blocks */ inline dim3 GenerateBlockDimensions() { hipDeviceProp_t props; HIP_CHECK(hipGetDeviceProperties(&props, 0)); @@ -116,6 +118,7 @@ inline dim3 GenerateBlockDimensions() { dim3(5, 5, 5)); } +/* Generate dimensions for 1D, 2D and 3D blocks of threads - reduced set */ inline dim3 GenerateThreadDimensionsForShuffle() { hipDeviceProp_t props; HIP_CHECK(hipGetDeviceProperties(&props, 0)); @@ -136,6 +139,7 @@ inline dim3 GenerateThreadDimensionsForShuffle() { dim3(props.warpSize + 1, 3, 3)); } +/* Generate dimensions for 1D, 2D and 3D grids of blocks - reduced set */ inline dim3 GenerateBlockDimensionsForShuffle() { hipDeviceProp_t props; HIP_CHECK(hipGetDeviceProperties(&props, 0)); diff --git a/projects/hip-tests/catch/include/hip_test_common.hh b/projects/hip-tests/catch/include/hip_test_common.hh index 9274bbf6db..147abe0941 100644 --- a/projects/hip-tests/catch/include/hip_test_common.hh +++ b/projects/hip-tests/catch/include/hip_test_common.hh @@ -102,6 +102,19 @@ THE SOFTWARE. } \ } +// Check that an expression, errorExpr, evaluates to the expected error_t, expectedError. 
+#define HIPRTC_CHECK_ERROR(errorExpr, expectedError) \ + { \ + auto localError = errorExpr; \ + INFO("Matching Errors: " \ + << "\n Expected Error: " << hiprtcGetErrorString(expectedError) \ + << "\n Expected Code: " << expectedError << '\n' \ + << " Actual Error: " << hiprtcGetErrorString(localError) \ + << "\n Actual Code: " << localError << "\nStr: " << #errorExpr \ + << "\n In File: " << __FILE__ << "\n At line: " << __LINE__); \ + REQUIRE(localError == expectedError); \ + } + // Although its assert, it will be evaluated at runtime #define HIP_ASSERT(x) \ { REQUIRE((x)); } diff --git a/projects/hip-tests/catch/include/hip_test_defgroups.hh b/projects/hip-tests/catch/include/hip_test_defgroups.hh index d16c5a0d4a..5294e11a5c 100644 --- a/projects/hip-tests/catch/include/hip_test_defgroups.hh +++ b/projects/hip-tests/catch/include/hip_test_defgroups.hh @@ -20,6 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + // Test groups are named based on the group names from hip_api_runtime.h, with adding "Test" suffix /** @@ -95,8 +97,46 @@ THE SOFTWARE. /** * @defgroup KernelTest Kernel Functions Management + * @{ + * This section describes the invocation of various kernel functions. + * @} + */ + +/** + * @defgroup AtomicsTest Device Atomics * @{ - * This section describes the various kernel functions invocation. + * This section describes tests for the Device Atomic APIs. + * @} + */ + +/** + * @addtogroup atomicExch atomicExch + * @{ + * @ingroup AtomicsTest + */ + +/** + * Test Description + * ------------------------ + * - Compiles atomicExch with invalid parameters. + * - Compiles the source with specialized Python tool. + * -# Utilizes sub-process to invoke compilation of faulty source. + * -# Performs post-processing of compiler output and counts errors. 
+ * Test source + * ------------------------ + * - unit/atomics/CMakeLists.txt + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEST_CASE("Unit_atomicExch_Negative_Parameters") {} +/** + * End doxygen group atomicExch. + * @} + */ + +/** + * End doxygen group AtomicsTest. * @} */ @@ -115,7 +155,14 @@ THE SOFTWARE. * @} */ - /** +/** + * @defgroup PerformanceTest Performance tests + * @{ + * This section describes performance tests for the target API groups and use-cases. + * @} + */ + +/** * @defgroup ShflTest warp shuffle function Management * @{ * This section describes the warp shuffle types & functions of HIP runtime API. diff --git a/projects/hip-tests/catch/include/performance_common.hh b/projects/hip-tests/catch/include/performance_common.hh index 21e87ceb2c..af6ba7609f 100644 --- a/projects/hip-tests/catch/include/performance_common.hh +++ b/projects/hip-tests/catch/include/performance_common.hh @@ -34,6 +34,7 @@ THE SOFTWARE. #include #pragma clang diagnostic ignored "-Wunused-but-set-variable" +#pragma clang diagnostic ignored "-Wunused-parameter" #pragma clang diagnostic ignored "-Wunused-function" #if defined(_WIN32) diff --git a/projects/hip-tests/catch/include/resource_guards.hh b/projects/hip-tests/catch/include/resource_guards.hh index 0f8697e121..3ec6aabf8d 100644 --- a/projects/hip-tests/catch/include/resource_guards.hh +++ b/projects/hip-tests/catch/include/resource_guards.hh @@ -29,10 +29,30 @@ enum class LinearAllocs { hipHostMalloc, hipMalloc, hipMallocManaged, + noAlloc }; +inline std::string to_string(const LinearAllocs allocation_type) { + switch (allocation_type) { + case LinearAllocs::malloc: + return "host pageable"; + case LinearAllocs::mallocAndRegister: + return "registered"; + case LinearAllocs::hipHostMalloc: + return "host pinned"; + case LinearAllocs::hipMalloc: + return "device malloc"; + case LinearAllocs::hipMallocManaged: + return "managed"; + default: + return "unknown alloc type"; + } +} + 
template class LinearAllocGuard { public: + LinearAllocGuard() = default; + LinearAllocGuard(const LinearAllocs allocation_type, const size_t size, const unsigned int flags = 0u) : allocation_type_{allocation_type} { @@ -55,15 +75,36 @@ template class LinearAllocGuard { case LinearAllocs::hipMallocManaged: HIP_CHECK(hipMallocManaged(reinterpret_cast(&ptr_), size, flags ? flags : 1u)); host_ptr_ = ptr_; + break; + case LinearAllocs::noAlloc: + break; } } LinearAllocGuard(const LinearAllocGuard&) = delete; - LinearAllocGuard(LinearAllocGuard&&) = delete; + + LinearAllocGuard(LinearAllocGuard&& o) + : allocation_type_{o.allocation_type_}, ptr_{o.ptr_}, host_ptr_{o.host_ptr_} { + o.allocation_type_ = LinearAllocs::noAlloc; + o.ptr_ = nullptr; + o.host_ptr_ = nullptr; + } + + LinearAllocGuard& operator=(LinearAllocGuard&& o) { + // Adopts o's allocation (o becomes noAlloc); any prior allocation of *this is not freed. + allocation_type_ = o.allocation_type_; + ptr_ = o.ptr_; + host_ptr_ = o.host_ptr_; + o.allocation_type_ = LinearAllocs::noAlloc; + o.ptr_ = o.host_ptr_ = nullptr; + return *this; + } ~LinearAllocGuard() { // No Catch macros, don't want to possibly throw in the destructor switch (allocation_type_) { + case LinearAllocs::noAlloc: + break; case LinearAllocs::malloc: free(ptr_); break; @@ -85,7 +126,7 @@ template class LinearAllocGuard { T* host_ptr() const { return host_ptr_; } private: - const LinearAllocs allocation_type_; + LinearAllocs allocation_type_ = LinearAllocs::noAlloc; T* ptr_ = nullptr; T* host_ptr_ = nullptr; }; @@ -200,7 +241,10 @@ enum class Streams { nullstream, perThread, created, withFlags, withPriority }; class StreamGuard { public: - StreamGuard(const Streams stream_type, unsigned int flags = hipStreamDefault, int priority = 0) : stream_type_{stream_type}, flags_{flags}, priority_{priority} { + StreamGuard() = default; + + StreamGuard(const Streams stream_type, unsigned int flags = hipStreamDefault, int priority = 0) + : stream_type_{stream_type}, flags_{flags}, priority_{priority} { switch (stream_type_) { case 
Streams::nullstream: stream_ = nullptr; @@ -219,7 +263,28 @@ class StreamGuard { } StreamGuard(const StreamGuard&) = delete; - StreamGuard(StreamGuard&&) = delete; + + StreamGuard(StreamGuard&& o) + : stream_type_{o.stream_type_}, flags_{o.flags_}, priority_{o.priority_}, stream_{o.stream_} { + o.stream_type_ = Streams::nullstream; + o.flags_ = 0u; + o.priority_ = 0; + o.stream_ = nullptr; + } + + StreamGuard& operator=(StreamGuard&& o) { + stream_type_ = o.stream_type_; + flags_ = o.flags_; + priority_ = o.priority_; + stream_ = o.stream_; + + o.stream_type_ = Streams::nullstream; + o.flags_ = 0u; + o.priority_ = 0; + o.stream_ = nullptr; + + return *this; + } ~StreamGuard() { if (stream_type_ == Streams::created) { @@ -230,23 +295,23 @@ class StreamGuard { hipStream_t stream() const { return stream_; } private: - const Streams stream_type_; - unsigned int flags_; - int priority_; - hipStream_t stream_; + Streams stream_type_ = Streams::nullstream; + unsigned int flags_ = 0u; + int priority_ = 0; + hipStream_t stream_ = nullptr; }; class EventsGuard { -public: + public: EventsGuard(size_t N) : events_(N) { - for (auto &e : events_) HIP_CHECK(hipEventCreate(&e)); + for (auto& e : events_) HIP_CHECK(hipEventCreate(&e)); } EventsGuard(const EventsGuard&) = delete; EventsGuard(EventsGuard&&) = delete; ~EventsGuard() { - for (auto &e : events_) static_cast(hipEventDestroy(e)); + for (auto& e : events_) static_cast(hipEventDestroy(e)); } hipEvent_t& operator[](int index) { return events_[index]; } @@ -255,21 +320,21 @@ public: std::vector& event_list() { return events_; } -private: + private: std::vector events_; }; class StreamsGuard { -public: + public: StreamsGuard(size_t N) : streams_(N) { - for (auto &s : streams_) HIP_CHECK(hipStreamCreate(&s)); + for (auto& s : streams_) HIP_CHECK(hipStreamCreate(&s)); } StreamsGuard(const StreamsGuard&) = delete; StreamsGuard(StreamsGuard&&) = delete; ~StreamsGuard() { - for (auto &s : streams_) static_cast(hipStreamDestroy(s)); 
+ for (auto& s : streams_) static_cast(hipStreamDestroy(s)); } hipStream_t& operator[](int index) { return streams_[index]; } @@ -278,6 +343,6 @@ public: std::vector& stream_list() { return streams_; } -private: + private: std::vector streams_; }; diff --git a/projects/hip-tests/catch/performance/CMakeLists.txt b/projects/hip-tests/catch/performance/CMakeLists.txt index e1e159c6e5..0c6962c596 100644 --- a/projects/hip-tests/catch/performance/CMakeLists.txt +++ b/projects/hip-tests/catch/performance/CMakeLists.txt @@ -18,5 +18,6 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. +add_subdirectory(stream) add_subdirectory(event) add_subdirectory(example) diff --git a/projects/hip-tests/catch/performance/stream/CMakeLists.txt b/projects/hip-tests/catch/performance/stream/CMakeLists.txt new file mode 100644 index 0000000000..f9504d63da --- /dev/null +++ b/projects/hip-tests/catch/performance/stream/CMakeLists.txt @@ -0,0 +1,63 @@ +# Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +if(HIP_PLATFORM MATCHES "amd") +set(TEST_SRC + hipStreamWaitEvent.cc + hipStreamGetFlags.cc + hipStreamGetPriority.cc + hipExtStreamCreateWithCUMask.cc + hipExtStreamGetCUMask.cc + hipStreamAddCallback.cc + hipStreamWaitValue.cc + hipStreamWriteValue.cc + hipMallocAsync.cc + hipFreeAsync.cc + hipMemPoolCreate.cc + hipMemPoolDestroy.cc + hipMemPoolTrimTo.cc + hipMemPoolSetAttribute.cc + hipMemPoolGetAttribute.cc + hipMemPoolSetAccess.cc + hipMallocFromPoolAsync.cc + hipMemPoolExportToShareableHandle.cc + hipMemPoolImportFromShareableHandle.cc + hipMemPoolExportPointer.cc + hipMemPoolImportPointer.cc + hipStreamBasic.cc +) +else() +set(TEST_SRC + hipStreamWaitEvent.cc + hipStreamGetFlags.cc + hipStreamGetPriority.cc + hipStreamAddCallback.cc + hipStreamWaitValue.cc + hipStreamWriteValue.cc + hipMallocAsync.cc + hipFreeAsync.cc + hipStreamBasic.cc +) +endif() + +hip_add_exe_to_target(NAME StreamPerformance + TEST_SRC ${TEST_SRC} + TEST_TARGET_NAME build_tests + COMPILE_OPTIONS -std=c++17) diff --git a/projects/hip-tests/catch/performance/stream/hipExtStreamCreateWithCUMask.cc b/projects/hip-tests/catch/performance/stream/hipExtStreamCreateWithCUMask.cc new file mode 100644 index 0000000000..f09ae8e976 --- /dev/null +++ b/projects/hip-tests/catch/performance/stream/hipExtStreamCreateWithCUMask.cc @@ -0,0 +1,65 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include + +/** + * @addtogroup stream stream + * @{ + * @ingroup PerformanceTest + * Contains performance tests for all stream management HIP APIs. + */ + +class ExtStreamCreateWithCUMaskBenchmark : public Benchmark { + public: + void operator()() { + hipDeviceProp_t props; + HIP_CHECK(hipGetDeviceProperties(&props, 0)); + std::vector cu_mask(props.multiProcessorCount, 0); + hipStream_t stream{}; + + TIMED_SECTION(kTimerTypeCpu) { + HIP_CHECK(hipExtStreamCreateWithCUMask(&stream, cu_mask.size(), cu_mask.data())); + } + + HIP_CHECK(hipStreamDestroy(stream)); + } +}; + +static void RunBenchmark() { + ExtStreamCreateWithCUMaskBenchmark benchmark; + benchmark.Run(); +} + +/** + * Test Description + * ------------------------ + * - Executes `hipExtStreamCreateWithCUMask`. 
+ * Test source + * ------------------------ + * - performance/stream/hipExtStreamCreateWithCUMask.cc + * Test requirements + * ------------------------ + * - Platform specific (AMD) + * - HIP_VERSION >= 5.2 + */ +TEST_CASE("Performance_hipExtStreamCreateWithCUMask") { + RunBenchmark(); +} diff --git a/projects/hip-tests/catch/performance/stream/hipExtStreamGetCUMask.cc b/projects/hip-tests/catch/performance/stream/hipExtStreamGetCUMask.cc new file mode 100644 index 0000000000..1dd06aaed6 --- /dev/null +++ b/projects/hip-tests/catch/performance/stream/hipExtStreamGetCUMask.cc @@ -0,0 +1,67 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include +#include + +/** + * @addtogroup stream stream + * @{ + * @ingroup PerformanceTest + */ + +class ExtStreamGetCUMaskBenchmark : public Benchmark { + public: + void operator()() { + hipDeviceProp_t props; + HIP_CHECK(hipGetDeviceProperties(&props, 0)); + std::vector cu_mask(props.multiProcessorCount, 0); + hipStream_t stream{}; + HIP_CHECK(hipExtStreamCreateWithCUMask(&stream, cu_mask.size(), cu_mask.data())); + std::vector new_cu_mask(cu_mask.size(), 0); + + TIMED_SECTION(kTimerTypeCpu) { + HIP_CHECK(hipExtStreamGetCUMask(stream, new_cu_mask.size(), new_cu_mask.data())); + } + + HIP_CHECK(hipStreamDestroy(stream)); + } +}; + +static void RunBenchmark() { + ExtStreamGetCUMaskBenchmark benchmark; + benchmark.Run(); +} + +/** + * Test Description + * ------------------------ + * - Executes `hipExtStreamGetCUMask`. + * - Creates basic mask and gets it into the new one. + * Test source + * ------------------------ + * - performance/stream/hipExtStreamGetCUMask.cc + * Test requirements + * ------------------------ + * - Platform specific (AMD) + * - HIP_VERSION >= 5.2 + */ +TEST_CASE("Performance_hipExtStreamGetCUMask") { + RunBenchmark(); +} diff --git a/projects/hip-tests/catch/performance/stream/hipFreeAsync.cc b/projects/hip-tests/catch/performance/stream/hipFreeAsync.cc new file mode 100644 index 0000000000..0e21ac4b2a --- /dev/null +++ b/projects/hip-tests/catch/performance/stream/hipFreeAsync.cc @@ -0,0 +1,69 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include +#include + +/** + * @addtogroup stream stream + * @{ + * @ingroup PerformanceTest + */ + +class FreeAsyncBenchmark : public Benchmark { + public: + void operator()(const size_t array_size) { + const StreamGuard stream_guard{Streams::created}; + const hipStream_t stream = stream_guard.stream(); + float* dev_ptr{nullptr}; + HIP_CHECK(hipMallocAsync(reinterpret_cast(&dev_ptr), array_size * sizeof(float), stream)); + + TIMED_SECTION_STREAM(kTimerTypeEvent, stream) { + HIP_CHECK(hipFreeAsync(dev_ptr, stream)); + } + + HIP_CHECK(hipStreamSynchronize(stream)); + } +}; + +static void RunBenchmark(const size_t array_size) { + FreeAsyncBenchmark benchmark; + benchmark.AddSectionName(std::to_string(array_size)); + benchmark.Run(array_size); +} + +/** + * Test Description + * ------------------------ + * - Executes `hipFreeAsync` with created stream: + * -# Array length in floats (allocation is `array_size * sizeof(float)` bytes): + * - 4_KB + * - 4_MB + * - 16_MB + * Test source + * ------------------------ + * - performance/stream/hipFreeAsync.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEST_CASE("Performance_hipFreeAsync") { + size_t array_size = GENERATE(4_KB, 4_MB, 16_MB); + RunBenchmark(array_size); +} diff --git a/projects/hip-tests/catch/performance/stream/hipMallocAsync.cc b/projects/hip-tests/catch/performance/stream/hipMallocAsync.cc new file mode 100644 index 0000000000..8bae9e0bfe --- /dev/null +++ b/projects/hip-tests/catch/performance/stream/hipMallocAsync.cc @@ -0,0 +1,68 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/
+
+#include  // NOTE(review): header name is missing in this copy (angle-bracket text appears stripped).
+#include  // NOTE(review): same here - restore both header names from the original patch.
+
+/**
+ * @addtogroup stream stream
+ * @{
+ * @ingroup PerformanceTest
+ */
+// Benchmark functor: wraps only the hipMallocAsync call in the timed section, then cleans up.
+class MallocAsyncBenchmark : public Benchmark {
+ public:
+  void operator()(const size_t array_size) {  // array_size: element count; bytes = array_size * sizeof(float)
+    const StreamGuard stream_guard{Streams::created};
+    const hipStream_t stream = stream_guard.stream();
+    float* dev_ptr{nullptr};
+    // NOTE(review): the reinterpret_cast below has no target type in this copy (presumably stripped too).
+    TIMED_SECTION_STREAM(kTimerTypeEvent, stream) {
+      HIP_CHECK(hipMallocAsync(reinterpret_cast(&dev_ptr), array_size * sizeof(float), stream));
+    }
+    HIP_CHECK(hipStreamSynchronize(stream));  // not timed: synchronize before releasing the memory
+    HIP_CHECK(hipFree(dev_ptr));  // not timed: released with hipFree rather than hipFreeAsync
+  }
+};
+// Runs one MallocAsyncBenchmark, using the decimal element count as the section name.
+static void RunBenchmark(const size_t array_size) {
+  MallocAsyncBenchmark benchmark;
+  benchmark.AddSectionName(std::to_string(array_size));
+  benchmark.Run(array_size);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMallocAsync` with created stream:
+ *    -# Allocation size:
+ *      - 4 KB
+ *      - 4 MB
+ *      - 16 MB
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipMallocAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMallocAsync") {
+  size_t array_size = GENERATE(4_KB, 4_MB, 16_MB);  // value is passed on as the float element count
+  RunBenchmark(array_size);
+}
diff --git a/projects/hip-tests/catch/performance/stream/hipMallocFromPoolAsync.cc b/projects/hip-tests/catch/performance/stream/hipMallocFromPoolAsync.cc
new file mode 100644
index 0000000000..a0de4bf908
--- /dev/null
+++ b/projects/hip-tests/catch/performance/stream/hipMallocFromPoolAsync.cc
@@ -0,0 +1,82 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/
+
+#include "mem_pools_performance_common.hh"
+
+/**
+ * @addtogroup stream stream
+ * @{
+ * @ingroup PerformanceTest
+ */
+// Benchmark functor: creates a pool, then wraps only hipMallocFromPoolAsync in the timed section.
+class MallocFromPoolAsyncBenchmark : public Benchmark {
+ public:
+  void operator()(const size_t array_size) {  // array_size: element count; bytes = array_size * sizeof(float)
+    const StreamGuard stream_guard{Streams::created};
+    const hipStream_t stream = stream_guard.stream();
+    // Setup (not timed): pool built from CreateMemPoolProps(0, hipMemHandleTypeNone).
+    hipMemPool_t mem_pool{nullptr};
+    hipMemPoolProps pool_props = CreateMemPoolProps(0, hipMemHandleTypeNone);
+    HIP_CHECK(hipMemPoolCreate(&mem_pool, &pool_props));
+
+    float* array_ptr{nullptr};
+
+    TIMED_SECTION_STREAM(kTimerTypeEvent, stream) {
+      HIP_CHECK(hipMallocFromPoolAsync(&array_ptr, array_size * sizeof(float), mem_pool, stream));
+    }
+
+    REQUIRE(array_ptr != nullptr);
+    // Teardown (not timed): free the allocation, synchronize the stream, then destroy the pool.
+    HIP_CHECK(hipFreeAsync(array_ptr, stream));
+    HIP_CHECK(hipStreamSynchronize(stream));
+    HIP_CHECK(hipMemPoolDestroy(mem_pool));
+  }
+};
+// Runs one MallocFromPoolAsyncBenchmark, using the decimal element count as the section name.
+static void RunBenchmark(const size_t array_size) {
+  MallocFromPoolAsyncBenchmark benchmark;
+  benchmark.AddSectionName(std::to_string(array_size));
+  benchmark.Run(array_size);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMallocFromPoolAsync`:
+ *    -# Allocation size:
+ *      - 4 KB
+ *      - 4 MB
+ *      - 16 MB
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipMallocFromPoolAsync.cc
+ * Test requirements
+ * ------------------------
+ *  - Device supports memory pools
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMallocFromPoolAsync") {
+  if (!AreMemPoolsSupported(0)) {  // skip the test when the helper reports no support for index 0
+    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
+                           "attribute. Hence skipping the testing with Pass result.\n");
+    return;
+  }
+  size_t array_size = GENERATE(4_KB, 4_MB, 16_MB);  // value is passed on as the float element count
+  RunBenchmark(array_size);
+}
diff --git a/projects/hip-tests/catch/performance/stream/hipMemPoolCreate.cc b/projects/hip-tests/catch/performance/stream/hipMemPoolCreate.cc
new file mode 100644
index 0000000000..3e616b1991
--- /dev/null
+++ b/projects/hip-tests/catch/performance/stream/hipMemPoolCreate.cc
@@ -0,0 +1,71 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "mem_pools_performance_common.hh"
+
+/**
+ * @addtogroup stream stream
+ * @{
+ * @ingroup PerformanceTest
+ */
+// Benchmark functor: wraps only hipMemPoolCreate in the timed section, then destroys the pool.
+class MemPoolCreateBenchmark : public Benchmark {
+ public:
+  void operator()() {
+    hipMemPool_t mem_pool{nullptr};
+    hipMemPoolProps pool_props = CreateMemPoolProps(0, hipMemHandleTypeNone);
+    // The properties are prepared above so that only the create call is inside the timed section.
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemPoolCreate(&mem_pool, &pool_props));
+    }
+    // Not timed: check that a handle was written, then release it.
+    REQUIRE(mem_pool != nullptr);
+    HIP_CHECK(hipMemPoolDestroy(mem_pool));
+  }
+};
+// Runs one MemPoolCreateBenchmark with no arguments and no extra section name.
+static void RunBenchmark() {
+  MemPoolCreateBenchmark benchmark;
+  benchmark.Run();
+}
+
+/**
+ * @warning **MemPool APIs are not fully implemented within current version
+ * of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
+ * Therefore, all tests related to MemPool APIs are implemented without formal
+ * verification and will be verified once HIP fully supports MemPool APIs.**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemPoolCreate`.
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipMemPoolCreate.cc
+ * Test requirements
+ * ------------------------
+ *  - Device supports memory pools
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemPoolCreate") {
+  if (!AreMemPoolsSupported(0)) {  // skip the test when the helper reports no support for index 0
+    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
+                           "attribute. Hence skipping the testing with Pass result.\n");
+    return;
+  }
+  RunBenchmark();
+}
diff --git a/projects/hip-tests/catch/performance/stream/hipMemPoolDestroy.cc b/projects/hip-tests/catch/performance/stream/hipMemPoolDestroy.cc
new file mode 100644
index 0000000000..0276ccc962
--- /dev/null
+++ b/projects/hip-tests/catch/performance/stream/hipMemPoolDestroy.cc
@@ -0,0 +1,70 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "mem_pools_performance_common.hh"
+
+/**
+ * @addtogroup stream stream
+ * @{
+ * @ingroup PerformanceTest
+ */
+// Benchmark functor: creates a pool as setup, then wraps only hipMemPoolDestroy in the timed section.
+class MemPoolDestroyBenchmark : public Benchmark {
+ public:
+  void operator()() {
+    hipMemPool_t mem_pool{nullptr};
+    hipMemPoolProps pool_props = CreateMemPoolProps(0, hipMemHandleTypeNone);
+    HIP_CHECK(hipMemPoolCreate(&mem_pool, &pool_props));
+    // No allocation is made from the pool in this functor before it is destroyed.
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemPoolDestroy(mem_pool));
+    }
+  }
+};
+// Runs one MemPoolDestroyBenchmark with no arguments and no extra section name.
+static void RunBenchmark() {
+  MemPoolDestroyBenchmark benchmark;
+  benchmark.Run();
+}
+
+/**
+ * @warning **MemPool APIs are not fully implemented within current version
+ * of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
+ * Therefore, all tests related to MemPool APIs are implemented without formal
+ * verification and will be verified once HIP fully supports MemPool APIs.**
+ * Test Description
+ * ------------------------
+ *  - Creates new mem pool.
+ *  - Executes `hipMemPoolDestroy`.
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipMemPoolDestroy.cc
+ * Test requirements
+ * ------------------------
+ *  - Device supports memory pools
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemPoolDestroy") {
+  if (!AreMemPoolsSupported(0)) {  // skip the test when the helper reports no support for index 0
+    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
+                           "attribute. Hence skipping the testing with Pass result.\n");
+    return;
+  }
+  RunBenchmark();
+}
diff --git a/projects/hip-tests/catch/performance/stream/hipMemPoolExportPointer.cc b/projects/hip-tests/catch/performance/stream/hipMemPoolExportPointer.cc
new file mode 100644
index 0000000000..10960706d0
--- /dev/null
+++ b/projects/hip-tests/catch/performance/stream/hipMemPoolExportPointer.cc
@@ -0,0 +1,84 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "mem_pools_performance_common.hh"
+
+/**
+ * @addtogroup stream stream
+ * @{
+ * @ingroup PerformanceTest
+ */
+// Benchmark functor: allocates from a fresh pool, then wraps only hipMemPoolExportPointer in the timed section.
+class MemPoolExportPointerBenchmark : public Benchmark {
+ public:
+  void operator()(const size_t array_size) {  // array_size: element count; bytes = array_size * sizeof(float)
+    float* device_ptr{nullptr};
+    hipMemPool_t mem_pool{nullptr};
+    hipMemPoolPtrExportData exp_data;
+    // Setup (not timed): pool with kHandleType, one allocation on the null stream (nullptr), then a sync.
+    hipMemPoolProps props = CreateMemPoolProps(0, kHandleType);
+    HIP_CHECK(hipMemPoolCreate(&mem_pool, &props));
+    HIP_CHECK(hipMallocFromPoolAsync(&device_ptr, array_size * sizeof(float), mem_pool, nullptr));
+    HIP_CHECK(hipStreamSynchronize(nullptr));
+
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemPoolExportPointer(&exp_data, device_ptr));
+    }
+    // Teardown (not timed): the free is issued on the null stream and the pool is destroyed right after.
+    HIP_CHECK(hipFreeAsync(device_ptr, nullptr));
+    HIP_CHECK(hipMemPoolDestroy(mem_pool));
+  }
+};
+// Runs one MemPoolExportPointerBenchmark, using the decimal element count as the section name.
+static void RunBenchmark(const size_t array_size) {
+  MemPoolExportPointerBenchmark benchmark;
+  benchmark.AddSectionName(std::to_string(array_size));
+  benchmark.Run(array_size);
+}
+
+/**
+ * @warning **MemPool APIs are not fully implemented within current version
+ * of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
+ * Therefore, all tests related to MemPool APIs are implemented without formal
+ * verification and will be verified once HIP fully supports MemPool APIs.**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemPoolExportPointer`:
+ *    -# Allocation size:
+ *      - 4 KB
+ *      - 4 MB
+ *      - 16 MB
+ *  - Uses the same process for import and export operations.
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipMemPoolExportPointer.cc
+ * Test requirements
+ * ------------------------
+ *  - Device supports memory pools
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemPoolExportPointer") {
+  if (!AreMemPoolsSupported(0)) {  // skip the test when the helper reports no support for index 0
+    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
+                           "attribute. Hence skipping the testing with Pass result.\n");
+    return;
+  }
+  size_t array_size = GENERATE(4_KB, 4_MB, 16_MB);  // value is passed on as the float element count
+  RunBenchmark(array_size);
+}
diff --git a/projects/hip-tests/catch/performance/stream/hipMemPoolExportToShareableHandle.cc b/projects/hip-tests/catch/performance/stream/hipMemPoolExportToShareableHandle.cc
new file mode 100644
index 0000000000..9e93751403
--- /dev/null
+++ b/projects/hip-tests/catch/performance/stream/hipMemPoolExportToShareableHandle.cc
@@ -0,0 +1,74 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "mem_pools_performance_common.hh"
+
+/**
+ * @addtogroup stream stream
+ * @{
+ * @ingroup PerformanceTest
+ */
+// Benchmark functor: creates a pool, then wraps only hipMemPoolExportToShareableHandle in the timed section.
+class MemPoolExportToShareableHandleBenchmark : public Benchmark {
+ public:
+  void operator()() {
+    hipMemPool_t mem_pool{nullptr};
+    int share_handle;  // output of the export call; not read afterwards in this functor
+
+    hipMemPoolProps props = CreateMemPoolProps(0, kHandleType);
+    HIP_CHECK(hipMemPoolCreate(&mem_pool, &props));
+    // The same kHandleType is used for the pool properties and for the export call.
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemPoolExportToShareableHandle(&share_handle, mem_pool, kHandleType, 0));
+    }
+
+    HIP_CHECK(hipMemPoolDestroy(mem_pool));
+  }
+};
+// Runs one MemPoolExportToShareableHandleBenchmark with no arguments and no extra section name.
+static void RunBenchmark() {
+  MemPoolExportToShareableHandleBenchmark benchmark;
+  benchmark.Run();
+}
+
+/**
+ * @warning **MemPool APIs are not fully implemented within current version
+ * of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
+ * Therefore, all tests related to MemPool APIs are implemented without formal
+ * verification and will be verified once HIP fully supports MemPool APIs.**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemPoolExportToShareableHandle`.
+ *  - Uses the same process for import and export operations.
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipMemPoolExportToShareableHandle.cc
+ * Test requirements
+ * ------------------------
+ *  - Device supports memory pools
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemPoolExportToShareableHandle") {
+  if (!AreMemPoolsSupported(0)) {  // skip the test when the helper reports no support for index 0
+    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
+                           "attribute. Hence skipping the testing with Pass result.\n");
+    return;
+  }
+  RunBenchmark();
+}
diff --git a/projects/hip-tests/catch/performance/stream/hipMemPoolGetAccess.cc b/projects/hip-tests/catch/performance/stream/hipMemPoolGetAccess.cc
new file mode 100644
index 0000000000..c30b45acb6
--- /dev/null
+++ b/projects/hip-tests/catch/performance/stream/hipMemPoolGetAccess.cc
@@ -0,0 +1,76 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "mem_pools_performance_common.hh"
+
+/**
+ * @addtogroup stream stream
+ * @{
+ * @ingroup PerformanceTest
+ */
+// Benchmark functor: creates a pool, then wraps only hipMemPoolGetAccess in the timed section.
+class MemPoolGetAccessBenchmark : public Benchmark {
+ public:
+  void operator()() {
+    hipMemPool_t mem_pool{nullptr};
+    hipMemPoolProps pool_props = CreateMemPoolProps(0, hipMemHandleTypeNone);
+    HIP_CHECK(hipMemPoolCreate(&mem_pool, &pool_props));
+    // Query arguments: `flags` receives the result; `location` is {hipMemLocationTypeDevice, 0}.
+    hipMemAccessFlags flags = hipMemAccessFlagsProtNone;
+    hipMemLocation location = {
+      hipMemLocationTypeDevice,
+      0
+    };
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemPoolGetAccess(&flags, mem_pool, location));
+    }
+    // Not timed: the returned flags are not inspected; the pool is simply released.
+    HIP_CHECK(hipMemPoolDestroy(mem_pool));
+  }
+};
+// Runs one MemPoolGetAccessBenchmark with no arguments and no extra section name.
+static void RunBenchmark() {
+  MemPoolGetAccessBenchmark benchmark;
+  benchmark.Run();
+}
+
+/**
+ * @warning **MemPool APIs are not fully implemented within current version
+ * of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
+ * Therefore, all tests related to MemPool APIs are implemented without formal
+ * verification and will be verified once HIP fully supports MemPool APIs.**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemPoolGetAccess`.
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipMemPoolGetAccess.cc
+ * Test requirements
+ * ------------------------
+ *  - Device supports memory pools
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemPoolGetAccess") {
+  if (!AreMemPoolsSupported(0)) {  // skip the test when the helper reports no support for index 0
+    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
+                           "attribute. Hence skipping the testing with Pass result.\n");
+    return;
+  }
+  RunBenchmark();
+}
diff --git a/projects/hip-tests/catch/performance/stream/hipMemPoolGetAttribute.cc b/projects/hip-tests/catch/performance/stream/hipMemPoolGetAttribute.cc
new file mode 100644
index 0000000000..d034fac57c
--- /dev/null
+++ b/projects/hip-tests/catch/performance/stream/hipMemPoolGetAttribute.cc
@@ -0,0 +1,83 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "mem_pools_performance_common.hh"
+
+/**
+ * @addtogroup stream stream
+ * @{
+ * @ingroup PerformanceTest
+ */
+// Benchmark functor: creates a pool, then wraps only hipMemPoolGetAttribute in the timed section.
+class MemPoolGetAttributeBenchmark : public Benchmark {
+ public:
+  void operator()(const hipMemPoolAttr attribute) {  // attribute: the pool attribute to query
+    hipMemPool_t mem_pool{nullptr};
+    hipMemPoolProps pool_props = CreateMemPoolProps(0, hipMemHandleTypeNone);
+    HIP_CHECK(hipMemPoolCreate(&mem_pool, &pool_props));
+    // One 64-bit buffer receives the value for every attribute this functor is called with.
+    uint64_t value{0};
+
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemPoolGetAttribute(mem_pool, attribute, &value));
+    }
+    // Not timed: the queried value is not inspected; the pool is simply released.
+    HIP_CHECK(hipMemPoolDestroy(mem_pool));
+  }
+};
+// Runs one MemPoolGetAttributeBenchmark; the section name comes from GetMemPoolAttrSectionName(attribute).
+static void RunBenchmark(const hipMemPoolAttr attribute) {
+  MemPoolGetAttributeBenchmark benchmark;
+  benchmark.AddSectionName(GetMemPoolAttrSectionName(attribute));
+  benchmark.Run(attribute);
+}
+
+/**
+ * @warning **MemPool APIs are not fully implemented within current version
+ * of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
+ * Therefore, all tests related to MemPool APIs are implemented without formal
+ * verification and will be verified once HIP fully supports MemPool APIs.**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemPoolGetAttribute`:
+ *    -# Supported attributes:
+ *      - `hipMemPoolAttrReleaseThreshold`
+ *      - `hipMemPoolReuseFollowEventDependencies`
+ *      - `hipMemPoolReuseAllowOpportunistic`
+ *      - `hipMemPoolReuseAllowInternalDependencies`
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipMemPoolGetAttribute.cc
+ * Test requirements
+ * ------------------------
+ *  - Device supports memory pools
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemPoolGetAttribute") {
+  if (!AreMemPoolsSupported(0)) {  // skip the test when the helper reports no support for index 0
+    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
+                           "attribute. Hence skipping the testing with Pass result.\n");
+    return;
+  }
+  hipMemPoolAttr attribute = GENERATE(hipMemPoolAttrReleaseThreshold,
+                                      hipMemPoolReuseFollowEventDependencies,
+                                      hipMemPoolReuseAllowOpportunistic,
+                                      hipMemPoolReuseAllowInternalDependencies);
+  RunBenchmark(attribute);
+}
diff --git a/projects/hip-tests/catch/performance/stream/hipMemPoolImportFromShareableHandle.cc b/projects/hip-tests/catch/performance/stream/hipMemPoolImportFromShareableHandle.cc
new file mode 100644
index 0000000000..fb37f6c951
--- /dev/null
+++ b/projects/hip-tests/catch/performance/stream/hipMemPoolImportFromShareableHandle.cc
@@ -0,0 +1,75 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "mem_pools_performance_common.hh"
+
+/**
+ * @addtogroup stream stream
+ * @{
+ * @ingroup PerformanceTest
+ */
+// Benchmark functor: creates and exports a pool, then wraps only hipMemPoolImportFromShareableHandle in the timed section.
+class MemPoolImportFromShareableHandleBenchmark : public Benchmark {
+ public:
+  void operator()() {
+    hipMemPool_t mem_pool{nullptr};
+    int share_handle;  // written by the export call below, then passed by address to the import call
+
+    hipMemPoolProps props = CreateMemPoolProps(0, kHandleType);
+    HIP_CHECK(hipMemPoolCreate(&mem_pool, &props));
+    HIP_CHECK(hipMemPoolExportToShareableHandle(&share_handle, mem_pool, kHandleType, 0));
+    // NOTE(review): the import writes into `mem_pool`, replacing the handle from hipMemPoolCreate.
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemPoolImportFromShareableHandle(&mem_pool, &share_handle, kHandleType, 0));
+    }
+    // NOTE(review): only the handle left in `mem_pool` is destroyed - confirm the created pool is not leaked.
+    HIP_CHECK(hipMemPoolDestroy(mem_pool));
+  }
+};
+// Runs one MemPoolImportFromShareableHandleBenchmark with no arguments and no extra section name.
+static void RunBenchmark() {
+  MemPoolImportFromShareableHandleBenchmark benchmark;
+  benchmark.Run();
+}
+
+/**
+ * @warning **MemPool APIs are not fully implemented within current version
+ * of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
+ * Therefore, all tests related to MemPool APIs are implemented without formal
+ * verification and will be verified once HIP fully supports MemPool APIs.**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemPoolImportFromShareableHandle`.
+ *  - Uses the same process for import and export operations.
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipMemPoolImportFromShareableHandle.cc
+ * Test requirements
+ * ------------------------
+ *  - Device supports memory pools
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemPoolImportFromShareableHandle") {
+  if (!AreMemPoolsSupported(0)) {  // skip the test when the helper reports no support for index 0
+    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
+                           "attribute. Hence skipping the testing with Pass result.\n");
+    return;
+  }
+  RunBenchmark();
+}
diff --git a/projects/hip-tests/catch/performance/stream/hipMemPoolImportPointer.cc b/projects/hip-tests/catch/performance/stream/hipMemPoolImportPointer.cc
new file mode 100644
index 0000000000..de439e7c93
--- /dev/null
+++ b/projects/hip-tests/catch/performance/stream/hipMemPoolImportPointer.cc
@@ -0,0 +1,87 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "mem_pools_performance_common.hh"
+
+/**
+ * @addtogroup stream stream
+ * @{
+ * @ingroup PerformanceTest
+ */
+// Benchmark functor: allocates from a pool, exports the pointer, then wraps only hipMemPoolImportPointer in the timed section.
+class MemPoolImportPointerBenchmark : public Benchmark {
+ public:
+  void operator()(const size_t array_size) {  // array_size: element count; bytes = array_size * sizeof(float)
+    float* device_ptr{nullptr};
+    float* device_ptr_import{nullptr};
+    hipMemPool_t mem_pool{nullptr};
+    hipMemPoolPtrExportData exp_data;
+    // Setup (not timed): pool with kHandleType, one null-stream allocation, sync, then export the pointer.
+    hipMemPoolProps props = CreateMemPoolProps(0, kHandleType);
+    HIP_CHECK(hipMemPoolCreate(&mem_pool, &props));
+    HIP_CHECK(hipMallocFromPoolAsync(&device_ptr, array_size * sizeof(float), mem_pool, nullptr));
+    HIP_CHECK(hipStreamSynchronize(nullptr));
+    HIP_CHECK(hipMemPoolExportPointer(&exp_data, device_ptr));
+    // Pass the ADDRESS of device_ptr_import: the API writes the imported pointer through this out-parameter.
+    TIMED_SECTION(kTimerTypeCpu) {
+      HIP_CHECK(hipMemPoolImportPointer(reinterpret_cast<void**>(&device_ptr_import), mem_pool, &exp_data));
+    }
+    // NOTE(review): confirm that freeing both the exported and the imported pointer is valid in one process.
+    HIP_CHECK(hipFree(device_ptr));
+    HIP_CHECK(hipFree(device_ptr_import));
+    HIP_CHECK(hipMemPoolDestroy(mem_pool));
+  }
+};
+// Runs one MemPoolImportPointerBenchmark, using the decimal element count as the section name.
+static void RunBenchmark(const size_t array_size) {
+  MemPoolImportPointerBenchmark benchmark;
+  benchmark.AddSectionName(std::to_string(array_size));
+  benchmark.Run(array_size);
+}
+
+/**
+ * @warning **MemPool APIs are not fully implemented within current version
+ * of HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms.
+ * Therefore, all tests related to MemPool APIs are implemented without formal
+ * verification and will be verified once HIP fully supports MemPool APIs.**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipMemPoolImportPointer`:
+ *    -# Allocation size:
+ *      - 4 KB
+ *      - 4 MB
+ *      - 16 MB
+ *  - Uses the same process for import and export operations.
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipMemPoolImportPointer.cc
+ * Test requirements
+ * ------------------------
+ *  - Device supports memory pools
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipMemPoolImportPointer") {
+  if (!AreMemPoolsSupported(0)) {  // skip the test when the helper reports no support for index 0
+    HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported "
+                           "attribute. Hence skipping the testing with Pass result.\n");
+    return;
+  }
+  size_t array_size = GENERATE(4_KB, 4_MB, 16_MB);  // value is passed on as the float element count
+  RunBenchmark(array_size);
+}
diff --git a/projects/hip-tests/catch/performance/stream/hipMemPoolSetAccess.cc b/projects/hip-tests/catch/performance/stream/hipMemPoolSetAccess.cc
new file mode 100644
index 0000000000..7eadcb4568
--- /dev/null
+++ b/projects/hip-tests/catch/performance/stream/hipMemPoolSetAccess.cc
@@ -0,0 +1,79 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/ + +#include "mem_pools_performance_common.hh" + +/** + * @addtogroup stream stream + * @{ + * @ingroup PerformanceTest + */ + +class MemPoolSetAccessBenchmark : public Benchmark { + public: + void operator()() { + hipMemPool_t mem_pool{nullptr}; + hipMemPoolProps pool_props = CreateMemPoolProps(0, hipMemHandleTypeNone); + HIP_CHECK(hipMemPoolCreate(&mem_pool, &pool_props)); + + hipMemAccessDesc desc_list = { + { + hipMemLocationTypeDevice, + 0 + }, + hipMemAccessFlagsProtReadWrite + }; + + TIMED_SECTION(kTimerTypeCpu) { + HIP_CHECK(hipMemPoolSetAccess(mem_pool, &desc_list, 1)); + } + + HIP_CHECK(hipMemPoolDestroy(mem_pool)); + } +}; + +static void RunBenchmark() { + MemPoolSetAccessBenchmark benchmark; + benchmark.Run(); +} + +/** + * @warning **MemPool APIs are not fully implemented within current version + * or HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms. + * Therefore, all tests related to MemPool APIs are implemented without formal + * verification and will be verified once HIP fully supports MemPool APIs.** + * Test Description + * ------------------------ + * - Executes `hipMemPoolSetAccess` with `hipMemAccessFlagsProtReadWrite`. + * Test source + * ------------------------ + * - performance/stream/hipMemPoolSetAccess.cc + * Test requirements + * ------------------------ + * - Device supports memory pools + * - HIP_VERSION >= 5.2 + */ +TEST_CASE("Performance_hipMemPoolSetAccess") { + if (!AreMemPoolsSupported(0)) { + HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported " + "attribute. 
Hence skipping the testing with Pass result.\n"); + return; + } + RunBenchmark(); +} diff --git a/projects/hip-tests/catch/performance/stream/hipMemPoolSetAttribute.cc b/projects/hip-tests/catch/performance/stream/hipMemPoolSetAttribute.cc new file mode 100644 index 0000000000..dbdc2ca152 --- /dev/null +++ b/projects/hip-tests/catch/performance/stream/hipMemPoolSetAttribute.cc @@ -0,0 +1,83 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include "mem_pools_performance_common.hh" + +/** + * @addtogroup stream stream + * @{ + * @ingroup PerformanceTest + */ + +class MemPoolSetAttributeBenchmark : public Benchmark { + public: + void operator()(const hipMemPoolAttr attribute) { + hipMemPool_t mem_pool{nullptr}; + hipMemPoolProps pool_props = CreateMemPoolProps(0, hipMemHandleTypeNone); + HIP_CHECK(hipMemPoolCreate(&mem_pool, &pool_props)); + + int value{0}; + + TIMED_SECTION(kTimerTypeCpu) { + HIP_CHECK(hipMemPoolSetAttribute(mem_pool, attribute, &value)); + } + + HIP_CHECK(hipMemPoolDestroy(mem_pool)); + } +}; + +static void RunBenchmark(const hipMemPoolAttr attribute) { + MemPoolSetAttributeBenchmark benchmark; + benchmark.AddSectionName(GetMemPoolAttrSectionName(attribute)); + benchmark.Run(attribute); +} + +/** + * @warning **MemPool APIs are not fully implemented within current version + * or HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms. + * Therefore, all tests related to MemPool APIs are implemented without formal + * verification and will be verified once HIP fully supports MemPool APIs.** + * Test Description + * ------------------------ + * - Executes `hipMemPoolSetAttribute`: + * -# Supported attributes: + * - `hipMemPoolAttrReleaseThreshold` + * - `hipMemPoolReuseFollowEventDependencies` + * - `hipMemPoolReuseAllowOpportunistic` + * - `hipMemPoolReuseAllowInternalDependencies` + * Test source + * ------------------------ + * - performance/stream/hipMemPoolSetAttribute.cc + * Test requirements + * ------------------------ + * - Device supports memory pools + * - HIP_VERSION >= 5.2 + */ +TEST_CASE("Performance_hipMemPoolSetAttribute") { + if (!AreMemPoolsSupported(0)) { + HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported " + "attribute. 
Hence skipping the testing with Pass result.\n"); + return; + } + hipMemPoolAttr attribute = GENERATE(hipMemPoolAttrReleaseThreshold, + hipMemPoolReuseFollowEventDependencies, + hipMemPoolReuseAllowOpportunistic, + hipMemPoolReuseAllowInternalDependencies); + RunBenchmark(attribute); +} diff --git a/projects/hip-tests/catch/performance/stream/hipMemPoolTrimTo.cc b/projects/hip-tests/catch/performance/stream/hipMemPoolTrimTo.cc new file mode 100644 index 0000000000..e3fc5b6efb --- /dev/null +++ b/projects/hip-tests/catch/performance/stream/hipMemPoolTrimTo.cc @@ -0,0 +1,77 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include "mem_pools_performance_common.hh" + +/** + * @addtogroup stream stream + * @{ + * @ingroup PerformanceTest + */ + +class MemPoolTrimToBenchmark : public Benchmark { + public: + void operator()(const size_t min_bytes_to_hold) { + hipMemPool_t mem_pool{nullptr}; + hipMemPoolProps pool_props = CreateMemPoolProps(0, hipMemHandleTypeNone); + HIP_CHECK(hipMemPoolCreate(&mem_pool, &pool_props)); + + TIMED_SECTION(kTimerTypeCpu) { + HIP_CHECK(hipMemPoolTrimTo(mem_pool, min_bytes_to_hold)); + } + + HIP_CHECK(hipMemPoolDestroy(mem_pool)); + } +}; + +static void RunBenchmark(const size_t min_bytes_to_hold) { + MemPoolTrimToBenchmark benchmark; + benchmark.AddSectionName(std::to_string(min_bytes_to_hold)); + benchmark.Run(min_bytes_to_hold); +} + +/** + * @warning **MemPool APIs are not fully implemented within current version + * or HIP and therefore they cannot be appropriately executed on AMD and NVIDIA platforms. + * Therefore, all tests related to MemPool APIs are implemented without formal + * verification and will be verified once HIP fully supports MemPool APIs.** + * Test Description + * ------------------------ + * - Executes `hipMemPoolTrimTo`: + * -# Minimum bytes to hold: + * - 4 KB + * - 4 MB + * - 16 MB + * Test source + * ------------------------ + * - performance/stream/hipMemPoolTrimTo.cc + * Test requirements + * ------------------------ + * - Device supports memory pools + * - HIP_VERSION >= 5.2 + */ +TEST_CASE("Performance_hipMemPoolTrimTo") { + if (!AreMemPoolsSupported(0)) { + HipTest::HIP_SKIP_TEST("GPU 0 doesn't support hipDeviceAttributeMemoryPoolsSupported " + "attribute. 
Hence skipping the testing with Pass result.\n"); + return; + } + size_t min_bytes_to_hold = GENERATE(4_KB, 4_MB, 16_MB); + RunBenchmark(min_bytes_to_hold); +} diff --git a/projects/hip-tests/catch/performance/stream/hipStreamAddCallback.cc b/projects/hip-tests/catch/performance/stream/hipStreamAddCallback.cc new file mode 100644 index 0000000000..771d0ee43e --- /dev/null +++ b/projects/hip-tests/catch/performance/stream/hipStreamAddCallback.cc @@ -0,0 +1,61 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include +#include + +/** + * @addtogroup stream stream + * @{ + * @ingroup PerformanceTest + */ + +void Callback(hipStream_t stream, hipError_t status, void* user_data) {} + +class StreamAddCallbackBenchmark : public Benchmark { + public: + void operator()() { + const StreamGuard stream_guard{Streams::created}; + const hipStream_t stream = stream_guard.stream(); + + TIMED_SECTION(kTimerTypeCpu) { + HIP_CHECK(hipStreamAddCallback(stream, Callback, nullptr, 0)); + } + } +}; + +static void RunBenchmark() { + StreamAddCallbackBenchmark benchmark; + benchmark.Run(); +} + +/** + * Test Description + * ------------------------ + * - Executes `hipStreamAddCallback` on the created stream. + * Test source + * ------------------------ + * - performance/stream/hipStreamAddCallback.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEST_CASE("Performance_hipStreamAddCallback") { + RunBenchmark(); +} diff --git a/projects/hip-tests/catch/performance/stream/hipStreamBasic.cc b/projects/hip-tests/catch/performance/stream/hipStreamBasic.cc new file mode 100644 index 0000000000..ffa95b00d7 --- /dev/null +++ b/projects/hip-tests/catch/performance/stream/hipStreamBasic.cc @@ -0,0 +1,269 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include
+#include
+#include
+
+/**
+ * @addtogroup stream stream
+ * @{
+ * @ingroup PerformanceTest
+ * Contains performance tests for all hipStream related APIs
+ */
+
+class HipDeviceGetStreamPriorityRangeBenchmark : public Benchmark<HipDeviceGetStreamPriorityRangeBenchmark> {
+ public:
+  void operator()() {  // times one hipDeviceGetStreamPriorityRange call
+    int priority_min{0}, priority_max{0};
+    TIMED_SECTION(kTimerTypeCpu) { HIP_CHECK(hipDeviceGetStreamPriorityRange(&priority_min, &priority_max)); }
+  }
+};
+
+class HipStreamQueryBenchmark : public Benchmark<HipStreamQueryBenchmark> {
+ public:
+  void operator()(bool perform_work) {  // perform_work: queue an async allocation on the stream before querying it
+    hipError_t error{hipSuccess};
+    hipStream_t stream{nullptr};
+    HIP_CHECK(hipStreamCreate(&stream));
+    void* dptr{nullptr};
+
+    if (perform_work) {
+      HIP_CHECK(hipMallocAsync(&dptr, 2048 * 4, stream));
+    }
+
+    TIMED_SECTION(kTimerTypeCpu) { error = hipStreamQuery(stream); }
+    REQUIRE((error == hipSuccess || error == hipErrorNotReady));  // a query may legitimately report pending work
+    if (perform_work) {
+      HIP_CHECK(hipFreeAsync(dptr, stream));
+      HIP_CHECK(hipStreamSynchronize(stream));
+    }
+
+    HIP_CHECK(hipStreamDestroy(stream));
+  }
+};
+
+class HipStreamSynchronizeBenchmark : public Benchmark<HipStreamSynchronizeBenchmark> {
+ public:
+  void operator()() {  // times hipStreamSynchronize on a created stream with no work queued by this function
+    hipError_t error{hipSuccess};
+    hipStream_t stream{nullptr};
+    HIP_CHECK(hipStreamCreate(&stream));
+
+    TIMED_SECTION(kTimerTypeCpu) { error = hipStreamSynchronize(stream); }
+    HIP_CHECK(error);  // result is checked outside the timed section
+    HIP_CHECK(hipStreamDestroy(stream));
+  }
+};
+
+class HipStreamDestroyBenchmark : public Benchmark<HipStreamDestroyBenchmark> {
+ public:
+  void operator()() {  // stream creation is untimed; only hipStreamDestroy is measured
+    hipStream_t stream{nullptr};
+    HIP_CHECK(hipStreamCreate(&stream));
+
+    TIMED_SECTION(kTimerTypeCpu) { HIP_CHECK(hipStreamDestroy(stream)); }
+  }
+};
+
+class HipStreamCreateBenchmark : public Benchmark<HipStreamCreateBenchmark> {
+ public:
+ void operator()() { + hipStream_t stream; + + TIMED_SECTION(kTimerTypeCpu) { HIP_CHECK(hipStreamCreate(&stream)); } + + HIP_CHECK(hipStreamDestroy(stream)); + } +}; + +class HipStreamCreateWithPriorityBenchmark : public Benchmark { + public: + void operator()(unsigned int flag) { + hipStream_t stream; + int priority_min, priority_max, priority_mid; + + HIP_CHECK(hipDeviceGetStreamPriorityRange(&priority_min, &priority_max)); + priority_mid = (priority_max + priority_min) / 2; + + TIMED_SECTION(kTimerTypeCpu) { HIP_CHECK(hipStreamCreateWithPriority(&stream, flag, priority_mid)); } + + HIP_CHECK(hipStreamDestroy(stream)); + } +}; + + + +static std::string GetStreamCreateFlagName(unsigned flag) { + switch (flag) { + case hipStreamDefault: + return "hipStreamDefault"; + case hipStreamNonBlocking: + return "hipStreamNonBlocking"; + default: + return "flag combination"; + } +} + +class HipStreamCreateWithFlagsBenchmark : public Benchmark { + public: + void operator()(unsigned int flag) { + hipStream_t stream; + + TIMED_SECTION(kTimerTypeCpu) { HIP_CHECK(hipStreamCreateWithFlags(&stream, flag)); } + + HIP_CHECK(hipStreamDestroy(stream)); + } +}; + +/** + * Test Description + * ------------------------ + * - Executes `hipStreamCreate`: + * Test source + * ------------------------ + * - performance/stream/hipStreamBasic.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEST_CASE("Performance_hipStreamCreate") { + HipStreamCreateBenchmark benchmark; + benchmark.Run(); +} + +static void RunBenchmark(unsigned flag) { + HipStreamCreateWithFlagsBenchmark benchmark; + benchmark.AddSectionName(GetStreamCreateFlagName(flag)); + benchmark.Run(flag); +} + +static void RunBenchmarkWithPriority(unsigned flag) { + HipStreamCreateWithPriorityBenchmark benchmark; + benchmark.AddSectionName(GetStreamCreateFlagName(flag)); + benchmark.Run(flag); +} + +/** + * Test Description + * ------------------------ + * - Executes `hipStreamCreateWithFlags` with all 
flags:
+ *    -# Flags
+ *      - hipStreamDefault
+ *      - hipStreamNonBlocking
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipStreamBasic.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipStreamCreateWithFlags") {
+  const auto creation_flag = GENERATE(hipStreamDefault, hipStreamNonBlocking);
+  RunBenchmark(creation_flag);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipStreamCreateWithPriority` with all flags:
+ *    -# Flags
+ *      - hipStreamDefault
+ *      - hipStreamNonBlocking
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipStreamBasic.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipStreamCreateWithPriority") {
+  const auto creation_flag = GENERATE(hipStreamDefault, hipStreamNonBlocking);
+  RunBenchmarkWithPriority(creation_flag);
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipStreamDestroy`:
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipStreamBasic.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipStreamDestroy") {
+  HipStreamDestroyBenchmark destroy_benchmark;
+  destroy_benchmark.Run();
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipDeviceGetStreamPriorityRange`:
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipStreamBasic.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipDeviceGetStreamPriorityRange") {
+  HipDeviceGetStreamPriorityRangeBenchmark range_benchmark;
+  range_benchmark.Run();
+}
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Executes `hipStreamQuery`:
+ * Test source
+ * ------------------------
+ *  - performance/stream/hipStreamBasic.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ */
+TEST_CASE("Performance_hipStreamQuery") {
+  const auto
perform_work = GENERATE(true, false); + HipStreamQueryBenchmark benchmark; + if(perform_work) { + benchmark.AddSectionName("stream with work"); + } else { + benchmark.AddSectionName("stream without work"); + } + benchmark.Run(perform_work); +} + +/** + * Test Description + * ------------------------ + * - Executes `hipStreamSynchronize`: + * Test source + * ------------------------ + * - performance/stream/hipStreamBasic.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEST_CASE("Performance_hipStreamSynchronize") { + HipStreamSynchronizeBenchmark benchmark; + benchmark.Run(); +} diff --git a/projects/hip-tests/catch/performance/stream/hipStreamGetFlags.cc b/projects/hip-tests/catch/performance/stream/hipStreamGetFlags.cc new file mode 100644 index 0000000000..ffead22f39 --- /dev/null +++ b/projects/hip-tests/catch/performance/stream/hipStreamGetFlags.cc @@ -0,0 +1,75 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include + +/** + * @addtogroup stream stream + * @{ + * @ingroup PerformanceTest + */ + +class StreamGetFlagsBenchmark : public Benchmark { + public: + void operator()(unsigned int expected_flag) { + unsigned int returned_flags{}; + hipStream_t stream; + + HIP_CHECK(hipStreamCreateWithFlags(&stream, expected_flag)); + TIMED_SECTION(kTimerTypeCpu) { + HIP_CHECK(hipStreamGetFlags(stream, &returned_flags)) + } + HIP_CHECK(hipStreamDestroy(stream)); + } +}; + +static void RunBenchmark(unsigned int expected_flag) { + StreamGetFlagsBenchmark benchmark; + switch (expected_flag) { + case hipStreamDefault: + benchmark.AddSectionName("hipStreamDefault"); + break; + case hipStreamNonBlocking: + benchmark.AddSectionName("hipStreamNonBlocking"); + break; + default: + benchmark.AddSectionName("unknown flag type"); + } + benchmark.Run(expected_flag); +} + +/** + * Test Description + * ------------------------ + * - Executes `hipStreamGetFlags`: + * -# Flags: + * - `hipStreamDefault` + * - `hipStreamNonBlocking` + * Test source + * ------------------------ + * - performance/stream/hipStreamGetFlags.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEST_CASE("Performance_hipStreamGetFlags") { + unsigned int expected_flag = GENERATE(hipStreamDefault, hipStreamNonBlocking); + RunBenchmark(expected_flag); +} diff --git a/projects/hip-tests/catch/performance/stream/hipStreamGetPriority.cc b/projects/hip-tests/catch/performance/stream/hipStreamGetPriority.cc new file mode 100644 index 0000000000..993471a251 --- /dev/null +++ b/projects/hip-tests/catch/performance/stream/hipStreamGetPriority.cc @@ -0,0 +1,74 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. 
All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include +#include + +/** + * @addtogroup stream stream + * @{ + * @ingroup PerformanceTest + */ + +class StreamGetPriorityBenchmark : public Benchmark { + public: + void operator()(Streams stream_type) { + const StreamGuard stream_guard{stream_type}; + const hipStream_t stream = stream_guard.stream(); + + int priority{}; + TIMED_SECTION(kTimerTypeCpu) { + HIP_CHECK(hipStreamGetPriority(stream, &priority)); + } + } +}; + +static void RunBenchmark(Streams stream_type) { + StreamGetPriorityBenchmark benchmark; + switch (stream_type) { + case Streams::nullstream: + benchmark.AddSectionName("null stream"); + break; + case Streams::created: + benchmark.AddSectionName("created"); + break; + default: + benchmark.AddSectionName("per thread stream"); + } + benchmark.Run(stream_type); +} + +/** + * Test Description + * ------------------------ + * - Executes `hipStreamGetPriority`: + * -# Stream types: + * - `null` + * - created + * Test source + * ------------------------ + * - performance/stream/hipStreamGetPriority.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEST_CASE("Performance_hipStreamGetPriority") { + Streams stream_type = GENERATE(Streams::nullstream, Streams::created); + RunBenchmark(stream_type); +} \ No newline at end of file diff --git a/projects/hip-tests/catch/performance/stream/hipStreamWaitEvent.cc b/projects/hip-tests/catch/performance/stream/hipStreamWaitEvent.cc new file mode 100644 index 0000000000..6810b5b619 --- /dev/null +++ b/projects/hip-tests/catch/performance/stream/hipStreamWaitEvent.cc @@ -0,0 +1,80 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include +#include + +/** + * @addtogroup stream stream + * @{ + * @ingroup PerformanceTest + */ + +class StreamWaitEventBenchmark : public Benchmark { + public: + void operator()(Streams stream_type) { + const StreamGuard stream_guard{stream_type}; + const hipStream_t stream = stream_guard.stream(); + hipEvent_t wait_event{nullptr}; + + HIP_CHECK(hipEventCreate(&wait_event)); + REQUIRE(wait_event != nullptr); + HIP_CHECK(hipEventRecord(wait_event, stream)); + + TIMED_SECTION(kTimerTypeCpu) { + HIP_CHECK(hipStreamWaitEvent(stream, wait_event, 0)); + HIP_CHECK(hipStreamSynchronize(stream)); + } + HIP_CHECK(hipEventDestroy(wait_event)); + } +}; + +static void RunBenchmark(Streams stream_type) { + StreamWaitEventBenchmark benchmark{}; + switch (stream_type) { + case Streams::nullstream: + benchmark.AddSectionName("null stream"); + break; + case Streams::created: + benchmark.AddSectionName("created"); + break; + default: + benchmark.AddSectionName("per thread stream"); + } + benchmark.Run(stream_type); +} + +/** + * Test Description + * ------------------------ + * - Executes `hipStreamWaitEvent`: + * -# Stream types: + * - `null` + * - created + * Test source + * ------------------------ + * - performance/stream/hipStreamWaitEvent.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEST_CASE("Performance_hipStreamWaitEvent") { + Streams stream_type = GENERATE(Streams::nullstream, Streams::created); + RunBenchmark(stream_type); +} diff --git a/projects/hip-tests/catch/performance/stream/hipStreamWaitValue.cc b/projects/hip-tests/catch/performance/stream/hipStreamWaitValue.cc new file mode 100644 index 0000000000..5d140d01fb --- /dev/null +++ b/projects/hip-tests/catch/performance/stream/hipStreamWaitValue.cc @@ -0,0 +1,172 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include +#include + +/** + * @addtogroup stream stream + * @{ + * @ingroup PerformanceTest + */ + +static int IsStreamWaitValueSupported(int device_id) { + int wait_value_supported = 0; +#if HT_AMD + HIP_CHECK(hipDeviceGetAttribute(&wait_value_supported, hipDeviceAttributeCanUseStreamWaitValue, + device_id)); +#else + cuDeviceGetAttribute(&wait_value_supported, CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS, + device_id); +#endif + return wait_value_supported; +} + +class StreamWaitValue32Benchmark : public Benchmark { + public: + void operator()(const size_t array_size, unsigned int flag) { + const StreamGuard stream_guard{Streams::created}; + const hipStream_t stream = stream_guard.stream(); + uint32_t* value_ptr; + uint32_t value{0}; + if (flag == hipStreamWaitValueAnd) { + value = 1; + } + HIP_CHECK(hipMalloc(&value_ptr, sizeof(uint32_t) * array_size)); + HIP_CHECK(hipMemset(value_ptr, value, sizeof(uint32_t) * array_size)); + + TIMED_SECTION(kTimerTypeCpu) { + HIP_CHECK(hipStreamWaitValue32(stream, value_ptr, value, flag)); + } + HIP_CHECK(hipFree(value_ptr)); + } +}; + +class StreamWaitValue64Benchmark : public Benchmark { + public: + void operator()(const size_t array_size, unsigned int flag) { + const StreamGuard stream_guard{Streams::created}; + const hipStream_t stream = stream_guard.stream(); + uint64_t* value_ptr; + uint64_t value{0}; + if (flag == hipStreamWaitValueAnd) { + value = 1; + } + HIP_CHECK(hipMalloc(&value_ptr, sizeof(uint64_t) * array_size)); + HIP_CHECK(hipMemset(value_ptr, value, sizeof(uint64_t) * array_size)); + + TIMED_SECTION(kTimerTypeCpu) { + HIP_CHECK(hipStreamWaitValue64(stream, value_ptr, value, flag)); + } + HIP_CHECK(hipFree(value_ptr)); + } +}; + +template +static void RunBenchmark(const size_t array_size, unsigned int flag) { + WaitValueBenchmark benchmark; + benchmark.AddSectionName(std::to_string(array_size)); + switch (flag) { + case hipStreamWaitValueGte: + benchmark.AddSectionName("greater than or equal"); + 
break; + case hipStreamWaitValueEq: + benchmark.AddSectionName("equal"); + break; + case hipStreamWaitValueAnd: + benchmark.AddSectionName("logical and"); + break; + case hipStreamWaitValueNor: + benchmark.AddSectionName("logical nor"); + break; + default: + benchmark.AddSectionName("unknown flag"); + } + benchmark.Run(array_size, flag); +} + +/** + * Test Description + * ------------------------ + * - Executes `hipStreamWaitValue32` for different array sizes: + * -# 4 KB + * -# 4 MB + * -# 16 MB + * - Uses different flag types for wait criteria: + * -# Greater than or equal + * -# Equal + * -# Logical AND + * -# Logical OR + * Test source + * ------------------------ + * - performance/stream/hipStreamWaitValue.cc + * Test requirements + * ------------------------ + * - Device supports Stream Wait Value operations + * - HIP_VERSION >= 5.2 + */ +TEST_CASE("Performance_hipStreamWaitValue32") { +#if HT_AMD + if (!IsStreamWaitValueSupported(0)) { + HipTest::HIP_SKIP_TEST( + "GPU 0 doesn't support hipStreamWaitValue32() function. " + "Hence skipping the testing with Pass result.\n"); + return; + } + + size_t array_size = GENERATE(4_KB, 4_MB, 16_MB); + unsigned int flag = GENERATE(hipStreamWaitValueGte, hipStreamWaitValueEq, hipStreamWaitValueAnd, + hipStreamWaitValueNor); + RunBenchmark(array_size, flag); +#endif +} + +/** + * Test Description + * ------------------------ + * - Executes `hipStreamWaitValue64`: + * -# Allocation size: + * - 4 KB + * - 4 MB + * - 16 MB + * -# Wait type: + * - Greater than or equal + * - Equal + * - Logical AND + * - Logical OR + * Test source + * ------------------------ + * - performance/stream/hipStreamWaitValue.cc + * Test requirements + * ------------------------ + * - Device supports Stream Wait Value operations + * - HIP_VERSION >= 5.2 + */ +TEST_CASE("Performance_hipStreamWaitValue64") { + if (!IsStreamWaitValueSupported(0)) { + HipTest::HIP_SKIP_TEST( + "GPU 0 doesn't support hipStreamWaitValue64() function. 
" + "Hence skipping the testing with Pass result.\n"); + return; + } + size_t array_size = GENERATE(4_KB, 4_MB, 16_MB); + unsigned int flag = GENERATE(hipStreamWaitValueGte, hipStreamWaitValueEq, hipStreamWaitValueAnd, + hipStreamWaitValueNor); + RunBenchmark(array_size, flag); +} diff --git a/projects/hip-tests/catch/performance/stream/hipStreamWriteValue.cc b/projects/hip-tests/catch/performance/stream/hipStreamWriteValue.cc new file mode 100644 index 0000000000..0fb1060be7 --- /dev/null +++ b/projects/hip-tests/catch/performance/stream/hipStreamWriteValue.cc @@ -0,0 +1,123 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include +#include + +/** + * @addtogroup stream stream + * @{ + * @ingroup PerformanceTest + */ + +#if HT_NVIDIA +static int IsStreamWriteValueSupported(int device_id) { + int write_value_supported = 0; + + cuDeviceGetAttribute(&write_value_supported, CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS, + device_id); + return write_value_supported; +} +#endif + +class StreamWriteValue32Benchmark : public Benchmark { + public: + void operator()(const size_t array_size) { + const StreamGuard stream_guard{Streams::created}; + const hipStream_t stream = stream_guard.stream(); + uint32_t* value_ptr; + uint32_t value{0}; + HIP_CHECK(hipMalloc(&value_ptr, sizeof(uint32_t) * array_size)); + HIP_CHECK(hipMemset(value_ptr, value, sizeof(uint32_t) * array_size)); + + TIMED_SECTION(kTimerTypeCpu) { HIP_CHECK(hipStreamWriteValue32(stream, value_ptr, value, 0)); } + HIP_CHECK(hipFree(value_ptr)); + } +}; + +class StreamWriteValue64Benchmark : public Benchmark { + public: + void operator()(const size_t array_size) { + const StreamGuard stream_guard{Streams::created}; + const hipStream_t stream = stream_guard.stream(); + uint64_t* value_ptr; + uint64_t value{0}; + HIP_CHECK(hipMalloc(&value_ptr, sizeof(uint64_t) * array_size)); + HIP_CHECK(hipMemset(value_ptr, value, sizeof(uint64_t) * array_size)); + + TIMED_SECTION(kTimerTypeCpu) { HIP_CHECK(hipStreamWriteValue64(stream, value_ptr, value, 0)); } + HIP_CHECK(hipFree(value_ptr)); + } +}; + +template static void RunBenchmark(const size_t array_size) { + WriteValueBenchmark benchmark; + benchmark.AddSectionName(std::to_string(array_size)); + benchmark.Run(array_size); +} + +/** + * Test Description + * ------------------------ + * - Executes `hipStreamWriteValue32`: + * -# Allocation size: + * - 4 KB + * - 4 MB + * - 16 MB + * Test source + * ------------------------ + * - performance/stream/hipStreamWriteValue.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ 
+TEST_CASE("Performance_hipStreamWriteValue32") { +#if HT_AMD + size_t array_size = GENERATE(4_KB, 4_MB, 16_MB); + RunBenchmark(array_size); +#endif +} + +/** + * Test Description + * ------------------------ + * - Executes `hipStreamWriteValue64`: + * -# Allocation size: + * - 4 KB + * - 4 MB + * - 16 MB + * Test source + * ------------------------ + * - performance/stream/hipStreamWriteValue.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEST_CASE("Performance_hipStreamWriteValue64") { +#if HT_NVIDIA + if (!IsStreamWriteValueSupported(0)) { + HipTest::HIP_SKIP_TEST( + "GPU 0 doesn't support hipStreamWriteValue64() function. " + "Hence skipping the testing with Pass result.\n"); + return; + } +#endif + size_t array_size = GENERATE(4_KB, 4_MB, 16_MB); + RunBenchmark(array_size); +} diff --git a/projects/hip-tests/catch/performance/stream/mem_pools_performance_common.hh b/projects/hip-tests/catch/performance/stream/mem_pools_performance_common.hh new file mode 100644 index 0000000000..b6f6f4b489 --- /dev/null +++ b/projects/hip-tests/catch/performance/stream/mem_pools_performance_common.hh @@ -0,0 +1,74 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +#include +#include + +#if __linux__ + static const hipMemAllocationHandleType kHandleType = hipMemHandleTypePosixFileDescriptor; +#else + static const hipMemAllocationHandleType kHandleType = hipMemHandleTypeWin32; +#endif + +static int AreMemPoolsSupported(int device_id) { /* returns the hipDeviceAttributeMemoryPoolsSupported attribute of device_id */ + int mem_pools_supported = 0; + HIP_CHECK(hipDeviceGetAttribute(&mem_pools_supported, + hipDeviceAttributeMemoryPoolsSupported, device_id)); /* query the requested device, not a fixed device 0 */ + return mem_pools_supported; +} + +static hipMemPoolProps CreateMemPoolProps(const int device_id, const hipMemAllocationHandleType handle_type) { + hipMemPoolProps kPoolProps = { + hipMemAllocationTypePinned, + handle_type, + { + hipMemLocationTypeDevice, + device_id + }, + nullptr, + {0} + }; + + return kPoolProps; +} + +static std::string GetMemPoolAttrSectionName(const hipMemPoolAttr attribute) { + switch (attribute) { + case hipMemPoolReuseFollowEventDependencies: + return "ReuseFollowEventDependencies"; + case hipMemPoolReuseAllowOpportunistic: + return "ReuseAllowOpportunistic"; + case hipMemPoolReuseAllowInternalDependencies: + return "ReuseAllowInternalDependencies"; + case hipMemPoolAttrReleaseThreshold: + return "AttrReleaseThreshold"; + case hipMemPoolAttrReservedMemCurrent: + return "AttrReservedMemCurrent"; + case hipMemPoolAttrReservedMemHigh: + return "AttrReservedMemHigh"; + case hipMemPoolAttrUsedMemCurrent: + return "AttrUsedMemCurrent"; + case hipMemPoolAttrUsedMemHigh: + return "AttrUsedMemHigh"; + default: + return "unknown attribute"; + } +} diff --git 
a/projects/hip-tests/catch/unit/CMakeLists.txt b/projects/hip-tests/catch/unit/CMakeLists.txt index 2b086edcc6..5543643f96 100644 --- a/projects/hip-tests/catch/unit/CMakeLists.txt +++ b/projects/hip-tests/catch/unit/CMakeLists.txt @@ -36,11 +36,14 @@ add_subdirectory(compiler) add_subdirectory(errorHandling) add_subdirectory(cooperativeGrps) add_subdirectory(context) +add_subdirectory(warp) add_subdirectory(dynamicLoading) add_subdirectory(g++) add_subdirectory(module) add_subdirectory(channelDescriptor) add_subdirectory(executionControl) +add_subdirectory(vector_types) +add_subdirectory(atomics) add_subdirectory(p2p) add_subdirectory(gcc) @@ -49,5 +52,5 @@ add_subdirectory(callback) add_subdirectory(clock) # Vulkan interop APIs currently undefined for Nvidia add_subdirectory(vulkan_interop) +add_subdirectory(gl_interop) # Disabled on NVIDIA due to defect - EXSWHTEC-246 endif() -add_subdirectory(vector_types) diff --git a/projects/hip-tests/catch/unit/atomics/CMakeLists.txt b/projects/hip-tests/catch/unit/atomics/CMakeLists.txt new file mode 100644 index 0000000000..d8066a2f1a --- /dev/null +++ b/projects/hip-tests/catch/unit/atomics/CMakeLists.txt @@ -0,0 +1,48 @@ +# Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +set(TEST_SRC + atomicExch.cc + atomicExch_system.cc +) + +if(HIP_PLATFORM MATCHES "nvidia") + set_source_files_properties(atomicExch_system.cc PROPERTIES COMPILE_FLAGS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80") + hip_add_exe_to_target(NAME AtomicsTest + TEST_SRC ${TEST_SRC} + TEST_TARGET_NAME build_tests + LINKER_LIBS "nvrtc -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80") +elseif(HIP_PLATFORM MATCHES "amd") + hip_add_exe_to_target(NAME AtomicsTest + TEST_SRC ${TEST_SRC} + TEST_TARGET_NAME build_tests + LINKER_LIBS hiprtc) +endif() + +# SWDEV-435667: Below 2 tests failed in stress test on 01/12/23 +#add_test(NAME Unit_atomicExch_Negative_Parameters +# COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../compileAndCaptureOutput.py +# ${CMAKE_CURRENT_SOURCE_DIR} ${HIP_PLATFORM} ${HIP_PATH} +# atomicExch_negative_kernels.cc 40) +# +#add_test(NAME Unit_atomicExch_system_Negative_Parameters +# COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/../compileAndCaptureOutput.py +# ${CMAKE_CURRENT_SOURCE_DIR} ${HIP_PLATFORM} ${HIP_PATH} +# atomicExch_system_negative_kernels.cc 40) diff --git a/projects/hip-tests/catch/unit/atomics/atomicExch.cc b/projects/hip-tests/catch/unit/atomics/atomicExch.cc new file mode 100644 index 0000000000..47e5cc6ba9 --- /dev/null +++ b/projects/hip-tests/catch/unit/atomics/atomicExch.cc @@ -0,0 +1,213 @@ +/* +Copyright (c) 2023 
Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "atomicExch_common.hh" +#include "atomicExch_negative_kernels_rtc.hh" + +/** + * @addtogroup atomicExch atomicExch + * @{ + * @ingroup AtomicsTest + */ + +/** + * Test Description + * ------------------------ + * - Executes a kernel wherein all threads will perform an atomic exchange in the same(compile + * time deducible) memory location. Each thread will exchange its own grid wide linear index + 1 + * into the memory location, storing the return value into a separate output array slot + * corresponding to it. Once complete, the union of output array and exchange memory is validated to + * contain all values in the range [0, number_of_threads]. 
+ * + * - The test is run for: + * - All overloads of atomicExch + * - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated exchange memory + * - Exchange memory located in shared memory + * - Several grid and block dimension combinations(only one block is used for shared memory) + * Test source + * ------------------------ + * - unit/atomics/atomicExch.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +#if HT_NVIDIA +TEMPLATE_TEST_CASE("Unit_atomicExch_Positive_Same_Address_Compile_Time", "", int, unsigned int, + unsigned long long, float) { +#else +TEMPLATE_TEST_CASE("Unit_atomicExch_Positive_Same_Address_Compile_Time", "", int, unsigned int, + unsigned long, unsigned long long, float, double) { +#endif // HT_NVIDIA + for (auto current = 0; current < cmd_options.iterations; ++current) { + AtomicExchSameAddressTest(); + } +} + +/** + * Test Description + * ------------------------ + * - Executes a single kernel on a single device wherein all threads will perform an atomic + * exchange into a runtime determined memory location. Each thread will exchange its own grid wide + * linear index + offset into the memory location, storing the return value into a separate output + * array slot corresponding to it. Once complete, the union of output array and exchange memory is + * validated to contain all values in the range [0, number_of_threads + + * number_of_exchange_memory_slots). Several memory access patterns are tested: + * -# All threads exchange to a single memory location + * -# Each thread exchanges into an array containing warp_size elements, using tid % warp_size + * for indexing + * -# Same as the above, but the exchange elements are spread out by L1 cache line size bytes. 
+ * + * - The test is run for: + * - All overloads of atomicExch + * - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated exchange memory + * - Exchange memory located in shared memory + * - Several grid and block dimension combinations(only one block is used for shared memory) + * Test source + * ------------------------ + * - unit/atomics/atomicExch.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +#if HT_NVIDIA +TEMPLATE_TEST_CASE("Unit_atomicExch_Positive", "", int, unsigned int, + unsigned long long, float) { +#else +TEMPLATE_TEST_CASE("Unit_atomicExch_Positive", "", int, unsigned int, unsigned long, + unsigned long long, float, double) { +#endif // HT_NVIDIA + int warp_size = 0; + HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0)); + const auto cache_line_size = 128u; + + for (auto current = 0; current < cmd_options.iterations; ++current) { + DYNAMIC_SECTION("Same address " << current) { + AtomicExchSingleDeviceSingleKernelTest(1, sizeof(TestType)); + } + + DYNAMIC_SECTION("Adjacent addresses " << current) { + AtomicExchSingleDeviceSingleKernelTest(warp_size, + sizeof(TestType)); + } + + DYNAMIC_SECTION("Scattered addresses " << current) { + AtomicExchSingleDeviceSingleKernelTest(warp_size, + cache_line_size); + } + } +} + +/** + * Test Description + * ------------------------ + * - Executes a kernel two times concurrently on a single device wherein all threads will perform + * an atomic exchange into a runtime determined memory location. Each thread will exchange its own + * grid wide linear index + offset into the memory location, storing the return value into a + * separate output array slot corresponding to it. Once complete, the union of output array and + * exchange memory is validated to contain all values in the range [0, number_of_threads + + * number_of_exchange_memory_slots). 
Several memory access patterns are tested: + * -# All threads exchange to a single memory location + * -# Each thread exchanges into an array containing warp_size elements, using tid % warp_size + * for indexing + * -# Same as the above, but the exchange elements are spread out by L1 cache line size bytes. + * + * - The test is run for: + * - All overloads of atomicExch + * - hipMalloc, hipMallocManaged, hipHostMalloc and hipHostRegister allocated exchange memory + * - Several grid and block dimension combinations + * Test source + * ------------------------ + * - unit/atomics/atomicExch.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +#if HT_NVIDIA +TEMPLATE_TEST_CASE("Unit_atomicExch_Positive_Multi_Kernel", "", int, unsigned int, + unsigned long long, float) { +#else +TEMPLATE_TEST_CASE("Unit_atomicExch_Positive_Multi_Kernel", "", int, unsigned int, unsigned long, + unsigned long long, float, double) { +#endif // HT_NVIDIA + int warp_size = 0; + HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0)); + const auto cache_line_size = 128u; + + for (auto current = 0; current < cmd_options.iterations; ++current) { + DYNAMIC_SECTION("Same address " << current) { + AtomicExchSingleDeviceMultipleKernelTest(2, 1, + sizeof(TestType)); + } + + DYNAMIC_SECTION("Adjacent addresses " << current) { + AtomicExchSingleDeviceMultipleKernelTest(2, warp_size, + sizeof(TestType)); + } + + DYNAMIC_SECTION("Scattered addresses " << current) { + AtomicExchSingleDeviceMultipleKernelTest(2, warp_size, + cache_line_size); + } + } +} + +/** + * Test Description + * ------------------------ + * - RTCs kernels that pass combinations of arguments of invalid types for all overloads of + * atomicExch + * Test source + * ------------------------ + * - unit/atomics/atomicExch.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEST_CASE("Unit_atomicExch_Negative_Parameters_RTC") { + hiprtcProgram program{}; + 
+ const auto program_source = GENERATE(kAtomicExchInt, kAtomicExchUnsignedInt, kAtomicExchULL, + kAtomicExchFloat, kAtomicExchDouble); + HIPRTC_CHECK( + hiprtcCreateProgram(&program, program_source, "atomicExch_negative.cc", 0, nullptr, nullptr)); + hiprtcResult result{hiprtcCompileProgram(program, 0, nullptr)}; + + // Get the compile log and count compiler error messages + size_t log_size{}; + HIPRTC_CHECK(hiprtcGetProgramLogSize(program, &log_size)); + std::string log(log_size, ' '); + HIPRTC_CHECK(hiprtcGetProgramLog(program, log.data())); + int error_count{0}; + + int expected_error_count{8}; + std::string error_message{"error:"}; + + size_t n_pos = log.find(error_message, 0); + while (n_pos != std::string::npos) { + ++error_count; + n_pos = log.find(error_message, n_pos + 1); + } + + HIPRTC_CHECK(hiprtcDestroyProgram(&program)); + HIPRTC_CHECK_ERROR(result, HIPRTC_ERROR_COMPILATION); + REQUIRE(error_count == expected_error_count); +} diff --git a/projects/hip-tests/catch/unit/atomics/atomicExch_common.hh b/projects/hip-tests/catch/unit/atomics/atomicExch_common.hh new file mode 100644 index 0000000000..1b4add5253 --- /dev/null +++ b/projects/hip-tests/catch/unit/atomics/atomicExch_common.hh @@ -0,0 +1,381 @@ +/* +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +#include + +#include +#include +#include +#include + +enum class AtomicScopes { device, system }; + +template __device__ T perform_atomic_exch(T* address, T val) { /* selects atomicExch or atomicExch_system from the scope template parameter */ + if constexpr (scope == AtomicScopes::device) { + return atomicExch(address, val); + } else if constexpr (scope == AtomicScopes::system) { + return atomicExch_system(address, val); + } +} + +template +__global__ void atomic_exch_kernel_compile_time(T* const global_mem, T* const old_vals) { + __shared__ T shared_mem; + + const auto tid = cooperative_groups::this_grid().thread_rank(); + + T* const mem = use_shared_mem ? &shared_mem : global_mem; + + if constexpr (use_shared_mem) { + if (tid == 0) mem[0] = global_mem[0]; + __syncthreads(); + } + + old_vals[tid] = perform_atomic_exch(mem, static_cast(tid + 1)); + + if constexpr (use_shared_mem) { + __syncthreads(); + if (tid == 0) global_mem[0] = mem[0]; + } +} + +template +__host__ __device__ T* pitched_offset(T* const ptr, const unsigned int pitch, + const unsigned int idx) { + const auto byte_ptr = reinterpret_cast(ptr); + return reinterpret_cast(byte_ptr + idx * pitch); +} + +template +__global__ void atomic_exch_kernel(T* const global_mem, T* const old_vals, const unsigned int width, + const unsigned pitch, const T base_val = 0) { + extern __shared__ uint8_t shared_mem[]; + + const auto tid = cooperative_groups::this_grid().thread_rank(); + + T* const mem = use_shared_mem ? 
reinterpret_cast(shared_mem) : global_mem; + + if constexpr (use_shared_mem) { + if (tid < width) { + const auto target = pitched_offset(mem, pitch, tid); + *target = *pitched_offset(global_mem, pitch, tid); + }; + __syncthreads(); + } + + old_vals[tid] = perform_atomic_exch(pitched_offset(mem, pitch, tid % width), + base_val + static_cast(tid + width)); + + if constexpr (use_shared_mem) { + __syncthreads(); + if (tid < width) { + const auto target = pitched_offset(global_mem, pitch, tid); + *target = *pitched_offset(mem, pitch, tid); + }; + } +} + + +template +void AtomicExchSameAddress(const dim3 blocks, const dim3 threads, const LinearAllocs alloc_type) { + LinearAllocGuard mem_dev(alloc_type, sizeof(TestType)); + + const auto thread_count = blocks.x * blocks.y * blocks.z * threads.x * threads.y * threads.z; + const auto old_vals_alloc_size = thread_count * sizeof(TestType); + LinearAllocGuard old_vals_dev(LinearAllocs::hipMalloc, old_vals_alloc_size); + std::vector old_vals(thread_count + 1); + + + HIP_CHECK(hipMemset(mem_dev.ptr(), 0, sizeof(TestType))); + atomic_exch_kernel_compile_time + <<>>(mem_dev.ptr(), old_vals_dev.ptr()); + HIP_CHECK( + hipMemcpy(old_vals.data(), old_vals_dev.ptr(), old_vals_alloc_size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(old_vals.data() + thread_count, mem_dev.ptr(), sizeof(TestType), + hipMemcpyDeviceToHost)); + HIP_CHECK(hipDeviceSynchronize()); + + // Every thread will exchange its grid-wide linear id into a target location within mem_dev, + // receiving back the value previously present therein. This previous value is written to + // old_vals_dev. + // old_vals_dev will not contain values that the final scheduled warp exchanged into mem_dev, but + // mem_dev obviously will. 
+ // Given that mem_dev initially contains values in the range [0, width) and that the maximum value + // the final thread shall write is thread_count + width - 1, presuming correct operation of + // atomicExch, the union of mem_dev and old_vals_dev shall contain values in the range + //[0, thread_count + width) + std::sort(old_vals.begin(), old_vals.end()); + for (auto i = 0u; i < old_vals.size(); ++i) { + REQUIRE(i == old_vals[i]); + } +} + +template void AtomicExchSameAddressTest() { + const auto threads = GENERATE(dim3(1024), dim3(1023), dim3(511), dim3(17), dim3(31)); + + SECTION("Global memory") { + const auto blocks = GENERATE(dim3(20)); + using LA = LinearAllocs; + const auto allocation_type = + GENERATE(LA::hipMalloc, LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister); + AtomicExchSameAddress(blocks, threads, allocation_type); + } + + SECTION("Shared memory") { + const auto blocks = dim3(1); + AtomicExchSameAddress(blocks, threads, + LinearAllocs::hipMalloc); + } +} + +struct AtomicExchParams { + dim3 blocks; + dim3 threads; + unsigned int num_devices = 1u; + unsigned int kernel_count = 1u; + unsigned int width = 1u; + unsigned int pitch = 0u; + unsigned int host_thread_count = 0u; + LinearAllocs alloc_type; +}; + + +template +class AtomicExchCRTP { + public: + void run(const AtomicExchParams& p) const { + const auto thread_count = + p.blocks.x * p.blocks.y * p.blocks.z * p.threads.x * p.threads.y * p.threads.z; + + const auto old_vals_alloc_size = p.kernel_count * thread_count * sizeof(T); + std::vector> old_vals_devs; + std::vector streams; + for (auto i = 0; i < p.num_devices; ++i) { + HIP_CHECK(hipSetDevice(i)); + old_vals_devs.emplace_back(LinearAllocs::hipMalloc, old_vals_alloc_size); + for (auto j = 0; j < p.kernel_count; ++j) { + streams.emplace_back(Streams::created); + } + } + + const auto mem_alloc_size = p.width * p.pitch; + LinearAllocGuard mem_dev(p.alloc_type, mem_alloc_size); + + const auto host_iters_per_thread = + 
std::max(p.num_devices * p.kernel_count * thread_count / 20, p.width); + + std::vector old_vals(p.num_devices * p.kernel_count * thread_count + p.width + + p.host_thread_count * host_iters_per_thread); + std::iota(old_vals.begin(), old_vals.begin() + p.width, 0); + + HIP_CHECK(hipMemcpy2D(mem_dev.ptr(), p.pitch, old_vals.data(), sizeof(T), sizeof(T), p.width, + hipMemcpyHostToDevice)); + + const auto shared_mem_size = use_shared_mem ? mem_alloc_size : 0u; + for (auto i = 0u; i < p.num_devices; ++i) { + const auto device_offset = i * p.kernel_count * thread_count; + for (auto j = 0u; j < p.kernel_count; ++j) { + const auto& stream = streams[i * p.kernel_count + j].stream(); + const auto kern_offset = j * thread_count; + const auto old_vals = old_vals_devs[i].ptr() + kern_offset; + CastToDerived().LaunchKernel(shared_mem_size, stream, mem_dev.ptr(), old_vals, + device_offset + kern_offset, p); + } + } + + PerformHostAtomicExchange(p.host_thread_count, host_iters_per_thread, mem_dev.host_ptr(), + old_vals.data(), p); + + for (auto i = 0u; i < p.num_devices; ++i) { + const auto device_offset = i * p.kernel_count * thread_count; + HIP_CHECK(hipMemcpy(old_vals.data() + device_offset, old_vals_devs[i].ptr(), + old_vals_alloc_size, hipMemcpyDeviceToHost)); + } + HIP_CHECK(hipMemcpy2D(old_vals.data() + p.num_devices * p.kernel_count * thread_count, + sizeof(T), mem_dev.ptr(), p.pitch, sizeof(T), p.width, + hipMemcpyDeviceToHost)); + + CastToDerived().ValidateResults(old_vals); + } + + private: + const Derived& CastToDerived() const { return static_cast(*this); } + + static void HostAtomicExchange(const unsigned int iterations, T* mem, T* const old_vals, + const unsigned int width, const unsigned pitch, T base_val) { + for (auto i = 0u; i < iterations; ++i) { + T new_val = base_val + static_cast(i); + T old_val; + __atomic_exchange(pitched_offset(mem, pitch, i % width), &new_val, &old_val, + __ATOMIC_RELAXED); + old_vals[i] = old_val; + } + } + + void 
PerformHostAtomicExchange(const unsigned int thread_count, const unsigned int iterations, + T* mem, T* const old_vals, const AtomicExchParams& p) const { + if (thread_count == 0) { + return; + } + const auto dev_threads = + p.blocks.x * p.blocks.y * p.blocks.z * p.threads.x * p.threads.y * p.threads.z; + const auto host_base_val = p.num_devices * p.kernel_count * dev_threads + p.width; + + std::vector threads; + for (auto i = 0u; i < thread_count; ++i) { + const auto thread_base_val = host_base_val + i * iterations; + threads.push_back(std::thread(HostAtomicExchange, iterations, mem, old_vals + thread_base_val, + p.width, p.pitch, thread_base_val)); + } + + for (auto& th : threads) { + th.join(); + } + } +}; + +template +class AtomicExch + : public AtomicExchCRTP, T, use_shared_mem, scope> { + public: + void LaunchKernel(const unsigned int shared_mem_size, const hipStream_t stream, T* const mem, + T* const old_vals, const T base_val, const AtomicExchParams& p) const { + atomic_exch_kernel<<>>( + mem, old_vals, p.width, p.pitch, base_val); + } + + void ValidateResults(std::vector& old_vals) const { + std::sort(old_vals.begin(), old_vals.end()); + for (auto i = 0u; i < old_vals.size(); ++i) { + REQUIRE(i == old_vals[i]); + } + } +}; + +inline dim3 GenerateAtomicExchThreadDimensions() { return GENERATE(dim3(16), dim3(1024)); } + +inline dim3 GenerateAtomicExchBlockDimensions() { + int sm_count = 0; + HIP_CHECK(hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, 0)); + return GENERATE_COPY(dim3(sm_count), dim3(sm_count + sm_count / 2)); +} + +template +void AtomicExchSingleDeviceSingleKernelTest(const unsigned int width, const unsigned int pitch) { + AtomicExchParams params; + params.num_devices = 1; + params.kernel_count = 1; + params.threads = GenerateAtomicExchThreadDimensions(); + params.width = width; + params.pitch = pitch; + + SECTION("Global memory") { + params.blocks = GenerateAtomicExchBlockDimensions(); + using LA = LinearAllocs; + for 
(const auto alloc_type : + {LA::hipMalloc, LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister}) { + params.alloc_type = alloc_type; + DYNAMIC_SECTION("Allocation type: " << to_string(alloc_type)) { + AtomicExch().run(params); + } + } + } + + SECTION("Shared memory") { + params.blocks = dim3(1); + params.alloc_type = LinearAllocs::hipMalloc; + AtomicExch().run(params); + } +} + +template +void AtomicExchSingleDeviceMultipleKernelTest(const unsigned int kernel_count, + const unsigned int width, const unsigned int pitch) { + int concurrent_kernels = 0; + HIP_CHECK(hipDeviceGetAttribute(&concurrent_kernels, hipDeviceAttributeConcurrentKernels, 0)); + if (!concurrent_kernels) { + HipTest::HIP_SKIP_TEST("Test requires support for concurrent kernel execution"); + return; + } + + AtomicExchParams params; + params.num_devices = 1; + params.kernel_count = kernel_count; + params.blocks = GenerateAtomicExchBlockDimensions(); + params.threads = GenerateAtomicExchThreadDimensions(); + params.width = width; + params.pitch = pitch; + + using LA = LinearAllocs; + for (const auto alloc_type : + {LA::hipMalloc, LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister}) { + params.alloc_type = alloc_type; + DYNAMIC_SECTION("Allocation type: " << to_string(alloc_type)) { + AtomicExch().run(params); + } + } +} + +template +void AtomicExchMultipleDeviceMultipleKernelAndHostTest(const unsigned int num_devices, + const unsigned int kernel_count, + const unsigned int width, + const unsigned int pitch, + const unsigned int host_thread_count = 0u) { + if (num_devices > 1) { + if (HipTest::getDeviceCount() < num_devices) { + std::string msg = std::to_string(num_devices) + " devices are required"; + HipTest::HIP_SKIP_TEST(msg.c_str()); + return; + } + } + + if (kernel_count > 1) { + for (auto i = 0u; i < num_devices; ++i) { + int concurrent_kernels = 0; + HIP_CHECK(hipDeviceGetAttribute(&concurrent_kernels, hipDeviceAttributeConcurrentKernels, i)); + if (!concurrent_kernels) { 
+ HipTest::HIP_SKIP_TEST("Test requires support for concurrent kernel execution"); + return; + } + } + } + + AtomicExchParams params; + params.num_devices = num_devices; + params.kernel_count = kernel_count; + params.blocks = GenerateAtomicExchBlockDimensions(); + params.threads = GenerateAtomicExchThreadDimensions(); + params.width = width; + params.pitch = pitch; + params.host_thread_count = host_thread_count; + + using LA = LinearAllocs; + for (const auto alloc_type : {LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister}) { + params.alloc_type = alloc_type; + DYNAMIC_SECTION("Allocation type: " << to_string(alloc_type)) { + AtomicExch().run(params); + } + } +} \ No newline at end of file diff --git a/projects/hip-tests/catch/unit/atomics/atomicExch_negative_kernels.cc b/projects/hip-tests/catch/unit/atomics/atomicExch_negative_kernels.cc new file mode 100644 index 0000000000..083d27747f --- /dev/null +++ b/projects/hip-tests/catch/unit/atomics/atomicExch_negative_kernels.cc @@ -0,0 +1,94 @@ +/* +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include + +struct Dummy { + __device__ Dummy() {} + __device__ ~Dummy() {} +}; + +/*int atomicExch(int*, int)*/ +__global__ void atomicExch_int_n1(int* p, int v) { atomicExch(p, p); } +__global__ void atomicExch_int_n2(int* p, int v) { atomicExch(&p, v); } +__global__ void atomicExch_int_n3(char* p, int v) { atomicExch(p, v); } +__global__ void atomicExch_int_n4(short* p, int v) { atomicExch(p, v); } +__global__ void atomicExch_int_n5(long* p, int v) { atomicExch(p, v); } +__global__ void atomicExch_int_n6(long long* p, int v) { atomicExch(p, v); } +__global__ void atomicExch_int_n7(Dummy* p, int v) { atomicExch(p, v); } +__global__ void atomicExch_int_n8(int* p, Dummy v) { atomicExch(p, v); } + +/*unsigned int atomicExch(unsigned int*, unsigned int)*/ +__global__ void atomicExch_unsigned_int_n1(unsigned int* p, unsigned int v) { atomicExch(p, p); } +__global__ void atomicExch_unsigned_int_n2(unsigned int* p, unsigned int v) { atomicExch(&p, v); } +__global__ void atomicExch_unsigned_int_n3(char* p, unsigned int v) { atomicExch(p, v); } +__global__ void atomicExch_unsigned_int_n4(short* p, unsigned int v) { atomicExch(p, v); } +__global__ void atomicExch_unsigned_int_n5(long* p, unsigned int v) { atomicExch(p, v); } +__global__ void atomicExch_unsigned_int_n6(long long* p, unsigned int v) { atomicExch(p, v); } +__global__ void atomicExch_unsigned_int_n7(Dummy* p, unsigned int v) { atomicExch(p, v); } +__global__ void atomicExch_unsigned_int_n8(unsigned int* p, Dummy v) { atomicExch(p, v); } + +// /*unsigned long long atomicExch(unsigned long long*, unsigned long long)*/ +__global__ void atomicExch_unsigned_long_long_n1(unsigned long long* p, unsigned long long v) { + atomicExch(p, p); +} +__global__ 
void atomicExch_unsigned_long_long_n2(unsigned long long* p, unsigned long long v) { + atomicExch(&p, v); +} +__global__ void atomicExch_unsigned_long_long_n3(char* p, unsigned long long v) { + atomicExch(p, v); +} +__global__ void atomicExch_unsigned_long_long_n4(short* p, unsigned long long v) { + atomicExch(p, v); +} +__global__ void atomicExch_unsigned_long_long_n5(long* p, unsigned long long v) { + atomicExch(p, v); +} +__global__ void atomicExch_unsigned_long_long_n6(long long* p, unsigned long long v) { + atomicExch(p, v); +} +__global__ void atomicExch_unsigned_long_long_n7(Dummy* p, unsigned long long v) { + atomicExch(p, v); +} +__global__ void atomicExch_unsigned_long_long_n8(unsigned long long* p, Dummy v) { + atomicExch(p, v); +} + +// /*float atomicExch(float*, float)*/ +__global__ void atomicExch_float_n1(float* p, float v) { atomicExch(p, p); } +__global__ void atomicExch_float_n2(float* p, float v) { atomicExch(&p, v); } +__global__ void atomicExch_float_n3(char* p, float v) { atomicExch(p, v); } +__global__ void atomicExch_float_n4(short* p, float v) { atomicExch(p, v); } +__global__ void atomicExch_float_n5(long* p, float v) { atomicExch(p, v); } +__global__ void atomicExch_float_n6(long long* p, float v) { atomicExch(p, v); } +__global__ void atomicExch_float_n7(Dummy* p, float v) { atomicExch(p, v); } +__global__ void atomicExch_float_n8(float* p, Dummy v) { atomicExch(p, v); } + +// /*double atomicExch(double*, double)*/ +__global__ void atomicExch_double_n1(double* p, double v) { atomicExch(p, p); } +__global__ void atomicExch_double_n2(double* p, double v) { atomicExch(&p, v); } +__global__ void atomicExch_double_n3(char* p, double v) { atomicExch(p, v); } +__global__ void atomicExch_double_n4(short* p, double v) { atomicExch(p, v); } +__global__ void atomicExch_double_n5(long* p, double v) { atomicExch(p, v); } +__global__ void atomicExch_double_n6(long long* p, double v) { atomicExch(p, v); } +__global__ void atomicExch_double_n7(Dummy* p, 
double v) { atomicExch(p, v); } +__global__ void atomicExch_double_n8(double* p, Dummy v) { atomicExch(p, v); } \ No newline at end of file diff --git a/projects/hip-tests/catch/unit/atomics/atomicExch_negative_kernels_rtc.hh b/projects/hip-tests/catch/unit/atomics/atomicExch_negative_kernels_rtc.hh new file mode 100644 index 0000000000..01387bba61 --- /dev/null +++ b/projects/hip-tests/catch/unit/atomics/atomicExch_negative_kernels_rtc.hh @@ -0,0 +1,124 @@ +/* +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#pragma once + +static constexpr auto kAtomicExchInt{ + R"( + struct Dummy { + __device__ Dummy() {} + __device__ ~Dummy() {} + }; + + __global__ void atomicExch_int_n1(int* p, int v) { atomicExch(p, p); } + __global__ void atomicExch_int_n2(int* p, int v) { atomicExch(&p, v); } + __global__ void atomicExch_int_n3(char* p, int v) { atomicExch(p, v); } + __global__ void atomicExch_int_n4(short* p, int v) { atomicExch(p, v); } + __global__ void atomicExch_int_n5(long* p, int v) { atomicExch(p, v); } + __global__ void atomicExch_int_n6(long long* p, int v) { atomicExch(p, v); } + __global__ void atomicExch_int_n7(Dummy* p, int v) { atomicExch(p, v); } + __global__ void atomicExch_int_n8(int* p, Dummy v) { atomicExch(p, v); } + )"}; + +static constexpr auto kAtomicExchUnsignedInt{ + R"( + struct Dummy { + __device__ Dummy() {} + __device__ ~Dummy() {} + }; + + __global__ void atomicExch_unsigned_int_n1(unsigned int* p, unsigned int v) { atomicExch(p, p); } + __global__ void atomicExch_unsigned_int_n2(unsigned int* p, unsigned int v) { atomicExch(&p, v); } + __global__ void atomicExch_unsigned_int_n3(char* p, unsigned int v) { atomicExch(p, v); } + __global__ void atomicExch_unsigned_int_n4(short* p, unsigned int v) { atomicExch(p, v); } + __global__ void atomicExch_unsigned_int_n5(long* p, unsigned int v) { atomicExch(p, v); } + __global__ void atomicExch_unsigned_int_n6(long long* p, unsigned int v) { atomicExch(p, v); } + __global__ void atomicExch_unsigned_int_n7(Dummy* p, unsigned int v) { atomicExch(p, v); } + __global__ void atomicExch_unsigned_int_n8(unsigned int* p, Dummy v) { atomicExch(p, v); } + )"}; + +static constexpr auto kAtomicExchULL{ + R"( + struct Dummy { + __device__ Dummy() {} + __device__ ~Dummy() {} + }; + + __global__ void atomicExch_unsigned_long_long_n1(unsigned long long* p, unsigned long long v) { + atomicExch(p, p); + } + __global__ void atomicExch_unsigned_long_long_n2(unsigned long long* p, unsigned long long v) { + atomicExch(&p, 
v); + } + __global__ void atomicExch_unsigned_long_long_n3(char* p, unsigned long long v) { + atomicExch(p, v); + } + __global__ void atomicExch_unsigned_long_long_n4(short* p, unsigned long long v) { + atomicExch(p, v); + } + __global__ void atomicExch_unsigned_long_long_n5(long* p, unsigned long long v) { + atomicExch(p, v); + } + __global__ void atomicExch_unsigned_long_long_n6(long long* p, unsigned long long v) { + atomicExch(p, v); + } + __global__ void atomicExch_unsigned_long_long_n7(Dummy* p, unsigned long long v) { + atomicExch(p, v); + } + __global__ void atomicExch_unsigned_long_long_n8(unsigned long long* p, Dummy v) { + atomicExch(p, v); + } + )"}; + +static constexpr auto kAtomicExchFloat{ + R"( + struct Dummy { + __device__ Dummy() {} + __device__ ~Dummy() {} + }; + + __global__ void atomicExch_float_n1(float* p, float v) { atomicExch(p, p); } + __global__ void atomicExch_float_n2(float* p, float v) { atomicExch(&p, v); } + __global__ void atomicExch_float_n3(char* p, float v) { atomicExch(p, v); } + __global__ void atomicExch_float_n4(short* p, float v) { atomicExch(p, v); } + __global__ void atomicExch_float_n5(long* p, float v) { atomicExch(p, v); } + __global__ void atomicExch_float_n6(long long* p, float v) { atomicExch(p, v); } + __global__ void atomicExch_float_n7(Dummy* p, float v) { atomicExch(p, v); } + __global__ void atomicExch_float_n8(float* p, Dummy v) { atomicExch(p, v); } + )"}; + +static constexpr auto kAtomicExchDouble{ + R"( + struct Dummy { + __device__ Dummy() {} + __device__ ~Dummy() {} + }; + + __global__ void atomicExch_double_n1(double* p, double v) { atomicExch(p, p); } + __global__ void atomicExch_double_n2(double* p, double v) { atomicExch(&p, v); } + __global__ void atomicExch_double_n3(char* p, double v) { atomicExch(p, v); } + __global__ void atomicExch_double_n4(short* p, double v) { atomicExch(p, v); } + __global__ void atomicExch_double_n5(long* p, double v) { atomicExch(p, v); } + __global__ void 
atomicExch_double_n6(long long* p, double v) { atomicExch(p, v); } + __global__ void atomicExch_double_n7(Dummy* p, double v) { atomicExch(p, v); } + __global__ void atomicExch_double_n8(double* p, Dummy v) { atomicExch(p, v); } + )"}; \ No newline at end of file diff --git a/projects/hip-tests/catch/unit/atomics/atomicExch_system.cc b/projects/hip-tests/catch/unit/atomics/atomicExch_system.cc new file mode 100644 index 0000000000..5cecd72968 --- /dev/null +++ b/projects/hip-tests/catch/unit/atomics/atomicExch_system.cc @@ -0,0 +1,235 @@ +/* +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include "atomicExch_common.hh" +#include "atomicExch_system_negative_kernels_rtc.hh" + +/** + * @addtogroup atomicExch_system atomicExch_system + * @{ + * @ingroup AtomicsTest + */ + +/** + * Test Description + * ------------------------ + * - Executes a kernel two times concurrently on two devices wherein all threads will perform + * an atomic exchange into a runtime determined memory location. Each thread will exchange its own + * grid wide linear index + offset into the memory location, storing the return value into a + * separate output array slot corresponding to it. Once complete, the union of output array and + * exchange memory is validated to contain all values in the range [0, number_of_threads + + * number_of_exchange_memory_slots). Several memory access patterns are tested: + * -# All threads exchange to a single memory location + * -# Each thread exchanges into an array containing warp_size elements, using tid % warp_size + * for indexing + * -# Same as the above, but the exchange elements are spread out by L1 cache line size bytes. 
+ * + * - The test is run for: + * - All overloads of atomicExch_system + * - hipMallocManaged, hipHostMalloc and hipHostRegister allocated exchange memory + * - Several grid and block dimension combinations + * Test source + * ------------------------ + * - unit/atomics/atomicExch_system.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +#if HT_NVIDIA +TEMPLATE_TEST_CASE("Unit_atomicExch_system_Positive_Peer_GPUs", "", int, unsigned int, + unsigned long long, float) { +#else +TEMPLATE_TEST_CASE("Unit_atomicExch_system_Positive_Peer_GPUs", "", int, unsigned int, + unsigned long, unsigned long long, float, double) { +#endif // HT_NVIDIA + int warp_size = 0; + HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0)); + const auto cache_line_size = 128u; + + for (auto current = 0; current < cmd_options.iterations; ++current) { + DYNAMIC_SECTION("Same address " << current) { + AtomicExchMultipleDeviceMultipleKernelAndHostTest(2, 2, 1, sizeof(TestType)); + } + + DYNAMIC_SECTION("Adjacent addresses " << current) { + AtomicExchMultipleDeviceMultipleKernelAndHostTest(2, 2, warp_size, + sizeof(TestType)); + } + + DYNAMIC_SECTION("Scattered addresses " << current) { + AtomicExchMultipleDeviceMultipleKernelAndHostTest(2, 2, warp_size, cache_line_size); + } + } +} + +/** + * Test Description + * ------------------------ + * - Executes a kernel on a single device wherein all threads will perform an atomic exchange + * into a runtime determined memory location. Each thread will exchange its own grid wide linear + * index + offset into the memory location, storing the return value into a separate output array + * slot corresponding to it. While the kernel is running, the host performs atomic exchanges, in 4 + * threads, into the same memory location(s). 
Once complete, the union of output array, exchange + * memory, and host output is validated to contain all values in the range [0, number_of_threads + + * number_of_exchange_memory_slots + number_of_host_iterations). Several memory access patterns are + * tested: + * -# All threads exchange to a single memory location + * -# Each thread exchanges into an array containing warp_size elements, using tid % warp_size + * for indexing + * -# Same as the above, but the exchange elements are spread out by L1 cache line size bytes. + * + * - The test is run for: + * - All overloads of atomicExch_system + * - hipMallocManaged, hipHostMalloc and hipHostRegister allocated exchange memory + * - Several grid and block dimension combinations + * Test source + * ------------------------ + * - unit/atomics/atomicExch_system.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +#if HT_NVIDIA +TEMPLATE_TEST_CASE("Unit_atomicExch_system_Positive_Host_And_GPU", "", int, unsigned int, + unsigned long long, float) { +#else +TEMPLATE_TEST_CASE("Unit_atomicExch_system_Positive_Host_And_GPU", "", int, unsigned int, + unsigned long, unsigned long long, float, double) { +#endif // HT_NVIDIA + int warp_size = 0; + HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0)); + const auto cache_line_size = 128u; + + for (auto current = 0; current < cmd_options.iterations; ++current) { + DYNAMIC_SECTION("Same address " << current) { + AtomicExchMultipleDeviceMultipleKernelAndHostTest(1, 1, 1, sizeof(TestType), 4); + } + + DYNAMIC_SECTION("Adjacent addresses " << current) { + AtomicExchMultipleDeviceMultipleKernelAndHostTest(1, 1, warp_size, sizeof(TestType), + 4); + } + + DYNAMIC_SECTION("Scattered addresses " << current) { + AtomicExchMultipleDeviceMultipleKernelAndHostTest(1, 1, warp_size, cache_line_size, + 4); + } + } +} + +/** + * Test Description + * ------------------------ + * - Executes a kernel two times concurrently on two devices wherein 
all threads will perform + * an atomic exchange into a runtime determined memory location. Each thread will exchange its own + * grid wide linear index + offset into the memory location, storing the return value into a + * separate output array slot corresponding to it. While the kernels are running, the + * host performs atomic exchanges, in 4 threads, into the same memory location(s). Once complete, + * the union of output array, exchange memory, and host output is validated to contain all values in + * the range [0, number_of_threads + number_of_exchange_memory_slots + number_of_host_iterations). + * Several memory access patterns are tested: + * -# All threads exchange to a single memory location + * -# Each thread exchanges into an array containing warp_size elements, using tid % warp_size + * for indexing + * -# Same as the above, but the exchange elements are spread out by L1 cache line size bytes. + * + * - The test is run for: + * - All overloads of atomicExch_system + * - hipMallocManaged, hipHostMalloc and hipHostRegister allocated exchange memory + * - Several grid and block dimension combinations + * Test source + * ------------------------ + * - unit/atomics/atomicExch_system.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +#if HT_NVIDIA +TEMPLATE_TEST_CASE("Unit_atomicExch_system_Positive_Host_And_Peer_GPUs", "", int, unsigned int, + unsigned long long, float) { +#else +TEMPLATE_TEST_CASE("Unit_atomicExch_system_Positive_Host_And_Peer_GPUs", "", int, unsigned int, + unsigned long, unsigned long long, float, double) { +#endif // HT_NVIDIA + int warp_size = 0; + HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0)); + const auto cache_line_size = 128u; + + for (auto current = 0; current < cmd_options.iterations; ++current) { + DYNAMIC_SECTION("Same address " << current) { + AtomicExchMultipleDeviceMultipleKernelAndHostTest(2, 2, 1, sizeof(TestType), 4); + } + + DYNAMIC_SECTION("Adjacent 
addresses " << current) { + AtomicExchMultipleDeviceMultipleKernelAndHostTest(2, 2, warp_size, sizeof(TestType), + 4); + } + + DYNAMIC_SECTION("Scattered addresses " << current) { + AtomicExchMultipleDeviceMultipleKernelAndHostTest(2, 2, warp_size, cache_line_size, + 4); + } + } +} + +/** + * Test Description + * ------------------------ + * - RTCs kernels that pass combinations of arguments of invalid types for all overloads of + * atomicExch_system + * Test source + * ------------------------ + * - unit/atomics/atomicExch_system.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEST_CASE("Unit_atomicExch_system_Negative_Parameters_RTC") { + hiprtcProgram program{}; + + const auto program_source = + GENERATE(kAtomicExchSystemInt, kAtomicExchSystemUnsignedInt, kAtomicExchSystemULL, + kAtomicExchSystemFloat, kAtomicExchSystemDouble); + HIPRTC_CHECK( + hiprtcCreateProgram(&program, program_source, "atomicExch_negative.cc", 0, nullptr, nullptr)); + hiprtcResult result{hiprtcCompileProgram(program, 0, nullptr)}; + + // Get the compile log and count compiler error messages + size_t log_size{}; + HIPRTC_CHECK(hiprtcGetProgramLogSize(program, &log_size)); + std::string log(log_size, ' '); + HIPRTC_CHECK(hiprtcGetProgramLog(program, log.data())); + int error_count{0}; + + int expected_error_count{8}; + std::string error_message{"error:"}; + + size_t n_pos = log.find(error_message, 0); + while (n_pos != std::string::npos) { + ++error_count; + n_pos = log.find(error_message, n_pos + 1); + } + + HIPRTC_CHECK(hiprtcDestroyProgram(&program)); + HIPRTC_CHECK_ERROR(result, HIPRTC_ERROR_COMPILATION); + REQUIRE(error_count == expected_error_count); +} diff --git a/projects/hip-tests/catch/unit/atomics/atomicExch_system_negative_kernels.cc b/projects/hip-tests/catch/unit/atomics/atomicExch_system_negative_kernels.cc new file mode 100644 index 0000000000..15085f7da3 --- /dev/null +++ 
b/projects/hip-tests/catch/unit/atomics/atomicExch_system_negative_kernels.cc @@ -0,0 +1,112 @@ +/* +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include + +struct Dummy { + __device__ Dummy() {} + __device__ ~Dummy() {} +}; + +/*int atomicExch_system(int*, int)*/ +__global__ void atomicExch_system_int_n1(int* p, int v) { atomicExch_system(p, p); } +__global__ void atomicExch_system_int_n2(int* p, int v) { atomicExch_system(&p, v); } +__global__ void atomicExch_system_int_n3(char* p, int v) { atomicExch_system(p, v); } +__global__ void atomicExch_system_int_n4(short* p, int v) { atomicExch_system(p, v); } +__global__ void atomicExch_system_int_n5(long* p, int v) { atomicExch_system(p, v); } +__global__ void atomicExch_system_int_n6(long long* p, int v) { atomicExch_system(p, v); } +__global__ void atomicExch_system_int_n7(Dummy* p, int v) { atomicExch_system(p, v); } +__global__ void atomicExch_system_int_n8(int* p, Dummy v) { atomicExch_system(p, v); } + +/*unsigned int atomicExch_system(unsigned int*, unsigned int)*/ +__global__ void atomicExch_system_unsigned_int_n1(unsigned int* p, unsigned int v) { + atomicExch_system(p, p); +} +__global__ void atomicExch_system_unsigned_int_n2(unsigned int* p, unsigned int v) { + atomicExch_system(&p, v); +} +__global__ void atomicExch_system_unsigned_int_n3(char* p, unsigned int v) { + atomicExch_system(p, v); +} +__global__ void atomicExch_system_unsigned_int_n4(short* p, unsigned int v) { + atomicExch_system(p, v); +} +__global__ void atomicExch_system_unsigned_int_n5(long* p, unsigned int v) { + atomicExch_system(p, v); +} +__global__ void atomicExch_system_unsigned_int_n6(long long* p, unsigned int v) { + atomicExch_system(p, v); +} +__global__ void atomicExch_system_unsigned_int_n7(Dummy* p, unsigned int v) { + atomicExch_system(p, v); +} +__global__ void atomicExch_system_unsigned_int_n8(unsigned int* p, Dummy v) { + atomicExch_system(p, v); +} + +// /*unsigned long long atomicExch_system(unsigned long long*, unsigned long long)*/ +__global__ void atomicExch_system_unsigned_long_long_n1(unsigned long long* p, + unsigned long long v) { + 
atomicExch_system(p, p); +} +__global__ void atomicExch_system_unsigned_long_long_n2(unsigned long long* p, + unsigned long long v) { + atomicExch_system(&p, v); +} +__global__ void atomicExch_system_unsigned_long_long_n3(char* p, unsigned long long v) { + atomicExch_system(p, v); +} +__global__ void atomicExch_system_unsigned_long_long_n4(short* p, unsigned long long v) { + atomicExch_system(p, v); +} +__global__ void atomicExch_system_unsigned_long_long_n5(long* p, unsigned long long v) { + atomicExch_system(p, v); +} +__global__ void atomicExch_system_unsigned_long_long_n6(long long* p, unsigned long long v) { + atomicExch_system(p, v); +} +__global__ void atomicExch_system_unsigned_long_long_n7(Dummy* p, unsigned long long v) { + atomicExch_system(p, v); +} +__global__ void atomicExch_system_unsigned_long_long_n8(unsigned long long* p, Dummy v) { + atomicExch_system(p, v); +} + +// /*float atomicExch_system(float*, float)*/ +__global__ void atomicExch_system_float_n1(float* p, float v) { atomicExch_system(p, p); } +__global__ void atomicExch_system_float_n2(float* p, float v) { atomicExch_system(&p, v); } +__global__ void atomicExch_system_float_n3(char* p, float v) { atomicExch_system(p, v); } +__global__ void atomicExch_system_float_n4(short* p, float v) { atomicExch_system(p, v); } +__global__ void atomicExch_system_float_n5(long* p, float v) { atomicExch_system(p, v); } +__global__ void atomicExch_system_float_n6(long long* p, float v) { atomicExch_system(p, v); } +__global__ void atomicExch_system_float_n7(Dummy* p, float v) { atomicExch_system(p, v); } +__global__ void atomicExch_system_float_n8(float* p, Dummy v) { atomicExch_system(p, v); } + +// /*double atomicExch_system(double*, double)*/ +__global__ void atomicExch_system_double_n1(double* p, double v) { atomicExch_system(p, p); } +__global__ void atomicExch_system_double_n2(double* p, double v) { atomicExch_system(&p, v); } +__global__ void atomicExch_system_double_n3(char* p, double v) { 
atomicExch_system(p, v); } +__global__ void atomicExch_system_double_n4(short* p, double v) { atomicExch_system(p, v); } +__global__ void atomicExch_system_double_n5(long* p, double v) { atomicExch_system(p, v); } +__global__ void atomicExch_system_double_n6(long long* p, double v) { atomicExch_system(p, v); } +__global__ void atomicExch_system_double_n7(Dummy* p, double v) { atomicExch_system(p, v); } +__global__ void atomicExch_system_double_n8(double* p, Dummy v) { atomicExch_system(p, v); } \ No newline at end of file diff --git a/projects/hip-tests/catch/unit/atomics/atomicExch_system_negative_kernels_rtc.hh b/projects/hip-tests/catch/unit/atomics/atomicExch_system_negative_kernels_rtc.hh new file mode 100644 index 0000000000..69fc31fb35 --- /dev/null +++ b/projects/hip-tests/catch/unit/atomics/atomicExch_system_negative_kernels_rtc.hh @@ -0,0 +1,142 @@ +/* +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#pragma once + +static constexpr auto kAtomicExchSystemInt{ + R"( + struct Dummy { + __device__ Dummy() {} + __device__ ~Dummy() {} + }; + + __global__ void atomicExch_system_int_n1(int* p, int v) { atomicExch_system(p, p); } + __global__ void atomicExch_system_int_n2(int* p, int v) { atomicExch_system(&p, v); } + __global__ void atomicExch_system_int_n3(char* p, int v) { atomicExch_system(p, v); } + __global__ void atomicExch_system_int_n4(short* p, int v) { atomicExch_system(p, v); } + __global__ void atomicExch_system_int_n5(long* p, int v) { atomicExch_system(p, v); } + __global__ void atomicExch_system_int_n6(long long* p, int v) { atomicExch_system(p, v); } + __global__ void atomicExch_system_int_n7(Dummy* p, int v) { atomicExch_system(p, v); } + __global__ void atomicExch_system_int_n8(int* p, Dummy v) { atomicExch_system(p, v); } + )"}; + +static constexpr auto kAtomicExchSystemUnsignedInt{ + R"( + struct Dummy { + __device__ Dummy() {} + __device__ ~Dummy() {} + }; + + __global__ void atomicExch_system_unsigned_int_n1(unsigned int* p, unsigned int v) { + atomicExch_system(p, p); + } + __global__ void atomicExch_system_unsigned_int_n2(unsigned int* p, unsigned int v) { + atomicExch_system(&p, v); + } + __global__ void atomicExch_system_unsigned_int_n3(char* p, unsigned int v) { + atomicExch_system(p, v); + } + __global__ void atomicExch_system_unsigned_int_n4(short* p, unsigned int v) { + atomicExch_system(p, v); + } + __global__ void atomicExch_system_unsigned_int_n5(long* p, unsigned int v) { + atomicExch_system(p, v); + } + __global__ void atomicExch_system_unsigned_int_n6(long long* p, unsigned int v) { + atomicExch_system(p, v); + } + __global__ void atomicExch_system_unsigned_int_n7(Dummy* p, unsigned int v) { + atomicExch_system(p, v); + } + __global__ void atomicExch_system_unsigned_int_n8(unsigned int* p, Dummy v) { + atomicExch_system(p, v); + } + )"}; + +static constexpr auto kAtomicExchSystemULL{ + R"( + struct Dummy { + __device__ 
Dummy() {} + __device__ ~Dummy() {} + }; + + __global__ void atomicExch_system_unsigned_long_long_n1(unsigned long long* p, + unsigned long long v) { + atomicExch_system(p, p); + } + __global__ void atomicExch_system_unsigned_long_long_n2(unsigned long long* p, + unsigned long long v) { + atomicExch_system(&p, v); + } + __global__ void atomicExch_system_unsigned_long_long_n3(char* p, unsigned long long v) { + atomicExch_system(p, v); + } + __global__ void atomicExch_system_unsigned_long_long_n4(short* p, unsigned long long v) { + atomicExch_system(p, v); + } + __global__ void atomicExch_system_unsigned_long_long_n5(long* p, unsigned long long v) { + atomicExch_system(p, v); + } + __global__ void atomicExch_system_unsigned_long_long_n6(long long* p, unsigned long long v) { + atomicExch_system(p, v); + } + __global__ void atomicExch_system_unsigned_long_long_n7(Dummy* p, unsigned long long v) { + atomicExch_system(p, v); + } + __global__ void atomicExch_system_unsigned_long_long_n8(unsigned long long* p, Dummy v) { + atomicExch_system(p, v); + } + )"}; + +static constexpr auto kAtomicExchSystemFloat{ + R"( + struct Dummy { + __device__ Dummy() {} + __device__ ~Dummy() {} + }; + + __global__ void atomicExch_system_float_n1(float* p, float v) { atomicExch_system(p, p); } + __global__ void atomicExch_system_float_n2(float* p, float v) { atomicExch_system(&p, v); } + __global__ void atomicExch_system_float_n3(char* p, float v) { atomicExch_system(p, v); } + __global__ void atomicExch_system_float_n4(short* p, float v) { atomicExch_system(p, v); } + __global__ void atomicExch_system_float_n5(long* p, float v) { atomicExch_system(p, v); } + __global__ void atomicExch_system_float_n6(long long* p, float v) { atomicExch_system(p, v); } + __global__ void atomicExch_system_float_n7(Dummy* p, float v) { atomicExch_system(p, v); } + __global__ void atomicExch_system_float_n8(float* p, Dummy v) { atomicExch_system(p, v); } + )"}; + +static constexpr auto kAtomicExchSystemDouble{ 
+ R"( + struct Dummy { + __device__ Dummy() {} + __device__ ~Dummy() {} + }; + + __global__ void atomicExch_system_double_n1(double* p, double v) { atomicExch_system(p, p); } + __global__ void atomicExch_system_double_n2(double* p, double v) { atomicExch_system(&p, v); } + __global__ void atomicExch_system_double_n3(char* p, double v) { atomicExch_system(p, v); } + __global__ void atomicExch_system_double_n4(short* p, double v) { atomicExch_system(p, v); } + __global__ void atomicExch_system_double_n5(long* p, double v) { atomicExch_system(p, v); } + __global__ void atomicExch_system_double_n6(long long* p, double v) { atomicExch_system(p, v); } + __global__ void atomicExch_system_double_n7(Dummy* p, double v) { atomicExch_system(p, v); } + __global__ void atomicExch_system_double_n8(double* p, Dummy v) { atomicExch_system(p, v); } + )"}; \ No newline at end of file diff --git a/projects/hip-tests/catch/unit/compileAndCaptureOutput.py b/projects/hip-tests/catch/unit/compileAndCaptureOutput.py new file mode 100644 index 0000000000..c06feb5858 --- /dev/null +++ b/projects/hip-tests/catch/unit/compileAndCaptureOutput.py @@ -0,0 +1,107 @@ +# Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +import subprocess +import sys +import unittest + +class CompileAndCapture(unittest.TestCase): + path = None + expected_error_count = 0 + expected_warning_count = 0 + hip_path = None + file = None + error_string = None + warning_string = None + platform = None + + def setUp(self): + self.error_string = 'error:' + self.warning_string = 'warning:' + self.assertFalse(self.hip_path == None) + self.assertFalse(self.path == None) + self.assertFalse(self.file == None) + self.assertTrue(self.platform == 'amd' or self.platform == 'nvidia') + + def test(self): + compiler_args = [ + self.hip_path + '/bin/hipcc', + '-I' + self.path + '/../../external/Catch2', + '-I' + self.path + '/../../include', + '-I' + self.path + '/../../external/picojson', + '-c', + self.path + '/' + self.file, + ] + # HIP compiler on AMD platforms has limit of 20 errors, and some negative + # test cases expect that more errors are detected. + if (self.platform == 'amd'): + compiler_args.append('-ferror-limit=100') + compiler_output = subprocess.run(compiler_args, stderr=subprocess.PIPE) + # Get the compiler output in the stdout if -V flag is raised during ctest invocation. 
+ compiler_stderr = compiler_output.stderr.decode('UTF-8') + print(compiler_stderr) + + error_count = compiler_stderr.count(self.error_string) + if self.expected_error_count < 0: + self.assertGreater(error_count, 0) + else: + self.assertEqual(error_count, self.expected_error_count) + + warning_count = compiler_stderr.count(self.warning_string) + if self.expected_warning_count < 0: + self.assertGreater(warning_count, 0) + else: + self.assertEqual(warning_count, self.expected_warning_count) + +if __name__ == '__main__': + try: + CompileAndCapture.path = sys.argv[1] + except IndexError: + CompileAndCapture.path = None + + try: + CompileAndCapture.platform = sys.argv[2] + except IndexError: + CompileAndCapture.platform = None + + try: + CompileAndCapture.hip_path = sys.argv[3] + except IndexError: + CompileAndCapture.hip_path = None + + try: + CompileAndCapture.file = sys.argv[4] + except IndexError: + CompileAndCapture.file = None + + try: + CompileAndCapture.expected_error_count = int(sys.argv[5]) + except IndexError: + CompileAndCapture.expected_error_count = 0 + + try: + CompileAndCapture.expected_warning_count = int(sys.argv[6]) + except IndexError: + CompileAndCapture.expected_warning_count = 0 + + # Unittest looks at the same argv's as the __main__ and doesn't know how + # to handle arguments other than the executable (0). Therefore passing only + # executable as the argv for unittest module. 
+ unittest.main(argv=[sys.argv[0]]) diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/CMakeLists.txt b/projects/hip-tests/catch/unit/cooperativeGrps/CMakeLists.txt index 829758b622..1090a68382 100644 --- a/projects/hip-tests/catch/unit/cooperativeGrps/CMakeLists.txt +++ b/projects/hip-tests/catch/unit/cooperativeGrps/CMakeLists.txt @@ -1,26 +1,25 @@ # Common Tests - Test independent of all platforms set(TEST_SRC - hipCGThreadBlockType.cc - hipCGThreadBlockTypeViaBaseType.cc - hipCGThreadBlockTypeViaPublicApi.cc - hipCGMultiGridGroupType.cc - hipCGMultiGridGroupTypeViaBaseType.cc - hipCGMultiGridGroupTypeViaPublicApi.cc + hipCGThreadBlockType_old.cc + hipCGMultiGridGroupType_old.cc + hipCGGridGroupType_old.cc + hipCGTiledPartitionType_old.cc + hipCGThreadBlockTileTypeShfl_old.cc + hipCGCoalescedGroups_old.cc + hipLaunchCooperativeKernel_old.cc + hipLaunchCooperativeKernelMultiDevice_old.cc grid_group.cc coalesced_groups_shfl_down.cc coalesced_groups_shfl_up.cc - hipCGTiledPartition.cc - hipCGCoalescedGroups.cc coalesced_tiled_groups_metagrp.cc ) if(HIP_PLATFORM STREQUAL "nvidia") - set_source_files_properties(hipCGMultiGridGroupType.cc PROPERTIES COMPILE_FLAGS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80") - set_source_files_properties(hipCGMultiGridGroupTypeViaBaseType.cc PROPERTIES COMPILE_FLAGS "-D_CG_ABI_EXPERIMENTAL -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80") - set_source_files_properties(hipCGMultiGridGroupTypeViaPublicApi.cc PROPERTIES COMPILE_FLAGS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80") + set_source_files_properties(hipCGMultiGridGroupType_old.cc PROPERTIES COMPILE_FLAGS "-D_CG_ABI_EXPERIMENTAL -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80") + 
set_source_files_properties(hipLaunchCooperativeKernelMultiDevice_old.cc PROPERTIES COMPILE_FLAGS "-D_CG_ABI_EXPERIMENTAL -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80") hip_add_exe_to_target(NAME coopGrpTest TEST_SRC ${TEST_SRC} TEST_TARGET_NAME build_tests - LINKER_LIBS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80") + LINKER_LIBS "-rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80, -gencode arch=compute_86,code=sm_86, -gencode=arch=compute_86,code=compute_86") else() hip_add_exe_to_target(NAME coopGrpTest TEST_SRC ${TEST_SRC} diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/hipCGCoalescedGroups.cc b/projects/hip-tests/catch/unit/cooperativeGrps/hipCGCoalescedGroups_old.cc similarity index 100% rename from projects/hip-tests/catch/unit/cooperativeGrps/hipCGCoalescedGroups.cc rename to projects/hip-tests/catch/unit/cooperativeGrps/hipCGCoalescedGroups_old.cc diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/hipCGGridGroupType_old.cc b/projects/hip-tests/catch/unit/cooperativeGrps/hipCGGridGroupType_old.cc new file mode 100644 index 0000000000..9f908a5be6 --- /dev/null +++ b/projects/hip-tests/catch/unit/cooperativeGrps/hipCGGridGroupType_old.cc @@ -0,0 +1,496 @@ +/* +Copyright (c) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#include +#include + +#include "hip_cg_common.hh" + +namespace cg = cooperative_groups; + +static __device__ int gm[2]; + +static __global__ void kernel_cg_grid_group_type(int* size_dev, int* thd_rank_dev, + int* is_valid_dev, int* sync_dev) { + cg::grid_group gg = cg::this_grid(); + int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x; + + // Test size + size_dev[gIdx] = gg.size(); + + // Test thread_rank + thd_rank_dev[gIdx] = gg.thread_rank(); + + // Test is_valid + is_valid_dev[gIdx] = gg.is_valid(); + + // Test sync + if (blockIdx.x == 0 && threadIdx.x == 0) + gm[0] = 10; + else if (blockIdx.x == 1 && threadIdx.x == 0) + gm[1] = 20; + gg.sync(); + sync_dev[gIdx] = gm[1] * gm[0]; +} + +static __global__ void kernel_cg_grid_group_type_via_base_type(int* size_dev, int* thd_rank_dev, + int* is_valid_dev, int* sync_dev) { + cg::thread_group tg = cg::this_grid(); + int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x; + + // Test size + size_dev[gIdx] = tg.size(); + + // Test thread_rank + thd_rank_dev[gIdx] = tg.thread_rank(); + + // Test is_valid +#ifdef __HIP_PLATFORM_AMD__ + is_valid_dev[gIdx] = tg.is_valid(); +#else + // Cuda has no thread_group.is_valid() + is_valid_dev[gIdx] = true; +#endif + + // Test sync + if (blockIdx.x == 0 && threadIdx.x == 0) + gm[0] = 10; + else if (blockIdx.x == 1 && threadIdx.x == 0) + gm[1] = 20; + tg.sync(); + sync_dev[gIdx] = gm[1] * gm[0]; +} + +static __global__ void kernel_cg_grid_group_type_via_public_api(int* size_dev, int* thd_rank_dev, + int* is_valid_dev, int* sync_dev) { + cg::grid_group gg = cg::this_grid(); + int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x; + + // Test group_size api + size_dev[gIdx] = cg::group_size(gg); + + // Test thread_rank api + thd_rank_dev[gIdx] = cg::thread_rank(gg); + + // Test is_valid api + is_valid_dev[gIdx] = gg.is_valid(); + + // Test sync + if (blockIdx.x == 0 && threadIdx.x == 0) + gm[0] = 10; + else if (blockIdx.x == 1 && threadIdx.x == 0) + gm[1] = 20; + cg::sync(gg); + 
sync_dev[gIdx] = gm[1] * gm[0]; +} + +static __global__ void coop_kernel(unsigned int* first_array, unsigned int* second_array, + unsigned int loops, unsigned int array_len) { + cg::grid_group grid = cg::this_grid(); + unsigned int rank = grid.thread_rank(); + unsigned int grid_size = grid.size(); + + for (int i = 0; i < loops; i++) { + // The goal of this loop is to directly add in values from + // array one into array two, on a per-wave basis. + for (int offset = rank; offset < array_len; offset += grid_size) { + second_array[offset] += first_array[offset]; + } + + grid.sync(); + + // The goal of this loop is to pull data the "mirror" lane in + // array two and add it back into array one. This causes inter- + // thread swizzling. + for (int offset = rank; offset < array_len; offset += grid_size) { + unsigned int swizzle_offset = array_len - offset - 1; + first_array[offset] += second_array[swizzle_offset]; + } + + grid.sync(); + } +} + +static __global__ void test_kernel(unsigned int* atomic_val, unsigned int* array, + unsigned int loops) { + cg::grid_group grid = cg::this_grid(); + unsigned rank = grid.thread_rank(); + + int offset = blockIdx.x; + for (int i = 0; i < loops; i++) { + // Make the last thread run way behind everyone else. + // If the barrier below fails, then the other threads may hit the + // atomicInc instruction many times before the last thread ever gets to it. + // As such, without the barrier, the last array entry will eventually + // contain a very large value, defined by however many times the other + // wavefronts make it through this loop. + // If the barrier works, then it will likely contain some number + // near "total number of blocks". It will be the last wavefront to + // reach the atomicInc, but everyone will have only hit the atomic once. 
+ if (rank == (grid.size() - 1)) { + long long time_diff = 0; + long long last_clock = clock64(); + do { + long long cur_clock = clock64(); + if (cur_clock > last_clock) { + time_diff += (cur_clock - last_clock); + } + // If it rolls over, we don't know how much to add to catch up. + // So just ignore those slipped cycles. + last_clock = cur_clock; + } while (time_diff < 1000000); + } + + if (threadIdx.x == 0) { + array[offset] = atomicInc(&atomic_val[0], UINT_MAX); + } + grid.sync(); + offset += gridDim.x; + } +} + +__global__ void test_kernel_gfx11(unsigned int* atomic_val, unsigned int* array, + unsigned int loops) { +#if HT_AMD + cg::grid_group grid = cg::this_grid(); + unsigned rank = grid.thread_rank(); + + int offset = blockIdx.x; + for (int i = 0; i < loops; i++) { + // Make the last thread run way behind everyone else. + // If the barrier below fails, then the other threads may hit the + // atomicInc instruction many times before the last thread ever gets + // to it. + // As such, without the barrier, the last array entry will eventually + // contain a very large value, defined by however many times the other + // wavefronts make it through this loop. + // If the barrier works, then it will likely contain some number + // near "total number of blocks". It will be the last wavefront to + // reach the atomicInc, but everyone will have only hit the atomic once. + if (rank == (grid.size() - 1)) { + long long time_diff = 0; + long long last_clock = wall_clock64(); + do { + long long cur_clock = wall_clock64(); + if (cur_clock > last_clock) { + time_diff += (cur_clock - last_clock); + } + // If it rolls over, we don't know how much to add to catch up. + // So just ignore those slipped cycles. 
+ last_clock = cur_clock; + } while (time_diff < 1000000); + } + + if (threadIdx.x == 0) { + array[offset] = atomicInc(&atomic_val[0], UINT_MAX); + } + grid.sync(); + offset += gridDim.x; + } +#endif +} + +static void verify_coop_buffers(unsigned int* host_input, unsigned int* first_array, + unsigned int* second_array, unsigned int loops, + unsigned int array_len) { + unsigned int* expected_first_array = host_input; + unsigned int* expected_second_array = + reinterpret_cast(malloc(sizeof(unsigned int) * array_len)); + memset(expected_second_array, 0, sizeof(unsigned int) * array_len); + + for (int i = 0; i < loops; i++) { + for (int offset = 0; offset < array_len; offset++) { + expected_second_array[offset] += expected_first_array[offset]; + } + + for (int offset = 0; offset < array_len; offset++) { + unsigned int swizzle_offset = array_len - offset - 1; + expected_first_array[offset] += expected_second_array[swizzle_offset]; + } + } + + for (int i = 0; i < array_len; i++) { + REQUIRE(first_array[i] == expected_first_array[i]); + REQUIRE(second_array[i] == expected_second_array[i]); + } + + free(expected_second_array); +} + +static void verify_barrier_buffer(unsigned int loops, unsigned int warps, + unsigned int* host_buffer) { + unsigned int max_in_this_loop = 0; + for (unsigned int i = 0; i < loops; i++) { + max_in_this_loop += warps; + for (unsigned int j = 0; j < warps; j++) { + REQUIRE(host_buffer[i * warps + j] <= max_in_this_loop); + } + } +} + +template static void test_cg_grid_group_type(F kernel_func, int block_size) { + int num_bytes = sizeof(int) * 2 * block_size; + int *size_dev, *size_host; + int *thd_rank_dev, *thd_rank_host; + int *is_valid_dev, *is_valid_host; + int *sync_dev, *sync_host; + + // Allocate device memory + HIP_CHECK(hipMalloc(&size_dev, num_bytes)); + HIP_CHECK(hipMalloc(&thd_rank_dev, num_bytes)); + HIP_CHECK(hipMalloc(&is_valid_dev, num_bytes)); + HIP_CHECK(hipMalloc(&sync_dev, num_bytes)); + + // Allocate host memory + 
HIP_CHECK(hipHostMalloc(&size_host, num_bytes)); + HIP_CHECK(hipHostMalloc(&thd_rank_host, num_bytes)); + HIP_CHECK(hipHostMalloc(&is_valid_host, num_bytes)); + HIP_CHECK(hipHostMalloc(&sync_host, num_bytes)); + + // Launch Kernel + void* params[4]; + params[0] = &size_dev; + params[1] = &thd_rank_dev; + params[2] = &is_valid_dev; + params[3] = &sync_dev; + HIP_CHECK(hipLaunchCooperativeKernel(kernel_func, 2, block_size, params, 0, 0)); + + // Copy result from device to host + HIP_CHECK(hipMemcpy(size_host, size_dev, num_bytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(thd_rank_host, thd_rank_dev, num_bytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(is_valid_host, is_valid_dev, num_bytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(sync_host, sync_dev, num_bytes, hipMemcpyDeviceToHost)); + + // Validate results for both blocks together + for (int i = 0; i < 2 * block_size; ++i) { + ASSERT_EQUAL(size_host[i], 2 * block_size); + ASSERT_EQUAL(thd_rank_host[i], i); + ASSERT_EQUAL(is_valid_host[i], 1); + ASSERT_EQUAL(sync_host[i], 200); + } + + // Free device memory + HIP_CHECK(hipFree(size_dev)); + HIP_CHECK(hipFree(thd_rank_dev)); + HIP_CHECK(hipFree(is_valid_dev)); + HIP_CHECK(hipFree(sync_dev)); + + // Free host memory + HIP_CHECK(hipHostFree(size_host)); + HIP_CHECK(hipHostFree(thd_rank_host)); + HIP_CHECK(hipHostFree(is_valid_host)); + HIP_CHECK(hipHostFree(sync_host)); +} + +TEST_CASE("Unit_hipCGGridGroupType_Basic") { + // Use default device for validating the test + int device; + hipDeviceProp_t device_properties; + HIP_CHECK(hipGetDevice(&device)); + HIP_CHECK(hipGetDeviceProperties(&device_properties, device)); + + if (!device_properties.cooperativeLaunch) { + HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!"); + return; + } + + void* (*kernel_func)(void); + + SECTION("Default grid group API test") { + kernel_func = reinterpret_cast(kernel_cg_grid_group_type); + } +#if HT_AMD + SECTION("Base type grid group API test") { + 
kernel_func = reinterpret_cast(kernel_cg_grid_group_type_via_base_type); + } +#endif + + SECTION("Public API grid group test") { + kernel_func = reinterpret_cast(kernel_cg_grid_group_type_via_public_api); + } + + // Test for block_size in powers of 2 + int max_threads_per_blk = device_properties.maxThreadsPerBlock; + for (int block_size = 2; block_size <= max_threads_per_blk; block_size = block_size * 2) { + test_cg_grid_group_type(kernel_func, block_size); + } + + // Test for random blockSizes, but the sequence is the same every execution + srand(0); + for (int i = 0; i < 10; i++) { + // Test fails for only 1 thread per block + test_cg_grid_group_type(kernel_func, max(2, rand() % max_threads_per_blk)); + } +} + +TEST_CASE("Unit_hipCGGridGroupType_DataSharing") { + const auto device = GENERATE(range(0, HipTest::getDeviceCount())); + HIP_CHECK(hipSetDevice(device)); + + hipDeviceProp_t device_properties; + + HIP_CHECK(hipGetDeviceProperties(&device_properties, device)); + + if (!device_properties.cooperativeLaunch) { + HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!"); + return; + } + + int loops = GENERATE(1, 2, 3, 4); + int width = GENERATE(512, 1024, 2048, 4096); + + // Launch enough waves to fill up all of the GPU + int warp_size = device_properties.warpSize; + int num_sms = device_properties.multiProcessorCount; + + // Calculate the device occupancy to know how many blocks can be run. 
+ int max_blocks_per_sm; + HIP_CHECK( + hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, coop_kernel, warp_size, 0)); + + int num_blocks = max_blocks_per_sm * num_sms; + + // Create Streams + hipStream_t stream; + HIP_CHECK(hipStreamCreate(&stream)); + + // Allocate and initialize data + + // Alocate the host input buffer, and two device buffers + unsigned int* input_buffer = + reinterpret_cast(malloc(sizeof(unsigned int) * width)); + for (int i = 0; i < width; i++) { + input_buffer[i] = i; + } + + unsigned int *dev_mem_1, *host_mem_1; + host_mem_1 = reinterpret_cast(malloc(sizeof(unsigned int) * width)); + HIP_CHECK(hipMalloc(&dev_mem_1, sizeof(unsigned int) * width)); + HIP_CHECK(hipMemcpyAsync(dev_mem_1, input_buffer, sizeof(unsigned int) * width, + hipMemcpyHostToDevice, stream)); + + unsigned int *dev_mem_2, *host_mem_2; + host_mem_2 = reinterpret_cast(malloc(sizeof(unsigned int) * width)); + HIP_CHECK(hipMalloc(&dev_mem_2, sizeof(unsigned int) * width)); + HIP_CHECK(hipMemsetAsync(dev_mem_2, 0, width * sizeof(unsigned int), stream)); + + // Launch the kernels + INFO("Launching a cooperative kernel with " << num_blocks << " blocks, each with " << warp_size + << " threads"); + + void* coop_params[4]; + coop_params[0] = reinterpret_cast(&dev_mem_1); + coop_params[1] = reinterpret_cast(&dev_mem_2); + coop_params[2] = reinterpret_cast(&loops); + coop_params[3] = reinterpret_cast(&width); + HIP_CHECK(hipLaunchCooperativeKernel(coop_kernel, num_blocks, warp_size, coop_params, 0, stream)); + + // Read back the buffers and print out their data + HIP_CHECK(hipMemcpyAsync(host_mem_1, dev_mem_1, sizeof(unsigned int) * width, + hipMemcpyDeviceToHost, stream)); + HIP_CHECK(hipMemcpyAsync(host_mem_2, dev_mem_2, sizeof(unsigned int) * width, + hipMemcpyDeviceToHost, stream)); + + HIP_CHECK(hipStreamSynchronize(stream)); + + verify_coop_buffers(input_buffer, host_mem_1, host_mem_2, loops, width); + + HIP_CHECK(hipStreamDestroy(stream)); + 
HIP_CHECK(hipFree(dev_mem_1)); + HIP_CHECK(hipFree(dev_mem_2)); + free(input_buffer); + free(host_mem_1); + free(host_mem_2); +} + +TEST_CASE("Unit_hipCGGridGroupType_Barrier") { + const auto device = GENERATE(range(0, HipTest::getDeviceCount())); + HIP_CHECK(hipSetDevice(device)); + + hipDeviceProp_t device_properties; + + HIP_CHECK(hipGetDeviceProperties(&device_properties, device)); + + if (!device_properties.cooperativeLaunch) { + HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!"); + return; + } + + uint32_t loops = GENERATE(1, 2, 3, 4); + uint32_t warps = GENERATE(4, 8, 16, 32); + uint32_t block_size = 1; + + // Test whether the requested size will fit on the GPU + int max_blocks_per_sm; + int warp_size = device_properties.warpSize; + int num_sms = device_properties.multiProcessorCount; + + int num_threads_in_block = block_size * warp_size; + + auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; + // Calculate the device occupancy to know how many blocks can be run. + HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, test_kernel_used, + num_threads_in_block, 0)); + + int requested_blocks = warps / block_size; + if (requested_blocks > max_blocks_per_sm * num_sms) { + INFO("Too many blocks requested!"); + REQUIRE(false); + } + + // Each block will output a single value per loop. 
+ uint32_t total_buffer_len = requested_blocks * loops; + + // Alocate the buffer that will hold the kernel's output, and which will + // also be used to globally synchronize during GWS initialization + unsigned int* host_buffer = + reinterpret_cast(calloc(total_buffer_len, sizeof(unsigned int))); + + unsigned int* kernel_buffer; + HIP_CHECK(hipMalloc(&kernel_buffer, sizeof(unsigned int) * total_buffer_len)); + HIP_CHECK(hipMemcpy(kernel_buffer, host_buffer, sizeof(unsigned int) * total_buffer_len, + hipMemcpyHostToDevice)); + + unsigned int* kernel_atomic; + HIP_CHECK(hipMalloc(&kernel_atomic, sizeof(unsigned int))); + HIP_CHECK(hipMemset(kernel_atomic, 0, sizeof(unsigned int))); + + // Launch the kernel + INFO("Launching a cooperative kernel with " << warps << " warps in " << requested_blocks + << " thread blocks"); + + void* params[3]; + params[0] = reinterpret_cast(&kernel_atomic); + params[1] = reinterpret_cast(&kernel_buffer); + params[2] = reinterpret_cast(&loops); + HIP_CHECK(hipLaunchCooperativeKernel(test_kernel_used, requested_blocks, num_threads_in_block, + params, 0, 0)); + + // Read back the buffer to host + HIP_CHECK(hipMemcpy(host_buffer, kernel_buffer, sizeof(unsigned int) * total_buffer_len, + hipMemcpyDeviceToHost)); + + verify_barrier_buffer(loops, requested_blocks, host_buffer); + + HIP_CHECK(hipFree(kernel_buffer)); + HIP_CHECK(hipFree(kernel_atomic)); + free(host_buffer); +} diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/hipCGMultiGridGroupType.cc b/projects/hip-tests/catch/unit/cooperativeGrps/hipCGMultiGridGroupType.cc deleted file mode 100644 index 1dd2a8f3b5..0000000000 --- a/projects/hip-tests/catch/unit/cooperativeGrps/hipCGMultiGridGroupType.cc +++ /dev/null @@ -1,240 +0,0 @@ -/* -Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved. 
- -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
-*/ - - -/* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80 - * TEST: %t - * HIT_END - */ - -#include -#include - -#define ASSERT_EQUAL(lhs, rhs) HIPASSERT(lhs == rhs) -#define ASSERT_LE(lhs, rhs) HIPASSERT(lhs <= rhs) -#define ASSERT_GE(lhs, rhs) HIPASSERT(lhs >= rhs) - -using namespace cooperative_groups; -constexpr int MaxGPUs = 8; - -static __global__ -void kernel_cg_multi_grid_group_type(int* numGridsTestD, - int* gridRankTestD, - int *sizeTestD, - int *thdRankTestD, - int *isValidTestD, - int *syncTestD, - int *syncResultD) -{ - multi_grid_group mg = this_multi_grid(); - int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x; - - // Test num_grids - numGridsTestD[gIdx] = mg.num_grids(); - - // Test grid_rank - gridRankTestD[gIdx] = mg.grid_rank(); - - // Test size - sizeTestD[gIdx] = mg.size(); - - // Test thread_rank - thdRankTestD[gIdx] = mg.thread_rank(); - - // Test is_valid - isValidTestD[gIdx] = mg.is_valid(); - - // Test sync - // - // Eech thread assign 1 to their respective location - syncTestD[gIdx] = 1; - // Grid level sync - this_grid().sync(); - // Thread 0 from work-group 0 of current grid (gpu) does grid level reduction - if (blockIdx.x == 0 && threadIdx.x == 0) { - for (uint i = 1; i < gridDim.x * blockDim.x; ++i) { - syncTestD[0] += syncTestD[i]; - } - syncResultD[mg.grid_rank() + 1] = syncTestD[0]; - } - // multi-grid level sync - mg.sync(); - // grid (gpu) 0 does final reduction across all grids (gpus) - if (mg.grid_rank() == 0 && blockIdx.x == 0 && threadIdx.x == 0) { - syncResultD[0] = 0; - for (uint i = 1; i <= mg.num_grids(); ++i) { - syncResultD[0] += syncResultD[i]; - } - } -} - -static void test_cg_multi_grid_group_type(int blockSize, int nGpu) -{ - // Create a stream each device - hipStream_t stream[MaxGPUs]; - for (int i = 0; i < nGpu; i++) { - HIPCHECK(hipSetDevice(i)); - 
HIPCHECK(hipDeviceSynchronize()); // Make sure work is done on this device - HIPCHECK(hipStreamCreate(&stream[i])); - } - - // Allocate host and device memory - int nBytes = sizeof(int) * 2 * blockSize; - int *numGridsTestD[MaxGPUs], *numGridsTestH[MaxGPUs]; - int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs]; - int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs]; - int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs]; - int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs]; - int *syncTestD[MaxGPUs], *syncResultD; - for (int i = 0; i < nGpu; i++) { - HIPCHECK(hipSetDevice(i)); - - HIPCHECK(hipMalloc(&numGridsTestD[i], nBytes)); - HIPCHECK(hipMalloc(&gridRankTestD[i], nBytes)); - HIPCHECK(hipMalloc(&sizeTestD[i], nBytes)); - HIPCHECK(hipMalloc(&thdRankTestD[i], nBytes)); - HIPCHECK(hipMalloc(&isValidTestD[i], nBytes)); - HIPCHECK(hipMalloc(&syncTestD[i], nBytes)); - - HIPCHECK(hipHostMalloc(&numGridsTestH[i], nBytes)); - HIPCHECK(hipHostMalloc(&gridRankTestH[i], nBytes)); - HIPCHECK(hipHostMalloc(&sizeTestH[i], nBytes)); - HIPCHECK(hipHostMalloc(&thdRankTestH[i], nBytes)); - HIPCHECK(hipHostMalloc(&isValidTestH[i], nBytes)); - - if (i == 0) { - HIPCHECK(hipHostMalloc(&syncResultD, sizeof(int) * (nGpu + 1), hipHostMallocCoherent)); - } - } - - // Launch Kernel - constexpr int NumKernelArgs = 7; - hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu]; - void* args[MaxGPUs * NumKernelArgs]; - for (int i = 0; i < nGpu; i++) { - HIPCHECK(hipSetDevice(i)); - - args[i * NumKernelArgs] = &numGridsTestD[i]; - args[i * NumKernelArgs + 1] = &gridRankTestD[i]; - args[i * NumKernelArgs + 2] = &sizeTestD[i]; - args[i * NumKernelArgs + 3] = &thdRankTestD[i]; - args[i * NumKernelArgs + 4] = &isValidTestD[i]; - args[i * NumKernelArgs + 5] = &syncTestD[i]; - args[i * NumKernelArgs + 6] = &syncResultD; - - launchParamsList[i].func = reinterpret_cast(kernel_cg_multi_grid_group_type); - launchParamsList[i].gridDim = 2; - launchParamsList[i].blockDim = blockSize; - 
launchParamsList[i].sharedMem = 0; - launchParamsList[i].stream = stream[i]; - launchParamsList[i].args = &args[i * NumKernelArgs]; - } - HIPCHECK(hipLaunchCooperativeKernelMultiDevice(launchParamsList, nGpu, 0)); - - // Copy result from device to host - for (int i = 0; i < nGpu; i++) { - HIPCHECK(hipSetDevice(i)); - HIPCHECK(hipMemcpy(numGridsTestH[i], numGridsTestD[i], nBytes, hipMemcpyDeviceToHost)); - HIPCHECK(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost)); - HIPCHECK(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost)); - HIPCHECK(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost)); - HIPCHECK(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost)); - } - - // Validate results - int gridsSeen[MaxGPUs]; - for (int i = 0; i < nGpu; ++i) { - for (int j = 0; j < 2 * blockSize; ++j) { - ASSERT_EQUAL(numGridsTestH[i][j], nGpu); - ASSERT_GE(gridRankTestH[i][j], 0); - ASSERT_LE(gridRankTestH[i][j], nGpu-1); - ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]); - ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize); - int gridRank = gridRankTestH[i][j]; - ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j); - ASSERT_EQUAL(isValidTestH[i][j], 1); - } - ASSERT_EQUAL(syncResultD[i+1], 2 * blockSize); - - // Validate uniqueness property of grid rank - gridsSeen[i] = gridRankTestH[i][0]; - for (int k = 0; k < i; ++k) { - if (gridsSeen[k] == gridsSeen[i]) { - assert(false && "Grid rank in multi-gpu setup should be unique"); - } - } - } - ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize); - - // Free host and device memory - delete [] launchParamsList; - for (int i = 0; i < nGpu; i++) { - HIPCHECK(hipSetDevice(i)); - - HIPCHECK(hipFree(numGridsTestD[i])); - HIPCHECK(hipFree(gridRankTestD[i])); - HIPCHECK(hipFree(sizeTestD[i])); - HIPCHECK(hipFree(thdRankTestD[i])); - HIPCHECK(hipFree(isValidTestD[i])); - HIPCHECK(hipFree(syncTestD[i])); - - if (i == 0) { - 
HIPCHECK(hipHostFree(syncResultD)); - } - HIPCHECK(hipHostFree(numGridsTestH[i])); - HIPCHECK(hipHostFree(gridRankTestH[i])); - HIPCHECK(hipHostFree(sizeTestH[i])); - HIPCHECK(hipHostFree(thdRankTestH[i])); - HIPCHECK(hipHostFree(isValidTestH[i])); - } -} - -TEST_CASE("Unit_hipCGMultiGridGroupType") { - int nGpu = 0; - HIPCHECK(hipGetDeviceCount(&nGpu)); - nGpu = min(nGpu, MaxGPUs); - - // Set `maxThreadsPerBlock` by taking minimum among all available devices - int maxThreadsPerBlock = INT_MAX; - hipDeviceProp_t deviceProperties; - for (int i = 0; i < nGpu; i++) { - HIPCHECK(hipGetDeviceProperties(&deviceProperties, i)); - if (!deviceProperties.cooperativeMultiDeviceLaunch) { - HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!"); - return; - } - maxThreadsPerBlock = min(maxThreadsPerBlock, deviceProperties.maxThreadsPerBlock); - } - - // Test for blockSizes in powers of 2 - for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) { - test_cg_multi_grid_group_type(blockSize, nGpu); - } - - // Test for random blockSizes, but the sequence is the same every execution - srand(0); - for (int i = 0; i < 10; i++) { - // Test fails for 0 thread per block - test_cg_multi_grid_group_type(max(2, rand() % maxThreadsPerBlock), nGpu); - } -} diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/hipCGMultiGridGroupTypeViaBaseType.cc b/projects/hip-tests/catch/unit/cooperativeGrps/hipCGMultiGridGroupTypeViaBaseType.cc deleted file mode 100644 index 408f3b0075..0000000000 --- a/projects/hip-tests/catch/unit/cooperativeGrps/hipCGMultiGridGroupTypeViaBaseType.cc +++ /dev/null @@ -1,234 +0,0 @@ -/* -Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved. 
- -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
-*/ - - -/* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -D_CG_ABI_EXPERIMENTAL -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80 - * TEST: %t - * HIT_END - */ - -#include -#include -#include -#include -#include - -#define ASSERT_EQUAL(lhs, rhs) HIPASSERT(lhs == rhs) -#define ASSERT_LE(lhs, rhs) HIPASSERT(lhs <= rhs) -#define ASSERT_GE(lhs, rhs) HIPASSERT(lhs >= rhs) - -using namespace cooperative_groups; -constexpr int MaxGPUs = 8; - -static __global__ -void kernel_cg_multi_grid_group_type_via_base_type(int *sizeTestD, - int* gridRankTestD, - int *thdRankTestD, - int *isValidTestD, - int *syncTestD, - int *syncResultD) -{ - thread_group tg = this_multi_grid(); // This can work if _CG_ABI_EXPERIMENTAL defined on Cuda - - int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x; - - // Test size - sizeTestD[gIdx] = tg.size(); - - // Test thread_rank - gridRankTestD[gIdx] = this_multi_grid().grid_rank(); - thdRankTestD[gIdx] = tg.thread_rank(); - - // Test is_valid -#ifdef __HIP_PLATFORM_AMD__ - isValidTestD[gIdx] = tg.is_valid(); -#else - // Cuda has no thread_group.is_valid() - isValidTestD[gIdx] = true; -#endif - // Test sync - // - // Eech thread assign 1 to their respective location - syncTestD[gIdx] = 1; - // Grid level sync - this_grid().sync(); - // Thread 0 from work-group 0 of current grid (gpu) does grid level reduction - if (blockIdx.x == 0 && threadIdx.x == 0) { - for (uint i = 1; i < gridDim.x * blockDim.x; ++i) { - syncTestD[0] += syncTestD[i]; - } - syncResultD[this_multi_grid().grid_rank() + 1] = syncTestD[0]; - } - // multi-grid level sync - tg.sync(); - // grid (gpu) 0 does final reduction across all grids (gpus) - if (this_multi_grid().grid_rank() == 0 && blockIdx.x == 0 && threadIdx.x == 0) { - syncResultD[0] = 0; - for (uint i = 1; i <= this_multi_grid().num_grids(); ++i) { - syncResultD[0] += syncResultD[i]; - } - } -} - -static void 
test_cg_multi_grid_group_type_via_base_type(int blockSize, int nGpu) -{ - // Create a stream each device - hipStream_t stream[MaxGPUs]; - for (int i = 0; i < nGpu; i++) { - HIPCHECK(hipSetDevice(i)); - HIPCHECK(hipDeviceSynchronize()); // Make sure work is done on this device - HIPCHECK(hipStreamCreate(&stream[i])); - } - - // Allocate host and device memory - int nBytes = sizeof(int) * 2 * blockSize; - int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs]; - int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs]; - int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs]; - int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs]; - int *syncTestD[MaxGPUs], *syncResultD; - for (int i = 0; i < nGpu; i++) { - HIPCHECK(hipSetDevice(i)); - - HIPCHECK(hipMalloc(&sizeTestD[i], nBytes)); - HIPCHECK(hipMalloc(&gridRankTestD[i], nBytes)); - HIPCHECK(hipMalloc(&thdRankTestD[i], nBytes)); - HIPCHECK(hipMalloc(&isValidTestD[i], nBytes)); - HIPCHECK(hipMalloc(&syncTestD[i], nBytes)); - - HIPCHECK(hipHostMalloc(&sizeTestH[i], nBytes)); - HIPCHECK(hipHostMalloc(&gridRankTestH[i], nBytes)); - HIPCHECK(hipHostMalloc(&thdRankTestH[i], nBytes)); - HIPCHECK(hipHostMalloc(&isValidTestH[i], nBytes)); - - if (i == 0) { - HIPCHECK(hipHostMalloc(&syncResultD, sizeof(int) * (nGpu + 1), hipHostMallocCoherent)); - } - } - - // Launch Kernel - constexpr int NumKernelArgs = 6; - hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu]; - void* args[MaxGPUs * NumKernelArgs]; - for (int i = 0; i < nGpu; i++) { - HIPCHECK(hipSetDevice(i)); - - args[i * NumKernelArgs ] = &sizeTestD[i]; - args[i * NumKernelArgs + 1] = &gridRankTestD[i]; - args[i * NumKernelArgs + 2] = &thdRankTestD[i]; - args[i * NumKernelArgs + 3] = &isValidTestD[i]; - args[i * NumKernelArgs + 4] = &syncTestD[i]; - args[i * NumKernelArgs + 5] = &syncResultD; - - launchParamsList[i].func = reinterpret_cast(kernel_cg_multi_grid_group_type_via_base_type); - launchParamsList[i].gridDim = 2; - launchParamsList[i].blockDim = blockSize; - 
launchParamsList[i].sharedMem = 0; - launchParamsList[i].stream = stream[i]; - launchParamsList[i].args = &args[i * NumKernelArgs]; - } - HIPCHECK(hipLaunchCooperativeKernelMultiDevice(launchParamsList, nGpu, 0)); - - // Copy result from device to host - for (int i = 0; i < nGpu; i++) { - HIPCHECK(hipSetDevice(i)); - HIPCHECK(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost)); - HIPCHECK(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost)); - HIPCHECK(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost)); - HIPCHECK(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost)); - } - - // Validate results - int gridsSeen[MaxGPUs]; - for (int i = 0; i < nGpu; ++i) { - for (int j = 0; j < 2 * blockSize; ++j) { - ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize); - ASSERT_GE(gridRankTestH[i][j], 0); - ASSERT_LE(gridRankTestH[i][j], nGpu-1); - ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]); - int gridRank = gridRankTestH[i][j]; - ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j); - ASSERT_EQUAL(isValidTestH[i][j], 1); - } - ASSERT_EQUAL(syncResultD[i+1], 2 * blockSize); - - // Validate uniqueness property of grid rank - gridsSeen[i] = gridRankTestH[i][0]; - for (int k = 0; k < i; ++k) { - if (gridsSeen[k] == gridsSeen[i]) { - assert (false && "Grid rank in multi-gpu setup should be unique"); - } - } - } - ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize); - - // Free host and device memory - delete [] launchParamsList; - for (int i = 0; i < nGpu; i++) { - HIPCHECK(hipSetDevice(i)); - - HIPCHECK(hipFree(sizeTestD[i])); - HIPCHECK(hipFree(gridRankTestD[i])); - HIPCHECK(hipFree(thdRankTestD[i])); - HIPCHECK(hipFree(isValidTestD[i])); - HIPCHECK(hipFree(syncTestD[i])); - - if (i == 0) - HIPCHECK(hipHostFree(syncResultD)); - - HIPCHECK(hipHostFree(sizeTestH[i])); - HIPCHECK(hipHostFree(gridRankTestH[i])); - HIPCHECK(hipHostFree(thdRankTestH[i])); - 
HIPCHECK(hipHostFree(isValidTestH[i])); - } -} - -TEST_CASE("Unit_hipCGMultiGridGroupType_BaseType") { - // Set `maxThreadsPerBlock` by taking minimum among all available devices - int nGpu = 0; - HIPCHECK(hipGetDeviceCount(&nGpu)); - nGpu = min(nGpu, MaxGPUs); - - int maxThreadsPerBlock = INT_MAX; - hipDeviceProp_t deviceProperties; - for (int i = 0; i < nGpu; i++) { - HIPCHECK(hipGetDeviceProperties(&deviceProperties, i)); - if (!deviceProperties.cooperativeMultiDeviceLaunch) { - HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!"); - return; - } - maxThreadsPerBlock = min(maxThreadsPerBlock, deviceProperties.maxThreadsPerBlock); - } - - // Test for blockSizes in powers of 2 - for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) { - test_cg_multi_grid_group_type_via_base_type(blockSize, nGpu); - } - - // Test for random blockSizes, but the sequence is the same every execution - srand(0); - for (int i = 0; i < 10; i++) { - // Test fails for 0 thread per block - test_cg_multi_grid_group_type_via_base_type(max(2, rand() % maxThreadsPerBlock), nGpu); - } -} diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/hipCGMultiGridGroupTypeViaPublicApi.cc b/projects/hip-tests/catch/unit/cooperativeGrps/hipCGMultiGridGroupTypeViaPublicApi.cc deleted file mode 100644 index 3e5b97fe5a..0000000000 --- a/projects/hip-tests/catch/unit/cooperativeGrps/hipCGMultiGridGroupTypeViaPublicApi.cc +++ /dev/null @@ -1,230 +0,0 @@ -/* -Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved. 
- -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
-*/ - - -/* HIT_START - * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 -rdc=true -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80 - * TEST: %t - * HIT_END - */ - -#include -#include -#include -#include -#include - -#define ASSERT_EQUAL(lhs, rhs) HIPASSERT(lhs == rhs) -#define ASSERT_LE(lhs, rhs) HIPASSERT(lhs <= rhs) -#define ASSERT_GE(lhs, rhs) HIPASSERT(lhs >= rhs) - -using namespace cooperative_groups; -constexpr int MaxGPUs = 8; - -static __global__ -void kernel_cg_multi_grid_group_type_via_public_api(int *sizeTestD, - int* gridRankTestD, - int *thdRankTestD, - int *isValidTestD, - int *syncTestD, - int *syncResultD) -{ - multi_grid_group mg = this_multi_grid(); - int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x; - - // Test group_size api - sizeTestD[gIdx] = group_size(mg); - - // Test thread_rank api - gridRankTestD[gIdx] = this_multi_grid().grid_rank(); - thdRankTestD[gIdx] = thread_rank(mg); - - // Test is_valid api - isValidTestD[gIdx] = mg.is_valid(); - - // Test sync api - // - // Eech thread assign 1 to their respective location - syncTestD[gIdx] = 1; - // Grid level sync - sync(this_grid()); - // Thread 0 from work-group 0 of current grid (gpu) does grid level reduction - if (blockIdx.x == 0 && threadIdx.x == 0) { - for (uint i = 1; i < gridDim.x * blockDim.x; ++i) { - syncTestD[0] += syncTestD[i]; - } - syncResultD[this_multi_grid().grid_rank() + 1] = syncTestD[0]; - } - // multi-grid level sync via public api - sync(mg); - // grid (gpu) 0 does final reduction across all grids (gpus) - if (this_multi_grid().grid_rank() == 0 && blockIdx.x == 0 && threadIdx.x == 0) { - syncResultD[0] = 0; - for (uint i = 1; i <= this_multi_grid().num_grids(); ++i) { - syncResultD[0] += syncResultD[i]; - } - } -} - -static void test_cg_multi_grid_group_type_via_public_api(int blockSize, int nGpu) -{ - // Create a stream each device - hipStream_t stream[MaxGPUs]; - for (int i = 0; i < nGpu; i++) { 
- HIPCHECK(hipSetDevice(i)); - HIPCHECK(hipDeviceSynchronize()); // Make sure work is done on this device - HIPCHECK(hipStreamCreate(&stream[i])); - } - - // Allocate host and device memory - int nBytes = sizeof(int) * 2 * blockSize; - int *sizeTestD[MaxGPUs], *sizeTestH[MaxGPUs]; - int *gridRankTestD[MaxGPUs], *gridRankTestH[MaxGPUs]; - int *thdRankTestD[MaxGPUs], *thdRankTestH[MaxGPUs]; - int *isValidTestD[MaxGPUs], *isValidTestH[MaxGPUs]; - int *syncTestD[MaxGPUs], *syncResultD; - for (int i = 0; i < nGpu; i++) { - HIPCHECK(hipSetDevice(i)); - - HIPCHECK(hipMalloc(&sizeTestD[i], nBytes)); - HIPCHECK(hipMalloc(&gridRankTestD[i], nBytes)); - HIPCHECK(hipMalloc(&thdRankTestD[i], nBytes)); - HIPCHECK(hipMalloc(&isValidTestD[i], nBytes)); - HIPCHECK(hipMalloc(&syncTestD[i], nBytes)); - - HIPCHECK(hipHostMalloc(&sizeTestH[i], nBytes)); - HIPCHECK(hipHostMalloc(&gridRankTestH[i], nBytes)); - HIPCHECK(hipHostMalloc(&thdRankTestH[i], nBytes)); - HIPCHECK(hipHostMalloc(&isValidTestH[i], nBytes)); - - if (i == 0) { - HIPCHECK(hipHostMalloc(&syncResultD, sizeof(int) * (nGpu + 1), hipHostMallocCoherent)); - } - } - - // Launch Kernel - constexpr int NumKernelArgs = 6; - hipLaunchParams* launchParamsList = new hipLaunchParams[nGpu]; - void* args[MaxGPUs * NumKernelArgs]; - for (int i = 0; i < nGpu; i++) { - HIPCHECK(hipSetDevice(i)); - - args[i * NumKernelArgs ] = &sizeTestD[i]; - args[i * NumKernelArgs + 1] = &gridRankTestD[i]; - args[i * NumKernelArgs + 2] = &thdRankTestD[i]; - args[i * NumKernelArgs + 3] = &isValidTestD[i]; - args[i * NumKernelArgs + 4] = &syncTestD[i]; - args[i * NumKernelArgs + 5] = &syncResultD; - - launchParamsList[i].func = reinterpret_cast(kernel_cg_multi_grid_group_type_via_public_api); - launchParamsList[i].gridDim = 2; - launchParamsList[i].blockDim = blockSize; - launchParamsList[i].sharedMem = 0; - launchParamsList[i].stream = stream[i]; - launchParamsList[i].args = &args[i * NumKernelArgs]; - } - 
HIPCHECK(hipLaunchCooperativeKernelMultiDevice(launchParamsList, nGpu, 0)); - - // Copy result from device to host - for (int i = 0; i < nGpu; i++) { - HIPCHECK(hipSetDevice(i)); - - HIPCHECK(hipMemcpy(sizeTestH[i], sizeTestD[i], nBytes, hipMemcpyDeviceToHost)); - HIPCHECK(hipMemcpy(gridRankTestH[i], gridRankTestD[i], nBytes, hipMemcpyDeviceToHost)); - HIPCHECK(hipMemcpy(thdRankTestH[i], thdRankTestD[i], nBytes, hipMemcpyDeviceToHost)); - HIPCHECK(hipMemcpy(isValidTestH[i], isValidTestD[i], nBytes, hipMemcpyDeviceToHost)); - } - - // Validate results - int gridsSeen[MaxGPUs]; - for (int i = 0; i < nGpu; ++i) { - for (int j = 0; j < 2 * blockSize; ++j) { - ASSERT_EQUAL(sizeTestH[i][j], nGpu * 2 * blockSize); - ASSERT_GE(gridRankTestH[i][j], 0); - ASSERT_LE(gridRankTestH[i][j], nGpu-1); - ASSERT_EQUAL(gridRankTestH[i][j], gridRankTestH[i][0]); - int gridRank = gridRankTestH[i][j]; - ASSERT_EQUAL(thdRankTestH[i][j], (gridRank * 2 * blockSize) + j); - ASSERT_EQUAL(isValidTestH[i][j], 1); - } - ASSERT_EQUAL(syncResultD[i+1], 2 * blockSize); - - // Validate uniqueness property of grid rank - gridsSeen[i] = gridRankTestH[i][0]; - for (int k = 0; k < i; ++k) { - if (gridsSeen[k] == gridsSeen[i]) { - assert (false && "Grid rank in multi-gpu setup should be unique"); - } - } - } - ASSERT_EQUAL(syncResultD[0], nGpu * 2 * blockSize); - - // Free host and device memory - delete [] launchParamsList; - for (int i = 0; i < nGpu; i++) { - HIPCHECK(hipSetDevice(i)); - - HIPCHECK(hipFree(sizeTestD[i])); - HIPCHECK(hipFree(gridRankTestD[i])); - HIPCHECK(hipFree(thdRankTestD[i])); - HIPCHECK(hipFree(isValidTestD[i])); - HIPCHECK(hipFree(syncTestD[i])); - - if (i == 0) - HIPCHECK(hipHostFree(syncResultD)); - - HIPCHECK(hipHostFree(sizeTestH[i])); - HIPCHECK(hipHostFree(gridRankTestH[i])); - HIPCHECK(hipHostFree(thdRankTestH[i])); - HIPCHECK(hipHostFree(isValidTestH[i])); - } -} - -TEST_CASE("Unit_hipCGMultiGridGroupType_PublicApi") { - // Set `maxThreadsPerBlock` by taking minimum among 
all available devices - int nGpu = 0; - HIPCHECK(hipGetDeviceCount(&nGpu)); - nGpu = min(nGpu, MaxGPUs); - - int maxThreadsPerBlock = INT_MAX; - hipDeviceProp_t deviceProperties; - for (int i = 0; i < nGpu; i++) { - HIPCHECK(hipGetDeviceProperties(&deviceProperties, i)); - if (!deviceProperties.cooperativeMultiDeviceLaunch) { - HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!"); - return; - } - maxThreadsPerBlock = min(maxThreadsPerBlock, deviceProperties.maxThreadsPerBlock); - } - - // Test for blockSizes in powers of 2 - for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) { - test_cg_multi_grid_group_type_via_public_api(blockSize, nGpu); - } - - // Test for random blockSizes, but the sequence is the same every execution - srand(0); - for (int i = 0; i < 10; i++) { - // Test fails for 0 thread per block - test_cg_multi_grid_group_type_via_public_api(max(2, rand() % maxThreadsPerBlock), nGpu); - } -} diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/hipCGMultiGridGroupType_old.cc b/projects/hip-tests/catch/unit/cooperativeGrps/hipCGMultiGridGroupType_old.cc new file mode 100644 index 0000000000..f5a8a14bc7 --- /dev/null +++ b/projects/hip-tests/catch/unit/cooperativeGrps/hipCGMultiGridGroupType_old.cc @@ -0,0 +1,638 @@ +/* +Copyright (c) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#include +#include + +#include "hip_cg_common.hh" + +namespace cg = cooperative_groups; + +static __global__ void kernel_cg_multi_grid_group_type(int* grid_rank_dev, int* size_dev, + int* thd_rank_dev, int* is_valid_dev, + int* sync_dev, int* sync_result, + int* num_grids_dev) { + cg::multi_grid_group mg = cg::this_multi_grid(); + int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x; + + // Test num_grids + num_grids_dev[gIdx] = mg.num_grids(); + + // Test grid_rank + grid_rank_dev[gIdx] = mg.grid_rank(); + + // Test size + size_dev[gIdx] = mg.size(); + + // Test thread_rank + thd_rank_dev[gIdx] = mg.thread_rank(); + + // Test is_valid + is_valid_dev[gIdx] = mg.is_valid(); + + // Test sync + // + // Eech thread assign 1 to their respective location + sync_dev[gIdx] = 1; + // Grid level sync + cg::this_grid().sync(); + // Thread 0 from work-group 0 of current grid (gpu) does grid level reduction + if (blockIdx.x == 0 && threadIdx.x == 0) { + for (uint i = 1; i < gridDim.x * blockDim.x; ++i) { + sync_dev[0] += sync_dev[i]; + } + sync_result[mg.grid_rank() + 1] = sync_dev[0]; + } + // multi-grid level sync + mg.sync(); + // grid (gpu) 0 does final reduction across all grids (gpus) + if (mg.grid_rank() == 0 && blockIdx.x == 0 && threadIdx.x == 0) { + sync_result[0] = 0; + for (uint i = 1; i <= mg.num_grids(); ++i) { + sync_result[0] += sync_result[i]; + } + } +} + +static __global__ void kernel_cg_multi_grid_group_type_via_base_type( + int* grid_rank_dev, int* size_dev, int* thd_rank_dev, int* 
is_valid_dev, int* sync_dev, + int* sync_result) { + cg::thread_group tg = + cg::this_multi_grid(); // This can work if _CG_ABI_EXPERIMENTAL defined on Cuda + + int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x; + + // Test size + size_dev[gIdx] = tg.size(); + + // Test thread_rank + grid_rank_dev[gIdx] = cg::this_multi_grid().grid_rank(); + thd_rank_dev[gIdx] = tg.thread_rank(); + + // Test is_valid +#ifdef __HIP_PLATFORM_AMD__ + is_valid_dev[gIdx] = tg.is_valid(); +#else + // Cuda has no thread_group.is_valid() + is_valid_dev[gIdx] = true; +#endif + // Test sync + // + // Eech thread assign 1 to their respective location + sync_dev[gIdx] = 1; + // Grid level sync + cg::this_grid().sync(); + // Thread 0 from work-group 0 of current grid (gpu) does grid level reduction + if (blockIdx.x == 0 && threadIdx.x == 0) { + for (uint i = 1; i < gridDim.x * blockDim.x; ++i) { + sync_dev[0] += sync_dev[i]; + } + sync_result[cg::this_multi_grid().grid_rank() + 1] = sync_dev[0]; + } + // multi-grid level sync + tg.sync(); + // grid (gpu) 0 does final reduction across all grids (gpus) + if (cg::this_multi_grid().grid_rank() == 0 && blockIdx.x == 0 && threadIdx.x == 0) { + sync_result[0] = 0; + for (uint i = 1; i <= cg::this_multi_grid().num_grids(); ++i) { + sync_result[0] += sync_result[i]; + } + } +} + +static __global__ void kernel_cg_multi_grid_group_type_via_public_api( + int* grid_rank_dev, int* size_dev, int* thd_rank_dev, int* is_valid_dev, int* sync_dev, + int* sync_result) { + cg::multi_grid_group mg = cg::this_multi_grid(); + int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x; + + // Test group_size api + size_dev[gIdx] = cg::group_size(mg); + + // Test thread_rank api + grid_rank_dev[gIdx] = cg::this_multi_grid().grid_rank(); + thd_rank_dev[gIdx] = cg::thread_rank(mg); + + // Test is_valid api + is_valid_dev[gIdx] = mg.is_valid(); + + // Test sync api + // + // Eech thread assign 1 to their respective location + sync_dev[gIdx] = 1; + // Grid level sync + 
cg::sync(cg::this_grid()); + // Thread 0 from work-group 0 of current grid (gpu) does grid level reduction + if (blockIdx.x == 0 && threadIdx.x == 0) { + for (uint i = 1; i < gridDim.x * blockDim.x; ++i) { + sync_dev[0] += sync_dev[i]; + } + sync_result[cg::this_multi_grid().grid_rank() + 1] = sync_dev[0]; + } + // multi-grid level sync via public api + cg::sync(mg); + // grid (gpu) 0 does final reduction across all grids (gpus) + if (cg::this_multi_grid().grid_rank() == 0 && blockIdx.x == 0 && threadIdx.x == 0) { + sync_result[0] = 0; + for (uint i = 1; i <= cg::this_multi_grid().num_grids(); ++i) { + sync_result[0] += sync_result[i]; + } + } +} + +static __global__ void test_kernel(unsigned int* atomic_val, unsigned int* global_array, + unsigned int* array, uint32_t loops) { + cg::grid_group grid = cg::this_grid(); + cg::multi_grid_group mgrid = cg::this_multi_grid(); + unsigned rank = grid.thread_rank(); + unsigned global_rank = mgrid.thread_rank(); + + int offset = blockIdx.x; + for (int i = 0; i < loops; i++) { + // Make the last thread run way behind everyone else. + // If the grid barrier below fails, then the other threads may hit the + // atomicInc instruction many times before the last thread ever gets + // to it. + // As such, without the barrier, the last array entry will eventually + // contain a very large value, defined by however many times the other + // wavefronts make it through this loop. + // If the barrier works, then it will likely contain some number + // near "total number of blocks". It will be the last wavefront to + // reach the atomicInc, but everyone will have only hit the atomic once. + if (rank == (grid.size() - 1)) { + long long time_diff = 0; + long long last_clock = clock64(); + do { + long long cur_clock = clock64(); + if (cur_clock > last_clock) { + time_diff += (cur_clock - last_clock); + } + // If it rolls over, we don't know how much to add to catch up. + // So just ignore those slipped cycles. 
+ last_clock = cur_clock; + } while (time_diff < 1000000); + } + if (threadIdx.x == 0) { + array[offset] = atomicInc(atomic_val, UINT_MAX); + } + grid.sync(); + + // Make the last thread in the entire multi-grid run way behind + // everyone else. + // If the mgrid barrier below fails, then the two global_array entries + // will end up being out of sync, because the intermingling of adds + // and multiplies will not be aligned between to the two GPUs. + if (global_rank == (mgrid.size() - 1)) { + long long time_diff = 0; + long long last_clock = clock64(); + do { + long long cur_clock = clock64(); + if (cur_clock > last_clock) { + time_diff += (cur_clock - last_clock); + } + // If it rolls over, we don't know how much to add to catch up. + // So just ignore those slipped cycles. + last_clock = cur_clock; + } while (time_diff < 1000000); + } + // During even iterations, add into your own array entry + // During odd iterations, add into your partner's array entry + unsigned grid_rank = mgrid.grid_rank(); + unsigned inter_gpu_offset = (grid_rank + i) % mgrid.num_grids(); + if (rank == (grid.size() - 1)) { + if (i % mgrid.num_grids() == 0) { + global_array[grid_rank] += 2; + } else { + global_array[inter_gpu_offset] *= 2; + } + } + mgrid.sync(); + offset += gridDim.x; + } +} + +__global__ void test_kernel_gfx11(unsigned int* atomic_val, unsigned int* global_array, + unsigned int* array, uint32_t loops) { +#if HT_AMD + cg::grid_group grid = cg::this_grid(); + cg::multi_grid_group mgrid = cg::this_multi_grid(); + unsigned rank = grid.thread_rank(); + unsigned global_rank = mgrid.thread_rank(); + + int offset = blockIdx.x; + for (int i = 0; i < loops; i++) { + // Make the last thread run way behind everyone else. + // If the grid barrier below fails, then the other threads may hit the + // atomicInc instruction many times before the last thread ever gets + // to it. 
+ // As such, without the barrier, the last array entry will eventually + // contain a very large value, defined by however many times the other + // wavefronts make it through this loop. + // If the barrier works, then it will likely contain some number + // near "total number of blocks". It will be the last wavefront to + // reach the atomicInc, but everyone will have only hit the atomic once. + if (rank == (grid.size() - 1)) { + long long time_diff = 0; + long long last_clock = wall_clock64(); + do { + long long cur_clock = wall_clock64(); + if (cur_clock > last_clock) { + time_diff += (cur_clock - last_clock); + } + // If it rolls over, we don't know how much to add to catch up. + // So just ignore those slipped cycles. + last_clock = cur_clock; + } while (time_diff < 1000000); + } + if (threadIdx.x == 0) { + array[offset] = atomicInc(atomic_val, UINT_MAX); + } + grid.sync(); + + // Make the last thread in the entire multi-grid run way behind + // everyone else. + // If the mgrid barrier below fails, then the two global_array entries + // will end up being out of sync, because the intermingling of adds + // and multiplies will not be aligned between to the two GPUs. + if (global_rank == (mgrid.size() - 1)) { + long long time_diff = 0; + long long last_clock = wall_clock64(); + do { + long long cur_clock = wall_clock64(); + if (cur_clock > last_clock) { + time_diff += (cur_clock - last_clock); + } + // If it rolls over, we don't know how much to add to catch up. + // So just ignore those slipped cycles. 
+ last_clock = cur_clock; + } while (time_diff < 1000000); + } + // During even iterations, add into your own array entry + // During odd iterations, add into your partner's array entry + unsigned grid_rank = mgrid.grid_rank(); + unsigned inter_gpu_offset = (grid_rank + i) % mgrid.num_grids(); + if (rank == (grid.size() - 1)) { + if (i % mgrid.num_grids() == 0) { + global_array[grid_rank] += 2; + } else { + global_array[inter_gpu_offset] *= 2; + } + } + mgrid.sync(); + offset += gridDim.x; + } +#endif +} + +static void verify_barrier_buffer(unsigned int loops, unsigned int warps, unsigned int* host_buffer, + unsigned int num_devs) { + unsigned int max_in_this_loop = 0; + for (unsigned int i = 0; i < loops; i++) { + max_in_this_loop += (warps * num_devs); + for (unsigned int j = 0; j < warps; j++) { + REQUIRE(host_buffer[i * warps + j] <= max_in_this_loop); + } + } +} + +static void verify_multi_gpu_buffer(unsigned int loops, unsigned int array_val) { + unsigned int desired_val = 0; + for (int i = 0; i < loops; i++) { + if (i % 2 == 0) { + desired_val += 2; + } else { + desired_val *= 2; + } + } + + REQUIRE(array_val == desired_val); +} + +template +static void test_cg_multi_grid_group_type(F kernel_func, int num_devices, int block_size, + bool specific_api_test) { + // Create a stream each device + hipStream_t stream[MaxGPUs]; + for (int i = 0; i < num_devices; i++) { + HIP_CHECK(hipSetDevice(i)); + HIP_CHECK(hipDeviceSynchronize()); // Make sure work is done on this device + HIP_CHECK(hipStreamCreate(&stream[i])); + } + + // Allocate host and device memory + int num_bytes = sizeof(int) * 2 * block_size; + int *num_grids_dev[MaxGPUs], *num_grids_host[MaxGPUs]; + int *grid_rank_dev[MaxGPUs], *grid_rank_host[MaxGPUs]; + int *size_dev[MaxGPUs], *size_host[MaxGPUs]; + int *thd_rank_dev[MaxGPUs], *thd_rank_host[MaxGPUs]; + int *is_valid_dev[MaxGPUs], *is_valid_host[MaxGPUs]; + int *sync_dev[MaxGPUs], *sync_result; + for (int i = 0; i < num_devices; i++) { + 
HIP_CHECK(hipSetDevice(i)); + + if (specific_api_test) { + HIP_CHECK(hipMalloc(&num_grids_dev[i], num_bytes)); + HIP_CHECK(hipHostMalloc(&num_grids_host[i], num_bytes)); + } + + HIP_CHECK(hipMalloc(&grid_rank_dev[i], num_bytes)); + HIP_CHECK(hipMalloc(&size_dev[i], num_bytes)); + HIP_CHECK(hipMalloc(&thd_rank_dev[i], num_bytes)); + HIP_CHECK(hipMalloc(&is_valid_dev[i], num_bytes)); + HIP_CHECK(hipMalloc(&sync_dev[i], num_bytes)); + + HIP_CHECK(hipHostMalloc(&grid_rank_host[i], num_bytes)); + HIP_CHECK(hipHostMalloc(&size_host[i], num_bytes)); + HIP_CHECK(hipHostMalloc(&thd_rank_host[i], num_bytes)); + HIP_CHECK(hipHostMalloc(&is_valid_host[i], num_bytes)); + + if (i == 0) { + HIP_CHECK( + hipHostMalloc(&sync_result, sizeof(int) * (num_devices + 1), hipHostMallocCoherent)); + } + } + + // Launch Kernel + int NumKernelArgs = 6; + if (specific_api_test) { + NumKernelArgs = 7; + } + hipLaunchParams* launchParamsList = new hipLaunchParams[num_devices]; + std::vector args(MaxGPUs * NumKernelArgs); + for (int i = 0; i < num_devices; i++) { + HIP_CHECK(hipSetDevice(i)); + + args[i * NumKernelArgs] = &grid_rank_dev[i]; + args[i * NumKernelArgs + 1] = &size_dev[i]; + args[i * NumKernelArgs + 2] = &thd_rank_dev[i]; + args[i * NumKernelArgs + 3] = &is_valid_dev[i]; + args[i * NumKernelArgs + 4] = &sync_dev[i]; + args[i * NumKernelArgs + 5] = &sync_result; + if (specific_api_test) { + args[i * NumKernelArgs + 6] = &num_grids_dev[i]; + } + + launchParamsList[i].func = reinterpret_cast(kernel_func); + launchParamsList[i].gridDim = 2; + launchParamsList[i].blockDim = block_size; + launchParamsList[i].sharedMem = 0; + launchParamsList[i].stream = stream[i]; + launchParamsList[i].args = &args[i * NumKernelArgs]; + } + HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(launchParamsList, num_devices, 0)); + + // Copy result from device to host + for (int i = 0; i < num_devices; i++) { + HIP_CHECK(hipSetDevice(i)); + if (specific_api_test) { + HIP_CHECK(hipMemcpy(num_grids_host[i], 
num_grids_dev[i], num_bytes, hipMemcpyDeviceToHost)); + } + HIP_CHECK(hipMemcpy(grid_rank_host[i], grid_rank_dev[i], num_bytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(size_host[i], size_dev[i], num_bytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(thd_rank_host[i], thd_rank_dev[i], num_bytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(is_valid_host[i], is_valid_dev[i], num_bytes, hipMemcpyDeviceToHost)); + } + + // Validate results + int grids_seen[MaxGPUs]; + for (int i = 0; i < num_devices; ++i) { + for (int j = 0; j < 2 * block_size; ++j) { + if (specific_api_test) { + ASSERT_EQUAL(num_grids_host[i][j], num_devices); + } + ASSERT_GE(grid_rank_host[i][j], 0); + ASSERT_LE(grid_rank_host[i][j], num_devices - 1); + ASSERT_EQUAL(grid_rank_host[i][j], grid_rank_host[i][0]); + ASSERT_EQUAL(size_host[i][j], num_devices * 2 * block_size); + int gridRank = grid_rank_host[i][j]; + ASSERT_EQUAL(thd_rank_host[i][j], (gridRank * 2 * block_size) + j); + ASSERT_EQUAL(is_valid_host[i][j], 1); + } + ASSERT_EQUAL(sync_result[i + 1], 2 * block_size); + + // Validate uniqueness property of grid rank + grids_seen[i] = grid_rank_host[i][0]; + for (int k = 0; k < i; ++k) { + INFO("Grid rank in multi-gpu setup should be unique"); + REQUIRE(grids_seen[k] != grids_seen[i]); + } + } + ASSERT_EQUAL(sync_result[0], num_devices * 2 * block_size); + + // Free host and device memory + delete[] launchParamsList; + for (int i = 0; i < num_devices; i++) { + HIP_CHECK(hipSetDevice(i)); + + if (specific_api_test) { + HIP_CHECK(hipFree(num_grids_dev[i])); + HIP_CHECK(hipHostFree(num_grids_host[i])); + } + + HIP_CHECK(hipFree(grid_rank_dev[i])); + HIP_CHECK(hipFree(size_dev[i])); + HIP_CHECK(hipFree(thd_rank_dev[i])); + HIP_CHECK(hipFree(is_valid_dev[i])); + HIP_CHECK(hipFree(sync_dev[i])); + + if (i == 0) { + HIP_CHECK(hipHostFree(sync_result)); + } + HIP_CHECK(hipHostFree(grid_rank_host[i])); + HIP_CHECK(hipHostFree(size_host[i])); + HIP_CHECK(hipHostFree(thd_rank_host[i])); + 
HIP_CHECK(hipHostFree(is_valid_host[i])); + } +} + +TEST_CASE("Unit_hipCGMultiGridGroupType_Basic") { + int num_devices = 0; + HIP_CHECK(hipGetDeviceCount(&num_devices)); + num_devices = min(num_devices, MaxGPUs); + + // Set `max_threads_per_blk` by taking minimum among all available devices + int max_threads_per_blk = INT_MAX; + hipDeviceProp_t device_properties; + for (int i = 0; i < num_devices; i++) { + HIP_CHECK(hipGetDeviceProperties(&device_properties, i)); + if (!device_properties.cooperativeMultiDeviceLaunch) { + HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!"); + return; + } + max_threads_per_blk = min(max_threads_per_blk, device_properties.maxThreadsPerBlock); + } + + void* (*kernel_func)(void); + bool specific_api_test = false; + + SECTION("Default multi grid group API test") { + kernel_func = reinterpret_cast(kernel_cg_multi_grid_group_type); + specific_api_test = true; + } + + SECTION("Base type multi grid group API test") { + kernel_func = reinterpret_cast(kernel_cg_multi_grid_group_type_via_base_type); + } + + SECTION("Public API multi grid group test") { + kernel_func = reinterpret_cast(kernel_cg_multi_grid_group_type_via_public_api); + } + + // Test for blockSizes in powers of 2 + for (int block_size = 2; block_size <= max_threads_per_blk; block_size = block_size * 2) { + test_cg_multi_grid_group_type(kernel_func, num_devices, block_size, specific_api_test); + } + + // Test for random blockSizes, but the sequence is the same every execution + srand(0); + for (int i = 0; i < 10; i++) { + // Test fails for 0 thread per block + test_cg_multi_grid_group_type(kernel_func, num_devices, max(2, rand() % max_threads_per_blk), + specific_api_test); + } +} + +TEST_CASE("Unit_hipCGMultiGridGroupType_Barrier") { + int num_devices = 0; + uint32_t loops = GENERATE(1, 2, 3, 4); + uint32_t warps = GENERATE(4, 8, 16, 32); + uint32_t block_size = 1; + + HIP_CHECK(hipGetDeviceCount(&num_devices)); + if (num_devices < 2) { + 
HipTest::HIP_SKIP_TEST("Device number is < 2"); + return; + } + + std::vector device_properties(num_devices); + for (int i = 0; i < num_devices; i++) { + HIP_CHECK(hipGetDeviceProperties(&device_properties[i], i)); + if (!device_properties[i].cooperativeMultiDeviceLaunch) { + HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!"); + return; + } + } + + // Test whether the requested size will fit on the GPU + std::vector warp_sizes(num_devices); + std::vector num_sms(num_devices); + int warp_size = INT_MAX; + int num_sm = INT_MAX; + for (int i = 0; i < num_devices; i++) { + warp_sizes[i] = device_properties[i].warpSize; + if (warp_sizes[i] < warp_size) { + warp_size = warp_sizes[i]; + } + num_sms[i] = device_properties[i].multiProcessorCount; + if (num_sms[i] < num_sm) { + num_sm = num_sms[i]; + } + } + + int num_threads_in_block = block_size * warp_size; + + // Calculate the device occupancy to know how many blocks can be run. + std::vector max_blocks_per_sm_arr(num_devices); + int max_blocks_per_sm = INT_MAX; + for (int i = 0; i < num_devices; i++) { + HIP_CHECK(hipSetDevice(i)); + auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; + HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_blocks_per_sm_arr[i], test_kernel_used, num_threads_in_block, 0)); + if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) { + max_blocks_per_sm = max_blocks_per_sm_arr[i]; + } + } + + int requested_blocks = warps / block_size; + + // Each block will output a single value per loop. 
+ uint32_t total_buffer_len = requested_blocks * loops; + + // Alocate the buffer that will hold the kernel's output, and which will + // also be used to globally synchronize during GWS initialization + std::vector host_buffer(num_devices); + std::vector kernel_buffer(num_devices); + std::vector kernel_atomic(num_devices); + std::vector streams(num_devices); + for (int i = 0; i < num_devices; i++) { + host_buffer[i] = + reinterpret_cast(calloc(total_buffer_len, sizeof(unsigned int))); + HIP_CHECK(hipSetDevice(i)); + HIP_CHECK(hipMalloc(&kernel_buffer[i], sizeof(unsigned int) * total_buffer_len)); + HIP_CHECK(hipMemcpy(kernel_buffer[i], host_buffer[i], sizeof(unsigned int) * total_buffer_len, + hipMemcpyHostToDevice)); + HIP_CHECK(hipMalloc(&kernel_atomic[i], sizeof(unsigned int))); + HIP_CHECK(hipMemset(kernel_atomic[i], 0, sizeof(unsigned int))); + HIP_CHECK(hipStreamCreate(&streams[i])); + } + + // Single kernel atomic shared between both devices; put it on the host + unsigned int* global_array; + HIP_CHECK(hipHostMalloc(&global_array, sizeof(unsigned int) * num_devices)); + HIP_CHECK(hipMemset(global_array, 0, num_devices * sizeof(unsigned int))); + + // Launch the kernels + INFO("Launching a cooperative kernel with " << warps << " warps in " << requested_blocks + << " thread blocks"); + + std::vector> dev_params(num_devices, std::vector(4, nullptr)); + std::vector md_params(num_devices); + for (int i = 0; i < num_devices; i++) { + HIP_CHECK(hipSetDevice(i)); + auto test_kernel_used = IsGfx11() ? 
test_kernel_gfx11 : test_kernel; + dev_params[i][0] = reinterpret_cast(&kernel_atomic[i]); + dev_params[i][1] = reinterpret_cast(&global_array); + dev_params[i][2] = reinterpret_cast(&kernel_buffer[i]); + dev_params[i][3] = reinterpret_cast(&loops); + md_params[i].func = reinterpret_cast(test_kernel_used); + md_params[i].gridDim = requested_blocks; + md_params[i].blockDim = num_threads_in_block; + md_params[i].sharedMem = 0; + md_params[i].stream = streams[i]; + md_params[i].args = dev_params[i].data(); + } + + HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params.data(), num_devices, 0)); + HIP_CHECK(hipDeviceSynchronize()); + + // Read back the buffer to host + for (int dev = 0; dev < num_devices; dev++) { + HIP_CHECK(hipMemcpy(host_buffer[dev], kernel_buffer[dev], + sizeof(unsigned int) * total_buffer_len, hipMemcpyDeviceToHost)); + } + + for (unsigned int dev = 0; dev < num_devices; dev++) { + verify_barrier_buffer(loops, requested_blocks, host_buffer[dev], num_devices); + } + + for (int dev = 0; dev < num_devices; dev++) { + verify_multi_gpu_buffer(loops, global_array[dev]); + } + + HIP_CHECK(hipHostFree(global_array)); + for (int k = 0; k < num_devices; ++k) { + HIP_CHECK(hipFree(kernel_buffer[k])); + HIP_CHECK(hipFree(kernel_atomic[k])); + HIP_CHECK(hipStreamDestroy(streams[k])); + free(host_buffer[k]); + } +} diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/hipCGThreadBlockTileTypeShfl_old.cc b/projects/hip-tests/catch/unit/cooperativeGrps/hipCGThreadBlockTileTypeShfl_old.cc new file mode 100644 index 0000000000..14dd68116f --- /dev/null +++ b/projects/hip-tests/catch/unit/cooperativeGrps/hipCGThreadBlockTileTypeShfl_old.cc @@ -0,0 +1,198 @@ +/* +Copyright (c) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#include +#include + +#include "hip_cg_common.hh" + +namespace cg = cooperative_groups; + +/* Selects which tile shuffle primitive a test run exercises. */ +enum class TiledGroupShflTests { shflDown, shflXor, shflUp }; + +/* Accumulates val across the tile g with shfl_down, halving the offset each step. Rank 0 returns its accumulated value; every other rank returns -1. */ +template +__device__ int reduction_kernel_shfl_down(cg::thread_block_tile const& g, + volatile int val) { + int sz = g.size(); + + for (int i = sz / 2; i > 0; i >>= 1) { + val += g.shfl_down(val, i); + } + + // Choose the 0'th indexed thread that holds the reduction value to return + if (g.thread_rank() == 0) { + return val; + } + // Rest of the threads return no useful values + else { + return -1; + } +} + +/* Same reduction as above but exchanging values with shfl_xor. Rank 0 returns its accumulated value; every other rank returns -1. */ +template +__device__ int reduction_kernel_shfl_xor(cg::thread_block_tile const& g, int val) { + int sz = g.size(); + + for (int i = sz / 2; i > 0; i >>= 1) { + val += g.shfl_xor(val, i); + } + + // Choose the 0'th indexed thread that holds the reduction value to return + if (g.thread_rank() == 0) { + return val; + } + // Rest of the threads return no useful values + else { + return -1; + } +} + +/* Accumulates val with shfl_up at doubling offsets; a thread adds the shuffled value only when its rank is at least the offset. Every thread returns its own accumulated val. */ +template +__device__ int prefix_sum_kernel(cg::thread_block_tile const& g, volatile int val) { + int sz = g.size(); +#pragma unroll + for (int i = 1; i < sz; i <<= 1) { + int temp = g.shfl_up(val, i); + + if (g.thread_rank() >= i) { + val += temp; + } + } + return val; +} + +/* Partitions the thread block into tiles of tile_size, feeds each thread's rank within its tile to the shuffle variant selected by shfl_test and stores the outcome in result: one entry per thread for shflUp, one entry per tile (written by tile rank 0) for the other variants. */ +template +static __global__ void kernel_cg_group_partition_static(int* result, + TiledGroupShflTests shfl_test) { + cg::thread_block thread_block_CG_ty = cg::this_thread_block(); + int input, output_sum; + + // Choose a leader thread to print the results + if (thread_block_CG_ty.thread_rank() == 0) { + printf(" Creating %d groups, of tile size %d threads:\n\n", + (int)thread_block_CG_ty.size() / tile_size, tile_size); + } + + thread_block_CG_ty.sync(); + + cg::thread_block_tile tiled_part = cg::tiled_partition(thread_block_CG_ty); + + input = tiled_part.thread_rank(); + + switch (shfl_test) { + case (TiledGroupShflTests::shflDown): + output_sum = reduction_kernel_shfl_down(tiled_part, input); + break; + case (TiledGroupShflTests::shflXor): + 
output_sum = reduction_kernel_shfl_xor(tiled_part, input); + break; + case (TiledGroupShflTests::shflUp): + output_sum = prefix_sum_kernel(tiled_part, input); + result[thread_block_CG_ty.thread_rank()] = output_sum; + } + + if (tiled_part.thread_rank() == 0 && shfl_test != TiledGroupShflTests::shflUp) { + printf(" Sum of all ranks 0..%d in this tiled_part group is %d\n", tiled_part.size() - 1, + output_sum); + result[thread_block_CG_ty.thread_rank() / (tile_size)] = output_sum; + } +} + +/* Fills expected_result[0..size) with the host-side reference: for shflDown and shflXor every slot holds (tile_size - 1) * tile_size / 2; for shflUp position j of each tile holds the running sum 0 + 1 + ... + j. */ +static void expected_result_calc(int* expected_result, int tile_size, int size, + TiledGroupShflTests shfl_test) { + switch (shfl_test) { + case (TiledGroupShflTests::shflDown): + case (TiledGroupShflTests::shflXor): { + int expected_sum = ((tile_size - 1) * tile_size / 2); + for (int i = 0; i < size; i++) { + expected_result[i] = expected_sum; + } + break; + } + case (TiledGroupShflTests::shflUp): { + for (int i = 0; i < size / tile_size; i++) { + int acc = 0; + for (int j = 0; j < tile_size; j++) { + acc += j; + expected_result[i * tile_size + j] = acc; + } + } + break; + } + } +} + +/* Runs kernel_cg_group_partition_static with one block of 64 threads for the given shuffle variant and compares the device output with expected_result_calc. The result holds one entry per tile, or one entry per thread for shflUp. */ +template static void test_group_partition(TiledGroupShflTests shfl_test) { + int block_size = 1; + int threads_per_blk = 64; + + int num_elem = (block_size * threads_per_blk) / tile_size; + if (shfl_test == TiledGroupShflTests::shflUp) { + num_elem = block_size * threads_per_blk; + } + + int* expected_result = new int[num_elem]; + + int* result_dev = NULL; + int* result_host = NULL; + + HIP_CHECK(hipHostMalloc(&result_host, num_elem * sizeof(int), hipHostMallocDefault)); + memset(result_host, 0, num_elem * sizeof(int)); + + HIP_CHECK(hipMalloc(&result_dev, num_elem * sizeof(int))); + + /* NOTE(review): the launch requests threads_per_blk * sizeof(int) bytes of dynamic shared memory, which the kernel shown in this file does not reference - confirm whether it is needed. */ + // Launch Kernel + hipLaunchKernelGGL(kernel_cg_group_partition_static, block_size, threads_per_blk, + threads_per_blk * sizeof(int), 0, result_dev, shfl_test); + HIP_CHECK(hipDeviceSynchronize()); + + + HIP_CHECK(hipMemcpy(result_host, result_dev, sizeof(int) * num_elem, hipMemcpyDeviceToHost)); + + 
expected_result_calc(expected_result, tile_size, num_elem, shfl_test); + compareResults(expected_result, result_host, num_elem * sizeof(int)); + + // Free all allocated memory on host and device + HIP_CHECK(hipFree(result_dev)); + HIP_CHECK(hipHostFree(result_host)); + delete[] expected_result; +} + +/* Skips when the default device lacks cooperativeLaunch; otherwise runs the generated shuffle variant for tile sizes 2, 4, 8, 16 and 32. */ +TEST_CASE("Unit_hipCGThreadBlockTileType_Shfl") { + // Use default device for validating the test + int device; + hipDeviceProp_t device_properties; + HIP_CHECK(hipGetDevice(&device)); + HIP_CHECK(hipGetDeviceProperties(&device_properties, device)); + + if (!device_properties.cooperativeLaunch) { + HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!"); + return; + } + + TiledGroupShflTests shfl_test = GENERATE( + TiledGroupShflTests::shflDown, TiledGroupShflTests::shflXor, TiledGroupShflTests::shflUp); + test_group_partition<2>(shfl_test); + test_group_partition<4>(shfl_test); + test_group_partition<8>(shfl_test); + test_group_partition<16>(shfl_test); + test_group_partition<32>(shfl_test); +} diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/hipCGThreadBlockType.cc b/projects/hip-tests/catch/unit/cooperativeGrps/hipCGThreadBlockType.cc deleted file mode 100644 index 98f611aa8a..0000000000 --- a/projects/hip-tests/catch/unit/cooperativeGrps/hipCGThreadBlockType.cc +++ /dev/null @@ -1,177 +0,0 @@ -/* -Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - - -/* HIT_START - * BUILD: %t %s ../../test_common.cpp - * TEST: %t - * HIT_END - */ - -#include -#include -#include - -#define ASSERT_EQUAL(lhs, rhs) HIPASSERT(lhs == rhs) - -using namespace cooperative_groups; - -static __global__ -void kernel_cg_thread_block_type(int *sizeTestD, - int *thdRankTestD, - int *syncTestD, - dim3 *groupIndexTestD, - dim3 *thdIndexTestD, - dim3 *groupDimTestD) -{ - thread_block tb = this_thread_block(); - int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x; - // Test size - sizeTestD[gIdx] = tb.size(); - - // Test thread_rank - thdRankTestD[gIdx] = tb.thread_rank(); - - // Test sync - __shared__ int sm[2]; - if (threadIdx.x == 0) - sm[0] = 10; - else if (threadIdx.x == 1) - sm[1] = 20; - tb.sync(); - syncTestD[gIdx] = sm[1] * sm[0]; - - // Test group_index - groupIndexTestD[gIdx] = tb.group_index(); - - // Test thread_index - thdIndexTestD[gIdx] = tb.thread_index(); - - // Test group_dim aka number of threads in a block - groupDimTestD[gIdx] = tb.group_dim(); -} - -static void test_cg_thread_block_type(int blockSize) -{ - int nBytes = sizeof(int) * 2 * blockSize; - int nDim3Bytes = sizeof(dim3) * 2 * blockSize; - int *sizeTestD, *sizeTestH; - int *thdRankTestD, *thdRankTestH; - int *syncTestD, *syncTestH; - dim3 *groupIndexTestD, *groupIndexTestH; - dim3 *thdIndexTestD, *thdIndexTestH, *groupDimTestD, *groupDimTestH; - - // Allocate device memory - HIPCHECK(hipMalloc(&sizeTestD, nBytes)); - HIPCHECK(hipMalloc(&thdRankTestD, nBytes)); - HIPCHECK(hipMalloc(&syncTestD, 
nBytes)); - HIPCHECK(hipMalloc(&groupIndexTestD, nDim3Bytes)); - HIPCHECK(hipMalloc(&thdIndexTestD, nDim3Bytes)); - HIPCHECK(hipMalloc(&groupDimTestD, nDim3Bytes)); - - // Allocate host memory - HIPCHECK(hipHostMalloc(&sizeTestH, nBytes)); - HIPCHECK(hipHostMalloc(&thdRankTestH, nBytes)); - HIPCHECK(hipHostMalloc(&syncTestH, nBytes)); - HIPCHECK(hipHostMalloc(&groupIndexTestH, nDim3Bytes)); - HIPCHECK(hipHostMalloc(&thdIndexTestH, nDim3Bytes)); - HIPCHECK(hipHostMalloc(&groupDimTestH, nDim3Bytes)); - - // Launch Kernel - hipLaunchKernelGGL(kernel_cg_thread_block_type, - 2, - blockSize, - 0, - 0, - sizeTestD, - thdRankTestD, - syncTestD, - groupIndexTestD, - thdIndexTestD, - groupDimTestD); - - // Copy result from device to host - HIPCHECK(hipMemcpy(sizeTestH, sizeTestD, nBytes, hipMemcpyDeviceToHost)); - HIPCHECK(hipMemcpy(thdRankTestH, thdRankTestD, nBytes, hipMemcpyDeviceToHost)); - HIPCHECK(hipMemcpy(syncTestH, syncTestD, nBytes, hipMemcpyDeviceToHost)); - HIPCHECK(hipMemcpy(groupIndexTestH, groupIndexTestD, nDim3Bytes, hipMemcpyDeviceToHost)); - HIPCHECK(hipMemcpy(thdIndexTestH, thdIndexTestD, nDim3Bytes, hipMemcpyDeviceToHost)); - HIPCHECK(hipMemcpy(groupDimTestH, groupDimTestD, nDim3Bytes, hipMemcpyDeviceToHost)); - - // Validate results for both blocks together - for (int i = 0; i < 2 * blockSize; ++i) { - ASSERT_EQUAL(sizeTestH[i], blockSize); - ASSERT_EQUAL(thdRankTestH[i], i % blockSize); - ASSERT_EQUAL(syncTestH[i], 200); - ASSERT_EQUAL(groupIndexTestH[i].x, (uint) i / blockSize); - ASSERT_EQUAL(groupIndexTestH[i].y, 0); - ASSERT_EQUAL(groupIndexTestH[i].z, 0); - ASSERT_EQUAL(thdIndexTestH[i].x, (uint) i % blockSize); - ASSERT_EQUAL(thdIndexTestH[i].y, 0); - ASSERT_EQUAL(thdIndexTestH[i].z, 0); - ASSERT_EQUAL(groupDimTestH[i].x, blockSize); - ASSERT_EQUAL(groupDimTestH[i].y, 1); - ASSERT_EQUAL(groupDimTestH[i].z, 1); - } - - // Free device memory - HIPCHECK(hipFree(sizeTestD)); - HIPCHECK(hipFree(thdRankTestD)); - HIPCHECK(hipFree(syncTestD)); - 
HIPCHECK(hipFree(groupIndexTestD)); - HIPCHECK(hipFree(thdIndexTestD)); - HIPCHECK(hipFree(groupDimTestD)); - - //Free host memory - HIPCHECK(hipHostFree(sizeTestH)); - HIPCHECK(hipHostFree(thdRankTestH)); - HIPCHECK(hipHostFree(syncTestH)); - HIPCHECK(hipHostFree(groupIndexTestH)); - HIPCHECK(hipHostFree(thdIndexTestH)); - HIPCHECK(hipHostFree(groupDimTestH)); -} - -TEST_CASE("Unit_hipCGThreadBlockType") { - // Use default device for validating the test - int deviceId; - hipDeviceProp_t deviceProperties; - HIPCHECK(hipGetDevice(&deviceId)); - HIPCHECK(hipGetDeviceProperties(&deviceProperties, deviceId)); - - if (!deviceProperties.cooperativeLaunch) { - HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!"); - return; - } - - // Test for blockSizes in powers of 2 - int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock; - for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) { - test_cg_thread_block_type(blockSize); - } - - // Test for random blockSizes, but the sequence is the same every execution - srand(0); - for (int i = 0; i < 10; i++) { - // Test fails for only 1 thread per block - test_cg_thread_block_type(max(2, rand() % maxThreadsPerBlock)); - } -} diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/hipCGThreadBlockTypeViaBaseType.cc b/projects/hip-tests/catch/unit/cooperativeGrps/hipCGThreadBlockTypeViaBaseType.cc deleted file mode 100644 index 69f5e91ad2..0000000000 --- a/projects/hip-tests/catch/unit/cooperativeGrps/hipCGThreadBlockTypeViaBaseType.cc +++ /dev/null @@ -1,136 +0,0 @@ -/* -Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved. 
- -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
-*/ - - -/* HIT_START - * BUILD: %t %s ../../test_common.cpp - * TEST: %t - * HIT_END - */ - -#include -#include "hip/hip_cooperative_groups.h" -#include - -#define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs) - -using namespace cooperative_groups; - -static __global__ -void kernel_cg_thread_block_type_via_base_type(int *sizeTestD, - int *thdRankTestD, - int *syncTestD) -{ - thread_group tg = this_thread_block(); - int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x; - - // Test size - sizeTestD[gIdx] = tg.size(); - - // Test thread_rank - thdRankTestD[gIdx] = tg.thread_rank(); - - // Test sync - __shared__ int sm[2]; - if (threadIdx.x == 0) - sm[0] = 10; - else if (threadIdx.x == 1) - sm[1] = 20; - tg.sync(); - syncTestD[gIdx] = sm[1] * sm[0]; -} - -static void test_cg_thread_block_type_via_base_type(int blockSize) -{ - int nBytes = sizeof(int) * 2 * blockSize; - int *sizeTestD, *sizeTestH; - int *thdRankTestD, *thdRankTestH; - int *syncTestD, *syncTestH; - - // Allocate device memory - HIPCHECK(hipMalloc(&sizeTestD, nBytes)); - HIPCHECK(hipMalloc(&thdRankTestD, nBytes)); - HIPCHECK(hipMalloc(&syncTestD, nBytes)); - - // Allocate host memory - HIPCHECK(hipHostMalloc(&sizeTestH, nBytes)); - HIPCHECK(hipHostMalloc(&thdRankTestH, nBytes)); - HIPCHECK(hipHostMalloc(&syncTestH, nBytes)); - - // Launch Kernel - hipLaunchKernelGGL(kernel_cg_thread_block_type_via_base_type, - 2, - blockSize, - 0, - 0, - sizeTestD, - thdRankTestD, - syncTestD); - - // Copy result from device to host - HIPCHECK(hipMemcpy(sizeTestH, sizeTestD, nBytes, hipMemcpyDeviceToHost)); - HIPCHECK(hipMemcpy(thdRankTestH, thdRankTestD, nBytes, hipMemcpyDeviceToHost)); - HIPCHECK(hipMemcpy(syncTestH, syncTestD, nBytes, hipMemcpyDeviceToHost)); - - // Validate results for both blocks together - for (int i = 0; i < 2 * blockSize; ++i) { - ASSERT_EQUAL(sizeTestH[i], blockSize); - ASSERT_EQUAL(thdRankTestH[i], i % blockSize); - ASSERT_EQUAL(syncTestH[i], 200); - } - - // Free device memory - 
HIPCHECK(hipFree(sizeTestD)); - HIPCHECK(hipFree(thdRankTestD)); - HIPCHECK(hipFree(syncTestD)); - - //Free host memory - HIPCHECK(hipHostFree(sizeTestH)); - HIPCHECK(hipHostFree(thdRankTestH)); - HIPCHECK(hipHostFree(syncTestH)); -} - -TEST_CASE("Unit_hipCGThreadBlockType_BaseType") { - // Use default device for validating the test - int deviceId; - hipDeviceProp_t deviceProperties; - HIPCHECK(hipGetDevice(&deviceId)); - HIPCHECK(hipGetDeviceProperties(&deviceProperties, deviceId)); - - if (!deviceProperties.cooperativeLaunch) { - HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!"); - return; - } - - // Test for blockSizes in powers of 2 - int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock; - for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) { - test_cg_thread_block_type_via_base_type(blockSize); - } - - // Test for random blockSizes, but the sequence is the same every execution - srand(0); - for (int i = 0; i < 10; i++) { - // Test fails for only 1 thread per block - test_cg_thread_block_type_via_base_type(max(2, rand() % maxThreadsPerBlock)); - } -} diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/hipCGThreadBlockTypeViaPublicApi.cc b/projects/hip-tests/catch/unit/cooperativeGrps/hipCGThreadBlockTypeViaPublicApi.cc deleted file mode 100644 index f4913ad2c7..0000000000 --- a/projects/hip-tests/catch/unit/cooperativeGrps/hipCGThreadBlockTypeViaPublicApi.cc +++ /dev/null @@ -1,136 +0,0 @@ -/* -Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved. 
- -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
-*/ - - -/* HIT_START - * BUILD: %t %s ../../test_common.cpp - * TEST: %t - * HIT_END - */ - -#include -#include "hip/hip_cooperative_groups.h" -#include - -#define ASSERT_EQUAL(lhs, rhs) assert(lhs == rhs) - -using namespace cooperative_groups; - -static __global__ -void kernel_cg_thread_block_type_via_public_api(int *sizeTestD, - int *thdRankTestD, - int *syncTestD) -{ - thread_block tb = this_thread_block(); - int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x; - - // Test group_size api - sizeTestD[gIdx] = group_size(tb); - - // Test thread_rank api - thdRankTestD[gIdx] = thread_rank(tb); - - // Test sync api - __shared__ int sm[2]; - if (threadIdx.x == 0) - sm[0] = 10; - else if (threadIdx.x == 1) - sm[1] = 20; - sync(tb); - syncTestD[gIdx] = sm[1] * sm[0]; -} - -static void test_cg_thread_block_type_via_public_api(int blockSize) -{ - int nBytes = sizeof(int) * 2 * blockSize; - int *sizeTestD, *sizeTestH; - int *thdRankTestD, *thdRankTestH; - int *syncTestD, *syncTestH; - - // Allocate device memory - HIPCHECK(hipMalloc(&sizeTestD, nBytes)); - HIPCHECK(hipMalloc(&thdRankTestD, nBytes)); - HIPCHECK(hipMalloc(&syncTestD, nBytes)); - - // Allocate host memory - HIPCHECK(hipHostMalloc(&sizeTestH, nBytes)); - HIPCHECK(hipHostMalloc(&thdRankTestH, nBytes)); - HIPCHECK(hipHostMalloc(&syncTestH, nBytes)); - - // Launch Kernel - hipLaunchKernelGGL(kernel_cg_thread_block_type_via_public_api, - 2, - blockSize, - 0, - 0, - sizeTestD, - thdRankTestD, - syncTestD); - - // Copy result from device to host - HIPCHECK(hipMemcpy(sizeTestH, sizeTestD, nBytes, hipMemcpyDeviceToHost)); - HIPCHECK(hipMemcpy(thdRankTestH, thdRankTestD, nBytes, hipMemcpyDeviceToHost)); - HIPCHECK(hipMemcpy(syncTestH, syncTestD, nBytes, hipMemcpyDeviceToHost)); - - // Validate results for both blocks together - for (int i = 0; i < 2 * blockSize; ++i) { - ASSERT_EQUAL(sizeTestH[i], blockSize); - ASSERT_EQUAL(thdRankTestH[i], i % blockSize); - ASSERT_EQUAL(syncTestH[i], 200); - } - - // Free device 
memory - HIPCHECK(hipFree(sizeTestD)); - HIPCHECK(hipFree(thdRankTestD)); - HIPCHECK(hipFree(syncTestD)); - - //Free host memory - HIPCHECK(hipHostFree(sizeTestH)); - HIPCHECK(hipHostFree(thdRankTestH)); - HIPCHECK(hipHostFree(syncTestH)); -} - -TEST_CASE("Unit_hipCGThreadBlockType_PublicApi") { - // Use default device for validating the test - int deviceId; - hipDeviceProp_t deviceProperties; - HIPCHECK(hipGetDevice(&deviceId)); - HIPCHECK(hipGetDeviceProperties(&deviceProperties, deviceId)); - - if (!deviceProperties.cooperativeLaunch) { - HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!"); - return; - } - - // Test for blockSizes in powers of 2 - int maxThreadsPerBlock = deviceProperties.maxThreadsPerBlock; - for (int blockSize = 2; blockSize <= maxThreadsPerBlock; blockSize = blockSize*2) { - test_cg_thread_block_type_via_public_api(blockSize); - } - - // Test for random blockSizes, but the sequence is the same every execution - srand(0); - for (int i = 0; i < 10; i++) { - // Test fails for only 1 thread per block - test_cg_thread_block_type_via_public_api(max(2, rand() % maxThreadsPerBlock)); - } -} diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/hipCGThreadBlockType_old.cc b/projects/hip-tests/catch/unit/cooperativeGrps/hipCGThreadBlockType_old.cc new file mode 100644 index 0000000000..87ec21d748 --- /dev/null +++ b/projects/hip-tests/catch/unit/cooperativeGrps/hipCGThreadBlockType_old.cc @@ -0,0 +1,225 @@ +/* +Copyright (c) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#include +#include + +#include "hip_cg_common.hh" + +namespace cg = cooperative_groups; + +/* Selects which of the three kernels below a test run launches. */ +enum class ThreadBlockTypeTests { basicApi, baseType, publicApi }; + +/* Records, per global thread index, what cg::thread_block member functions report: size, thread_rank, the product of two shared values read after sync() (10 * 20 when threads 0 and 1 both exist), group_index, thread_index and group_dim. Every output array needs one entry per launched thread. */ +static __global__ void kernel_cg_thread_block_type(int* size_dev, int* thd_rank_dev, int* sync_dev, + dim3* group_index_dev, dim3* thd_index_dev, + dim3* group_dim_dev) { + cg::thread_block tb = cg::this_thread_block(); + int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x; + // Test size + size_dev[gIdx] = tb.size(); + + // Test thread_rank + thd_rank_dev[gIdx] = tb.thread_rank(); + + // Test sync + __shared__ int sm[2]; + if (threadIdx.x == 0) + sm[0] = 10; + else if (threadIdx.x == 1) + sm[1] = 20; + tb.sync(); + sync_dev[gIdx] = sm[1] * sm[0]; + + // Test group_index + group_index_dev[gIdx] = tb.group_index(); + + // Test thread_index + thd_index_dev[gIdx] = tb.thread_index(); + + // Test group_dim aka number of threads in a block + group_dim_dev[gIdx] = tb.group_dim(); +} + +/* Same size, thread_rank and sync checks as above, but made through a cg::thread_group initialised from this_thread_block(). */ +static __global__ void kernel_cg_thread_block_type_via_base_type(int* size_dev, int* thd_rank_dev, + int* sync_dev) { + cg::thread_group tg = cg::this_thread_block(); + int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x; + + // Test size + size_dev[gIdx] = tg.size(); + + // Test thread_rank + thd_rank_dev[gIdx] = tg.thread_rank(); + + // Test sync + __shared__ int sm[2]; + if (threadIdx.x == 0) + sm[0] = 10; + else if (threadIdx.x == 1) + sm[1] = 20; + tg.sync(); + sync_dev[gIdx] = sm[1] * sm[0]; +} + +/* Same checks again, made through the free functions cg::group_size, cg::thread_rank and cg::sync. */ +static __global__ void kernel_cg_thread_block_type_via_public_api(int* size_dev, int* thd_rank_dev, + int* sync_dev) { + cg::thread_block tb = cg::this_thread_block(); + int gIdx = (blockIdx.x * blockDim.x) + threadIdx.x; + + // Test group_size api + size_dev[gIdx] = cg::group_size(tb); + + // Test thread_rank api + thd_rank_dev[gIdx] = cg::thread_rank(tb); + + // Test sync api + __shared__ int sm[2]; + if (threadIdx.x == 0) + sm[0] = 10; + else if (threadIdx.x == 1) + sm[1] = 20; + cg::sync(tb); + sync_dev[gIdx] = sm[1] * sm[0]; 
+} + +static void test_cg_thread_block_type(ThreadBlockTypeTests test_type, int block_size) { + int num_bytes = sizeof(int) * 2 * block_size; + int num_dim3_bytes = sizeof(dim3) * 2 * block_size; + int *size_dev, *size_host; + int *thd_rank_dev, *thd_rank_host; + int *sync_dev, *sync_host; + dim3 *group_index_dev, *group_index_host; + dim3 *thd_index_dev, *thd_index_host; + dim3 *group_dim_dev, *group_dim_host; + + // Allocate device memory + HIP_CHECK(hipMalloc(&size_dev, num_bytes)); + HIP_CHECK(hipMalloc(&thd_rank_dev, num_bytes)); + HIP_CHECK(hipMalloc(&sync_dev, num_bytes)); + + // Allocate host memory + HIP_CHECK(hipHostMalloc(&size_host, num_bytes)); + HIP_CHECK(hipHostMalloc(&thd_rank_host, num_bytes)); + HIP_CHECK(hipHostMalloc(&sync_host, num_bytes)); + + switch (test_type) { + case (ThreadBlockTypeTests::basicApi): + HIP_CHECK(hipMalloc(&group_index_dev, num_dim3_bytes)); + HIP_CHECK(hipMalloc(&thd_index_dev, num_dim3_bytes)); + HIP_CHECK(hipMalloc(&group_dim_dev, num_dim3_bytes)); + HIP_CHECK(hipHostMalloc(&group_index_host, num_dim3_bytes)); + HIP_CHECK(hipHostMalloc(&thd_index_host, num_dim3_bytes)); + HIP_CHECK(hipHostMalloc(&group_dim_host, num_dim3_bytes)); + + hipLaunchKernelGGL(kernel_cg_thread_block_type, 2, block_size, 0, 0, size_dev, thd_rank_dev, + sync_dev, group_index_dev, thd_index_dev, group_dim_dev); + break; + case (ThreadBlockTypeTests::baseType): + hipLaunchKernelGGL(kernel_cg_thread_block_type_via_base_type, 2, block_size, 0, 0, size_dev, + thd_rank_dev, sync_dev); + break; + case (ThreadBlockTypeTests::publicApi): + hipLaunchKernelGGL(kernel_cg_thread_block_type_via_public_api, 2, block_size, 0, 0, size_dev, + thd_rank_dev, sync_dev); + } + + // Copy result from device to host + HIP_CHECK(hipMemcpy(size_host, size_dev, num_bytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(thd_rank_host, thd_rank_dev, num_bytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(sync_host, sync_dev, num_bytes, hipMemcpyDeviceToHost)); + if (test_type 
== ThreadBlockTypeTests::basicApi) { + HIP_CHECK(hipMemcpy(group_index_host, group_index_dev, num_dim3_bytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(thd_index_host, thd_index_dev, num_dim3_bytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(group_dim_host, group_dim_dev, num_dim3_bytes, hipMemcpyDeviceToHost)); + } + + // Validate results for both blocks together + for (int i = 0; i < 2 * block_size; ++i) { + ASSERT_EQUAL(size_host[i], block_size); + ASSERT_EQUAL(thd_rank_host[i], i % block_size); + ASSERT_EQUAL(sync_host[i], 200); + if (test_type == ThreadBlockTypeTests::basicApi) { + ASSERT_EQUAL(group_index_host[i].x, (uint)i / block_size); + ASSERT_EQUAL(group_index_host[i].y, 0); + ASSERT_EQUAL(group_index_host[i].z, 0); + ASSERT_EQUAL(thd_index_host[i].x, (uint)i % block_size); + ASSERT_EQUAL(thd_index_host[i].y, 0); + ASSERT_EQUAL(thd_index_host[i].z, 0); + ASSERT_EQUAL(group_dim_host[i].x, block_size); + ASSERT_EQUAL(group_dim_host[i].y, 1); + ASSERT_EQUAL(group_dim_host[i].z, 1); + } + } + + // Free device memory + HIP_CHECK(hipFree(size_dev)); + HIP_CHECK(hipFree(thd_rank_dev)); + HIP_CHECK(hipFree(sync_dev)); + + // Free host memory + HIP_CHECK(hipHostFree(size_host)); + HIP_CHECK(hipHostFree(thd_rank_host)); + HIP_CHECK(hipHostFree(sync_host)); + + if (test_type == ThreadBlockTypeTests::basicApi) { + HIP_CHECK(hipFree(group_index_dev)); + HIP_CHECK(hipFree(thd_index_dev)); + HIP_CHECK(hipFree(group_dim_dev)); + HIP_CHECK(hipHostFree(group_index_host)); + HIP_CHECK(hipHostFree(thd_index_host)); + HIP_CHECK(hipHostFree(group_dim_host)); + } +} + + +TEST_CASE("Unit_hipCGThreadBlockType") { + // Use default device for validating the test + int device; + hipDeviceProp_t device_properties; + HIP_CHECK(hipGetDevice(&device)); + HIP_CHECK(hipGetDeviceProperties(&device_properties, device)); + + if (!device_properties.cooperativeLaunch) { + HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!"); + return; + } + + ThreadBlockTypeTests 
test_type = ThreadBlockTypeTests::basicApi; + + SECTION("Default thread block API test") { test_type = ThreadBlockTypeTests::basicApi; } + + SECTION("Base type thread block API test") { test_type = ThreadBlockTypeTests::baseType; } + + SECTION("Public API thread block test") { test_type = ThreadBlockTypeTests::publicApi; } + + // Test for blockSizes in powers of 2 + int max_threads_per_blk = device_properties.maxThreadsPerBlock; + for (int block_size = 2; block_size <= max_threads_per_blk; block_size = block_size * 2) { + test_cg_thread_block_type(test_type, block_size); + } + + // Test for random block_size, but the sequence is the same every execution + srand(0); + for (int i = 0; i < 10; i++) { + // Test fails for only 1 thread per block + test_cg_thread_block_type(test_type, max(2, rand() % max_threads_per_blk)); + } +} diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/hipCGTiledPartition.cc b/projects/hip-tests/catch/unit/cooperativeGrps/hipCGTiledPartition.cc deleted file mode 100644 index 783d7c8036..0000000000 --- a/projects/hip-tests/catch/unit/cooperativeGrps/hipCGTiledPartition.cc +++ /dev/null @@ -1,385 +0,0 @@ -/* -Copyright (c) 2020 - 2023 Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -// Test Description: -/* This test implements sum reduction kernel, first with each threads own rank - as input and comparing the sum with expected sum output derieved from n(n-1)/2 - formula. The second part, partitions this parent group into child subgroups - a.k.a tiles using using tiled_partition() collective operation. This can be called - with a static tile size, passed in templated non-type variable-tiled_partition, - or in runtime as tiled_partition(thread_group parent, tileSz). This test covers both these - cases. - This test tests functionality of cg group partitioning, (static and dynamic) and its respective - API's size(), thread_rank(), and sync(). -*/ - -#include -#include -#include -#include - -using namespace cooperative_groups; - -/* Parallel reduce kernel. - * - * Step complexity: O(log n) - * Work complexity: O(n) - * - * Note: This kernel works only with power of 2 input arrays. - */ -__device__ int reduction_kernel(thread_group g, int* x, int val) { - int lane = g.thread_rank(); - - for (int i = g.size() / 2; i > 0; i /= 2) { - // use lds to store the temporary result - x[lane] = val; - // Ensure all the stores are completed. - g.sync(); - - if (lane < i) { - val += x[lane + i]; - } - // It must work on one tiled thread group at a time, - // and it must make sure all memory operations are - // completed before moving to the next stride. - // sync() here just does that. 
- g.sync(); - } - - // Choose the 0'th indexed thread that holds the reduction value to return - if (g.thread_rank() == 0) { - return val; - } - // Rest of the threads return no useful values - else { - return -1; - } -} - -template -__global__ void kernel_cg_group_partition_static(int* result, bool isGlobalMem, int* globalMem) { - thread_block threadBlockCGTy = this_thread_block(); - int threadBlockGroupSize = threadBlockCGTy.size(); - - int* workspace = NULL; - - if (isGlobalMem) { - workspace = globalMem; - } else { - // Declare a shared memory - extern __shared__ int sharedMem[]; - workspace = sharedMem; - } - - int input, outputSum, expectedOutput; - - // we pass its own thread rank as inputs - input = threadBlockCGTy.thread_rank(); - - expectedOutput = (threadBlockGroupSize - 1) * threadBlockGroupSize / 2; - - outputSum = reduction_kernel(threadBlockCGTy, workspace, input); - - // Choose a leader thread to print the results - if (threadBlockCGTy.thread_rank() == 0) { - printf(" Sum of all ranks 0..%d in threadBlockCooperativeGroup is %d (expected %d)\n\n", - (int)threadBlockCGTy.size() - 1, outputSum, expectedOutput); - printf(" Creating %d groups, of tile size %d threads:\n\n", - (int)threadBlockCGTy.size() / tileSz, tileSz); - } - - threadBlockCGTy.sync(); - - thread_block_tile tiledPartition = tiled_partition(threadBlockCGTy); - - // This offset allows each group to have its own unique area in the workspace array - int workspaceOffset = threadBlockCGTy.thread_rank() - tiledPartition.thread_rank(); - - outputSum = reduction_kernel(tiledPartition, workspace + workspaceOffset, input); - - if (tiledPartition.thread_rank() == 0) { - printf( - " Sum of all ranks 0..%d in this tiledPartition group is %d. 
Corresponding parent thread " - "rank via meta_group_rank : %d and the total number of groups created when partitioned : %d\n", - tiledPartition.size() - 1, outputSum, tiledPartition.meta_group_rank(), tiledPartition.meta_group_size()); - result[input / (tileSz)] = outputSum; - } - return; -} - - -__global__ void kernel_cg_group_partition_dynamic(unsigned int tileSz, int* result, - bool isGlobalMem, int* globalMem) { - thread_block threadBlockCGTy = this_thread_block(); - - int* workspace = NULL; - - if (isGlobalMem) { - workspace = globalMem; - } else { - // Declare a shared memory - extern __shared__ int sharedMem[]; - workspace = sharedMem; - } - - int input, outputSum; - - // input to reduction, for each thread, is its' rank in the group - input = threadBlockCGTy.thread_rank(); - - outputSum = reduction_kernel(threadBlockCGTy, workspace, input); - - if (threadBlockCGTy.thread_rank() == 0) { - printf(" Sum of all ranks 0..%d in threadBlockCooperativeGroup is %d\n\n", - (int)threadBlockCGTy.size() - 1, outputSum); - printf(" Creating %d groups, of tile size %d threads:\n\n", - (int)threadBlockCGTy.size() / tileSz, tileSz); - } - - threadBlockCGTy.sync(); - - thread_group tiledPartition = tiled_partition(threadBlockCGTy, tileSz); - - // This offset allows each group to have its own unique area in the workspace array - int workspaceOffset = threadBlockCGTy.thread_rank() - tiledPartition.thread_rank(); - - outputSum = reduction_kernel(tiledPartition, workspace + workspaceOffset, input); - - if (tiledPartition.thread_rank() == 0) { - printf( - " Sum of all ranks 0..%d in this tiledPartition group is %d. 
Corresponding parent thread " - " %d\n", tiledPartition.size() - 1, outputSum, input); - result[input / (tileSz)] = outputSum; - } - return; -} - -// Search if the sum exists in the expected results array -void verifyResults(int* hPtr, int* dPtr, int size) { - int i = 0, j = 0; - for (i = 0; i < size; i++) { - for (j = 0; j < size; j++) { - if (hPtr[i] == dPtr[j]) { - break; - } - } - if (j == size) { - REQUIRE(" Result verification failed!"); - } - } -} - - -template static void test_group_partition(bool useGlobalMem) { - hipError_t err; - int blockSize = 1; - int threadsPerBlock = 64; - - int numTiles = (blockSize * threadsPerBlock) / tileSz; - - // Build an array of expected reduction sum output on the host - // based on the sum of their respective thread ranks for verification. - // eg: parent group has 64threads. - // child thread ranks: 0-15, 16-31, 32-47, 48-63 - // expected sum: 120, 376, 632, 888 - int* expectedSum = new int[numTiles]; - int temp = 0, sum = 0; - - for (int i = 1; i <= numTiles; i++) { - sum = temp; - temp = (((tileSz * i) - 1) * (tileSz * i)) / 2; - expectedSum[i-1] = temp - sum; - } - - int* dResult = NULL; - HIPCHECK(hipMalloc((void**)&dResult, numTiles * sizeof(int))); - - int* globalMem = NULL; - if (useGlobalMem) { - HIPCHECK(hipMalloc((void**)&globalMem, threadsPerBlock * sizeof(int))); - } - - int* hResult = NULL; - HIPCHECK(hipHostMalloc(&hResult, numTiles * sizeof(int), hipHostMallocDefault)); - memset(hResult, 0, numTiles * sizeof(int)); - - if (useGlobalMem) { - // Launch Kernel - hipLaunchKernelGGL(kernel_cg_group_partition_static, blockSize, threadsPerBlock, 0, 0, - dResult, useGlobalMem, globalMem); - err = hipDeviceSynchronize(); - if (err != hipSuccess) { - fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err)); - } - } else { - // Launch Kernel - hipLaunchKernelGGL(kernel_cg_group_partition_static, blockSize, threadsPerBlock, - threadsPerBlock * sizeof(int), 0, dResult, useGlobalMem, 
globalMem); - err = hipDeviceSynchronize(); - if (err != hipSuccess) { - fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err)); - } - } - - HIPCHECK(hipMemcpy(hResult, dResult, numTiles * sizeof(int), hipMemcpyDeviceToHost)); - - verifyResults(expectedSum, hResult, numTiles); - - // Free all allocated memory on host and device - HIPCHECK(hipFree(dResult)); - HIPCHECK(hipFree(hResult)); - if (useGlobalMem) { - HIPCHECK(hipFree(globalMem)); - } - delete[] expectedSum; - - printf("\n...PASSED.\n\n"); -} - -static void test_group_partition(unsigned int tileSz, bool useGlobalMem) { - hipError_t err; - int blockSize = 1; - int threadsPerBlock = 64; - - int numTiles = (blockSize * threadsPerBlock) / tileSz; - // Build an array of expected reduction sum output on the host - // based on the sum of their respective thread ranks to use for verification - int* expectedSum = new int[numTiles]; - int temp = 0, sum = 0; - for (int i = 1; i <= numTiles; i++) { - sum = temp; - temp = (((tileSz * i) - 1) * (tileSz * i)) / 2; - expectedSum[i-1] = temp - sum; - } - - int* dResult = NULL; - HIPCHECK(hipMalloc(&dResult, sizeof(int) * numTiles)); - - int* globalMem = NULL; - if (useGlobalMem) { - HIPCHECK(hipMalloc((void**)&globalMem, threadsPerBlock * sizeof(int))); - } - - int* hResult = NULL; - HIPCHECK(hipHostMalloc(&hResult, numTiles * sizeof(int), hipHostMallocDefault)); - memset(hResult, 0, numTiles * sizeof(int)); - - // Launch Kernel - if (useGlobalMem) { - hipLaunchKernelGGL(kernel_cg_group_partition_dynamic, blockSize, threadsPerBlock, 0, 0, tileSz, - dResult, useGlobalMem, globalMem); - - err = hipDeviceSynchronize(); - if (err != hipSuccess) { - fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err)); - } - } else { - hipLaunchKernelGGL(kernel_cg_group_partition_dynamic, blockSize, threadsPerBlock, - threadsPerBlock * sizeof(int), 0, tileSz, dResult, useGlobalMem, globalMem); - - err = hipDeviceSynchronize(); - if 
(err != hipSuccess) { - fprintf(stderr, "Failed to launch kernel (error code %s)!\n", hipGetErrorString(err)); - } - } - - HIPCHECK(hipMemcpy(hResult, dResult, numTiles * sizeof(int), hipMemcpyDeviceToHost)); - - verifyResults(expectedSum, hResult, numTiles); - - // Free all allocated memory on host and device - HIPCHECK(hipFree(dResult)); - HIPCHECK(hipFree(hResult)); - if (useGlobalMem) { - HIPCHECK(hipFree(globalMem)); - } - delete[] expectedSum; - - printf("\n...PASSED.\n\n"); -} - -TEST_CASE("Unit_tiled_partition") { - // Use default device for validating the test - int deviceId; - HIP_CHECK_ERROR(hipGetDevice(&deviceId), hipSuccess); - hipDeviceProp_t deviceProperties; - HIP_CHECK_ERROR(hipGetDeviceProperties(&deviceProperties, deviceId), hipSuccess); - - if (!deviceProperties.cooperativeLaunch) { - HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!"); - } - - bool useGlobalMem = true; - std::cout << "Testing static tiled_partition for different tile sizes" << std::endl; - std::cout << "\nUsing global memory for computation\n"; - /* Test static tile_partition */ - std::cout << "TEST 1:" << '\n' << std::endl; - test_group_partition<2>(useGlobalMem); - std::cout << "TEST 2:" << '\n' << std::endl; - test_group_partition<4>(useGlobalMem); - std::cout << "TEST 3:" << '\n' << std::endl; - test_group_partition<8>(useGlobalMem); - std::cout << "TEST 4:" << '\n' << std::endl; - test_group_partition<16>(useGlobalMem); - std::cout << "TEST 5:" << '\n' << std::endl; - test_group_partition<32>(useGlobalMem); - - useGlobalMem = false; - std::cout << "Testing static tiled_partition for different tile sizes" << std::endl; - std::cout << "\nUsing shared memory for computation\n"; - /* Test static tile_partition */ - std::cout << "TEST 1:" << '\n' << std::endl; - test_group_partition<2>(useGlobalMem); - std::cout << "TEST 2:" << '\n' << std::endl; - test_group_partition<4>(useGlobalMem); - std::cout << "TEST 3:" << '\n' << std::endl; - 
test_group_partition<8>(useGlobalMem); - std::cout << "TEST 4:" << '\n' << std::endl; - test_group_partition<16>(useGlobalMem); - std::cout << "TEST 5:" << '\n' << std::endl; - test_group_partition<32>(useGlobalMem); - - - std::cout << "Now testing dynamic tiled_partition for different tile sizes" << '\n' << std::endl; - - /* Test dynamic group partition*/ - useGlobalMem = true; - int testNo = 1; - std::vector tileSizes = {2, 4, 8, 16, 32}; - std::cout << "\nUsing global memory for computation\n"; - for (auto i : tileSizes) { - std::cout << "TEST " << testNo << ":" << '\n' << std::endl; - test_group_partition(i, useGlobalMem); - testNo++; - } - - useGlobalMem = false; - testNo = 1; - std::cout << "\nUsing shared memory for computation\n"; - for (auto i : tileSizes) { - std::cout << "TEST " << testNo << ":" << '\n' << std::endl; - test_group_partition(i, useGlobalMem); - testNo++; - } - printf("\n...PASSED.\n\n"); - return; -} diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/hipCGTiledPartitionType_old.cc b/projects/hip-tests/catch/unit/cooperativeGrps/hipCGTiledPartitionType_old.cc new file mode 100644 index 0000000000..af43709ac4 --- /dev/null +++ b/projects/hip-tests/catch/unit/cooperativeGrps/hipCGTiledPartitionType_old.cc @@ -0,0 +1,279 @@ +/* +Copyright (c) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +// Test Description: +/* This test implements sum reduction kernel, first with each thread's own rank + as input and comparing the sum with expected sum output derived from n(n-1)/2 + formula. The second part, partitions this parent group into child subgroups + a.k.a tiles using tiled_partition() collective operation. This can be called + with a static tile size, passed in templated non-type variable-tiled_partition, + or in runtime as tiled_partition(thread_group parent, tileSz). This test covers both these + cases. + This test tests functionality of cg group partitioning, (static and dynamic) and its respective + APIs size(), thread_rank(), and sync(). +*/ + +#include +#include +#include + +#include "hip_cg_common.hh" + +namespace cg = cooperative_groups; + +/* Parallel reduce kernel. + * + * Step complexity: O(log n) + * Work complexity: O(n) + * + * Note: This kernel works only with power of 2 input arrays. + */ +__device__ int reduction_kernel(cg::thread_group g, int* x, int val) { + int lane = g.thread_rank(); + + for (int i = g.size() / 2; i > 0; i /= 2) { + // use lds to store the temporary result + x[lane] = val; + // Ensure all the stores are completed. + g.sync(); + + if (lane < i) { + val += x[lane + i]; + } + // It must work on one tiled thread group at a time, + // and it must make sure all memory operations are + // completed before moving to the next stride. + // sync() here just does that.
+ g.sync(); + } + + // Choose the 0'th indexed thread that holds the reduction value to return + if (g.thread_rank() == 0) { + return val; + } + // Rest of the threads return no useful values + else { + return -1; + } +} + +template +__global__ void kernel_cg_group_partition_static(int* result, bool is_global_mem, int* global_mem) { + cg::thread_block thread_block_CG_ty = cg::this_thread_block(); + + int* workspace = NULL; + + if (is_global_mem) { + workspace = global_mem; + } else { + // Declare a shared memory + extern __shared__ int shared_mem[]; + workspace = shared_mem; + } + + int input, output_sum, expected_output; + + // input to reduction, for each thread, is its' rank in the group + input = thread_block_CG_ty.thread_rank(); + + expected_output = (thread_block_CG_ty.size() - 1) * thread_block_CG_ty.size() / 2; + + output_sum = reduction_kernel(thread_block_CG_ty, workspace, input); + + if (thread_block_CG_ty.thread_rank() == 0) { + printf(" Sum of all ranks 0..%d in threadBlockCooperativeGroup is %d (expected %d)\n\n", + (int)thread_block_CG_ty.size() - 1, output_sum, expected_output); + printf(" Creating %d groups, of tile size %d threads:\n\n", + (int)thread_block_CG_ty.size() / tile_size, tile_size); + } + + thread_block_CG_ty.sync(); + + cg::thread_block_tile tiled_part = cg::tiled_partition(thread_block_CG_ty); + + // This offset allows each group to have its own unique area in the workspace array + int workspace_offset = thread_block_CG_ty.thread_rank() - tiled_part.thread_rank(); + + output_sum = reduction_kernel(tiled_part, workspace + workspace_offset, input); + + if (tiled_part.thread_rank() == 0) { + printf( + " Sum of all ranks 0..%d in this tiledPartition group is %d. 
Corresponding parent thread " + "rank: via meta_group_rank : %d and the total number of groups created when partitioned : " + "%d\n", + tiled_part.size() - 1, output_sum, tiled_part.meta_group_rank(), + tiled_part.meta_group_size()); + result[input / (tile_size)] = output_sum; + } + return; +} + +__global__ void kernel_cg_group_partition_dynamic(unsigned int tile_size, int* result, + bool is_global_mem, int* global_mem) { + cg::thread_block thread_block_CG_ty = cg::this_thread_block(); + + int* workspace = NULL; + + if (is_global_mem) { + workspace = global_mem; + } else { + // Declare a shared memory + extern __shared__ int shared_mem[]; + workspace = shared_mem; + } + + int input, output_sum; + + // input to reduction, for each thread, is its' rank in the group + input = thread_block_CG_ty.thread_rank(); + + output_sum = reduction_kernel(thread_block_CG_ty, workspace, input); + + if (thread_block_CG_ty.thread_rank() == 0) { + printf("\n\n\n Sum of all ranks 0..%d in threadBlockCooperativeGroup is %d\n\n", + (int)thread_block_CG_ty.size() - 1, output_sum); + printf(" Creating %d groups, of tile size %d threads:\n\n", + (int)thread_block_CG_ty.size() / tile_size, tile_size); + } + + thread_block_CG_ty.sync(); + + cg::thread_group tiled_part = cg::tiled_partition(thread_block_CG_ty, tile_size); + + // This offset allows each group to have its own unique area in the workspace array + int workspace_offset = thread_block_CG_ty.thread_rank() - tiled_part.thread_rank(); + + output_sum = reduction_kernel(tiled_part, workspace + workspace_offset, input); + + if (tiled_part.thread_rank() == 0) { + printf( + " Sum of all ranks 0..%d in this tiledPartition group is %d. 
Corresponding parent thread " + "rank: %d\n", + static_cast(tiled_part.size()) - 1, output_sum, input); + result[input / (tile_size)] = output_sum; + } + return; +} + +template +static void common_group_partition(F kernel_func, unsigned int tile_size, void** params, + size_t num_params, bool use_global_mem) { + int block_size = 1; + int threads_per_blk = 64; + + int num_tiles = (block_size * threads_per_blk) / tile_size; + + // Build an array of expected reduction sum output on the host + // based on the sum of their respective thread ranks for verification. + // eg: parent group has 64threads. + // child thread ranks: 0-15, 16-31, 32-47, 48-63 + // expected sum: 120, 376, 632, 888 + int* expected_sum = new int[num_tiles]; + int temp = 0, sum = 0; + + for (int i = 1; i <= num_tiles; i++) { + sum = temp; + temp = (((tile_size * i) - 1) * (tile_size * i)) / 2; + expected_sum[i - 1] = temp - sum; + } + + int* result_dev = NULL; + HIP_CHECK(hipMalloc((void**)&result_dev, num_tiles * sizeof(int))); + + int* global_mem = NULL; + if (use_global_mem) { + HIP_CHECK(hipMalloc((void**)&global_mem, threads_per_blk * sizeof(int))); + } + + int* result_host = NULL; + HIP_CHECK(hipHostMalloc(&result_host, num_tiles * sizeof(int), hipHostMallocDefault)); + memset(result_host, 0, num_tiles * sizeof(int)); + + params[num_params + 0] = &result_dev; + params[num_params + 1] = &use_global_mem; + params[num_params + 2] = &global_mem; + + if (use_global_mem) { + // Launch Kernel + HIP_CHECK(hipLaunchCooperativeKernel(kernel_func, block_size, threads_per_blk, params, 0, 0)); + HIP_CHECK(hipDeviceSynchronize()); + } else { + // Launch Kernel + HIP_CHECK(hipLaunchCooperativeKernel(kernel_func, block_size, threads_per_blk, params, + threads_per_blk * sizeof(int), 0)); + HIP_CHECK(hipDeviceSynchronize()); + } + + HIP_CHECK(hipMemcpy(result_host, result_dev, num_tiles * sizeof(int), hipMemcpyDeviceToHost)); + + verifyResults(expected_sum, result_host, num_tiles); + + // Free all allocated 
memory on host and device + HIP_CHECK(hipFree(result_dev)); + HIP_CHECK(hipHostFree(result_host)); + if (use_global_mem) { + HIP_CHECK(hipFree(global_mem)); + } + delete[] expected_sum; +} + +template static void test_group_partition(bool use_global_mem) { + void* params[3]; + size_t num_params = 0; + common_group_partition(kernel_cg_group_partition_static, tile_size, params, num_params, + use_global_mem); +} + +static void test_group_partition(unsigned int tile_size, bool use_global_mem) { + void* params[4]; + params[0] = &tile_size; + size_t num_params = 1; + common_group_partition(kernel_cg_group_partition_dynamic, tile_size, params, num_params, + use_global_mem); +} + +TEST_CASE("Unit_hipCGThreadBlockTileType") { + // Use default device for validating the test + int device; + hipDeviceProp_t device_properties; + HIP_CHECK(hipGetDevice(&device)); + HIP_CHECK(hipGetDeviceProperties(&device_properties, device)); + + if (!device_properties.cooperativeLaunch) { + HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!"); + return; + } + + bool use_global_mem = GENERATE(true, false); + + SECTION("Static tile partition") { + test_group_partition<2>(use_global_mem); + test_group_partition<4>(use_global_mem); + test_group_partition<8>(use_global_mem); + test_group_partition<16>(use_global_mem); + test_group_partition<32>(use_global_mem); + } + + SECTION("Dynamic tile partition") { + unsigned int tile_size = GENERATE(2, 4, 8, 16, 32); + test_group_partition(tile_size, use_global_mem); + } +} diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/hipLaunchCooperativeKernelMultiDevice_old.cc b/projects/hip-tests/catch/unit/cooperativeGrps/hipLaunchCooperativeKernelMultiDevice_old.cc new file mode 100644 index 0000000000..5cde00aa34 --- /dev/null +++ b/projects/hip-tests/catch/unit/cooperativeGrps/hipLaunchCooperativeKernelMultiDevice_old.cc @@ -0,0 +1,606 @@ +/* +Copyright (c) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
+Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +// Test Description: +/*The general idea of the application is to test how multi-GPU Cooperative +Groups kernel launches to a stream interact with other things that may be +simultaneously running in the same streams. + +The HIP specification says that a multi-GPU cooperative launch will wait +until all of the streams it's using finish their work. Only then will the +cooperative kernel be launched to all of the devices. Then no other work +can take place in any of the streams until all of the multi-GPU +cooperative work is done. + +However, there are flags that allow you to disable each of these +serialization points: hipCooperativeLaunchMultiDeviceNoPreSync and +hipCooperativeLaunchMultiDeviceNoPostSync. + +As such, this benchmark tests the following five situations launching +to two GPUs (and thus two streams): + + 1.
Normal multi-GPU cooperative kernel: + This should result in the following pattern: + Stream 0: Cooperative + Stream 1: Cooperative + 2. Regular kernel launches and multi-GPU cooperative kernel launches + with the default flags, resulting in the following pattern: + Stream 0: Regular --> Cooperative + Stream 1: --> Cooperative --> Regular + + 3. Regular kernel launches and multi-GPU cooperative kernel launches + that turn off "pre-sync". This should allow a cooperative kernel + to launch even if work is already in a stream pointing to + another GPU. + This should result in the following pattern: + Stream 0: Regular --> Cooperative + Stream 1: Cooperative --> Regular + + 4. Regular kernel launches and multi-GPU cooperative kernel launches + that turn off "post-sync". This should allow a new kernel to enter + a GPU even if another GPU still has a cooperative kernel on it. + This should result in the following pattern: + Stream 0: Regular --> Cooperative + Stream 1: --> Cooperative--> Regular + + 5. Regular kernel launches and multi-GPU cooperative kernel launches + that turn off both pre- and post-sync. This should allow any of + the kernels to launch to their GPU regardless of the status of + other kernels in other multi-GPU stream groups. + This should result in the following pattern: + Stream 0: Regular --> Cooperative + Stream 1: Cooperative --> Regular + +We time how long it takes to run each of these benchmarks and print it as +the output of the benchmark. The kernels themselves are just useless time- +wasting code so that the kernel takes a meaningful amount of time on the +GPU before it exits. We only launch a single wavefront for each kernel, so +any serialization should not be because of GPU occupancy concerns. + +If tests 2, 3, and 4 take roughly 3x as long as #1, that implies that +cooperative kernels are serialized as expected. + +If test #5 takes roughly twice as long as #1, that implies that the +overlap-allowing flags work as expected. 
+*/ + +#include +#include + +namespace cg = cooperative_groups; + +static constexpr size_t kBufferLen = 1024 * 1024; + +__global__ void test_gws(uint* buf, uint buf_size, long* tmp_buf, long* result) { + extern __shared__ long tmp[]; + uint groups = gridDim.x; + uint group_id = blockIdx.x; + uint local_id = threadIdx.x; + uint chunk = gridDim.x * blockDim.x; + + uint i = group_id * blockDim.x + local_id; + long sum = 0; + while (i < buf_size) { + sum += buf[i]; + i += chunk; + } + tmp[local_id] = sum; + __syncthreads(); + i = 0; + if (local_id == 0) { + sum = 0; + while (i < blockDim.x) { + sum += tmp[i]; + i++; + } + tmp_buf[group_id] = sum; + } + // wait + cg::this_grid().sync(); + + if (((blockIdx.x * blockDim.x) + threadIdx.x) == 0) { + for (uint i = 1; i < groups; ++i) { + sum += tmp_buf[i]; + } + //*result = sum; + result[1 + cg::this_multi_grid().grid_rank()] = sum; + } + cg::this_multi_grid().sync(); + if (cg::this_multi_grid().grid_rank() == 0) { + sum = 0; + for (uint i = 1; i <= cg::this_multi_grid().num_grids(); ++i) { + sum += result[i]; + } + *result = sum; + } +} + +__global__ void test_coop_kernel(unsigned int loops, long long* array, int fast_gpu) { + cg::multi_grid_group mgrid = cg::this_multi_grid(); + unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x; + + if (mgrid.grid_rank() == fast_gpu) { + return; + } + + for (int i = 0; i < loops; i++) { + long long time_diff = 0; + long long last_clock = clock64(); + do { + long long cur_clock = clock64(); + if (cur_clock > last_clock) { + time_diff += (cur_clock - last_clock); + } + // If it rolls over, we don't know how much to add to catch up. + // So just ignore those slipped cycles. 
+ last_clock = cur_clock; + } while (time_diff < 1000000); + array[rank] += clock64(); + } +} + +__global__ void test_coop_kernel_gfx11(unsigned int loops, long long* array, int fast_gpu) { +#if HT_AMD + cg::multi_grid_group mgrid = cg::this_multi_grid(); + unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x; + + if (mgrid.grid_rank() == fast_gpu) { + return; + } + + for (int i = 0; i < loops; i++) { + long long time_diff = 0; + long long last_clock = wall_clock64(); + do { + long long cur_clock = wall_clock64(); + if (cur_clock > last_clock) { + time_diff += (cur_clock - last_clock); + } + // If it rolls over, we don't know how much to add to catch up. + // So just ignore those slipped cycles. + last_clock = cur_clock; + } while (time_diff < 1000000); + array[rank] += wall_clock64(); + } +#endif +} + +__global__ void test_kernel(uint32_t loops, unsigned long long* array) { + unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x; + + for (int i = 0; i < loops; i++) { + long long time_diff = 0; + long long last_clock = clock64(); + do { + long long cur_clock = clock64(); + if (cur_clock > last_clock) { + time_diff += (cur_clock - last_clock); + } + // If it rolls over, we don't know how much to add to catch up. + // So just ignore those slipped cycles. + last_clock = cur_clock; + } while (time_diff < 1000000); + array[rank] += clock64(); + } +} + +__global__ void test_kernel_gfx11(uint32_t loops, unsigned long long* array) { +#if HT_AMD + unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x; + + for (int i = 0; i < loops; i++) { + long long time_diff = 0; + long long last_clock = wall_clock64(); + do { + long long cur_clock = wall_clock64(); + if (cur_clock > last_clock) { + time_diff += (cur_clock - last_clock); + } + // If it rolls over, we don't know how much to add to catch up. + // So just ignore those slipped cycles. 
+ last_clock = cur_clock; + } while (time_diff < 1000000); + array[rank] += wall_clock64(); + } +#endif +} + +static void verify_time(double single_kernel_time, double multi_kernel_time, float low_bound, + float high_bound) { + // Test that multiple kernel times are inside expected boundaries + REQUIRE(multi_kernel_time >= low_bound * single_kernel_time); + REQUIRE(multi_kernel_time <= high_bound * single_kernel_time); +} + +void test_multigrid_streams(int device_num) { + uint32_t loops = 2000; + int32_t fast_gpu = -1; + + // We will launch enough waves to fill up all of the GPU + int warp_sizes[2]; + int num_sms[2]; + hipDeviceProp_t device_properties[2]; + int warp_size = INT_MAX; + int num_sm = INT_MAX; + for (int dev = 0; dev < (device_num - 1); ++dev) { + for (int i = 0; i < 2; i++) { + HIP_CHECK(hipGetDeviceProperties(&device_properties[i], (dev + i))); + warp_sizes[i] = device_properties[i].warpSize; + if (warp_sizes[i] < warp_size) { + warp_size = warp_sizes[i]; + } + num_sms[i] = device_properties[i].multiProcessorCount; + if (num_sms[i] < num_sm) { + num_sm = num_sms[i]; + } + } + + // Calculate the device occupancy to know how many blocks can be run. + int max_blocks_per_sm_arr[2]; + int max_blocks_per_sm = INT_MAX; + for (int i = 0; i < 2; i++) { + HIP_CHECK(hipSetDevice(dev + i)); + auto test_kernel_used = IsGfx11() ? 
test_kernel_gfx11 : test_kernel; + HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm_arr[i], + test_kernel_used, warp_size, 0)); + if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) { + max_blocks_per_sm = max_blocks_per_sm_arr[i]; + } + } + int desired_blocks = 1; + + if (desired_blocks > max_blocks_per_sm * num_sm) { + INFO("The requested number of blocks will not fit on the GPU"); + REQUIRE(desired_blocks < max_blocks_per_sm * num_sm); + return; + } + + // Create the streams we will use in this test + hipStream_t streams[2]; + for (int i = 0; i < 2; i++) { + HIP_CHECK(hipSetDevice(dev + i)); + HIP_CHECK(hipStreamCreate(&streams[i])); + } + + // Set up data to pass into the kernel + // Alocate the host input buffer, and two device-focused buffers that we + // will use for our test. + unsigned long long* dev_array[2]; + for (int i = 0; i < 2; i++) { + int good_size = desired_blocks * warp_size * sizeof(long long); + HIP_CHECK(hipSetDevice(dev + i)); + HIP_CHECK(hipMalloc(reinterpret_cast(&dev_array[i]), good_size)); + HIP_CHECK(hipMemsetAsync(dev_array[i], 0, good_size, streams[i])); + } + for (int i = 0; i < 2; i++) { + HIP_CHECK(hipSetDevice(dev + i)); + HIP_CHECK(hipDeviceSynchronize()); + } + + /* Launch the kernels ****************************************************/ + void* dev_params[2][3]; + hipLaunchParams md_params[2]; + std::chrono::time_point start_time[2]; + std::chrono::time_point end_time[2]; + + // Test 0: Launching a multi-GPU cooperative kernel + // Both GPUs launch a long cooperative kernel + INFO("GPU " << dev << ": Long Coop Kernel"); + INFO("GPU " << (dev + 1) << ": Long Coop Kernel"); + + auto test_coop_kernel_used = IsGfx11() ? 
test_coop_kernel_gfx11 : test_coop_kernel; + for (int i = 0; i < 2; i++) { + dev_params[i][0] = reinterpret_cast(&loops); + dev_params[i][1] = reinterpret_cast(&dev_array[i]); + dev_params[i][2] = reinterpret_cast(&fast_gpu); + md_params[i].func = reinterpret_cast(test_coop_kernel_used); + md_params[i].gridDim = desired_blocks; + md_params[i].blockDim = warp_size; + md_params[i].sharedMem = 0; + md_params[i].stream = streams[i]; + md_params[i].args = dev_params[i]; + } + + start_time[0] = std::chrono::system_clock::now(); + HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0)); + for (int i = 0; i < 2; i++) { + HIP_CHECK(hipSetDevice(dev + i)); + HIP_CHECK(hipDeviceSynchronize()); + } + end_time[0] = std::chrono::system_clock::now(); + + std::chrono::duration single_kernel_time = (end_time[0] - start_time[0]); + INFO("A single kernel on both GPUs took: " << single_kernel_time.count() << " seconds"); + + SECTION("GPU1 - Standard/ Long Coop, GPU2 - Coop/Standard") { + INFO("GPU " << dev << ": Standard/Long Coop"); + INFO("GPU " << (dev + 1) << ": Coop/Standard"); + fast_gpu = 1; + start_time[1] = std::chrono::system_clock::now(); + HIP_CHECK(hipSetDevice(dev)); + auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; + hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0], + loops, dev_array[0]); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0)); + HIP_CHECK(hipSetDevice(dev + 1)); + test_kernel_used = IsGfx11() ? 
test_kernel_gfx11 : test_kernel; + hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1], + loops, dev_array[1]); + HIP_CHECK(hipGetLastError()); + for (int i = 0; i < 2; i++) { + HIP_CHECK(hipSetDevice(dev + i)); + HIP_CHECK(hipDeviceSynchronize()); + } + end_time[1] = std::chrono::system_clock::now(); + std::chrono::duration serialized_gpu0_time = (end_time[1] - start_time[1]); + INFO("Serialized set of three kernels with GPU0 being long took: " + << serialized_gpu0_time.count() << " seconds"); + + verify_time(single_kernel_time.count(), serialized_gpu0_time.count(), 2.7f, 3.3f); + } + + SECTION("GPU1 - Standard/Coop, GPU2 - Long Coop/Standard") { + INFO("GPU " << dev << ": Standard/Coop"); + INFO("GPU " << (dev + 1) << ": Long Coop/Standard"); + fast_gpu = 0; + start_time[1] = std::chrono::system_clock::now(); + HIP_CHECK(hipSetDevice(dev)); + auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; + hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0], + loops, dev_array[0]); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, 0)); + HIP_CHECK(hipSetDevice(dev + 1)); + test_kernel_used = IsGfx11() ? 
test_kernel_gfx11 : test_kernel; + hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1], + loops, dev_array[1]); + HIP_CHECK(hipGetLastError()); + for (int i = 0; i < 2; i++) { + HIP_CHECK(hipSetDevice(dev + i)); + HIP_CHECK(hipDeviceSynchronize()); + } + end_time[1] = std::chrono::system_clock::now(); + std::chrono::duration serialized_gpu1_time = (end_time[1] - start_time[1]); + INFO("Serialized set of three kernels with GPU1 being long took: " + << serialized_gpu1_time.count() << " seconds"); + + verify_time(single_kernel_time.count(), serialized_gpu1_time.count(), 2.7f, 3.3f); + } + + SECTION( + "GPU1 - Standard/Coop, GPU2 - Long Coop/Standard - regular and coop kernel overlap at " + "beginning") { + INFO("GPU " << dev << ": Standard/Coop with multi device no pre sync"); + INFO("GPU " << (dev + 1) << ": Long Coop/Standard with multi device no pre sync"); + fast_gpu = 0; + start_time[1] = std::chrono::system_clock::now(); + HIP_CHECK(hipSetDevice(dev)); + auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; + hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0], + loops, dev_array[0]); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, + hipCooperativeLaunchMultiDeviceNoPreSync)); + HIP_CHECK(hipSetDevice(dev + 1)); + test_kernel_used = IsGfx11() ? 
test_kernel_gfx11 : test_kernel; + hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1], + loops, dev_array[1]); + HIP_CHECK(hipGetLastError()); + for (int i = 0; i < 2; i++) { + HIP_CHECK(hipSetDevice(dev + i)); + HIP_CHECK(hipDeviceSynchronize()); + } + end_time[1] = std::chrono::system_clock::now(); + std::chrono::duration pre_overlapped_time = (end_time[1] - start_time[1]); + INFO("Multiple kernels with pre-overlap allowed took: " << pre_overlapped_time.count() + << " seconds"); + + verify_time(single_kernel_time.count(), pre_overlapped_time.count(), 1.7f, 2.3f); + } + + SECTION( + "GPU1 - Standard/Long Coop, GPU2 - Coop/Standard - regular and coop kernel overlap at " + "end") { + INFO("GPU " << dev << ": Standard/Long Coop with multi device no post sync"); + INFO("GPU " << (dev + 1) << ": Coop/Standard with multi device no post sync"); + fast_gpu = 1; + start_time[1] = std::chrono::system_clock::now(); + HIP_CHECK(hipSetDevice(dev)); + auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; + hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0], + loops, dev_array[0]); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(md_params, 2, + hipCooperativeLaunchMultiDeviceNoPostSync)); + HIP_CHECK(hipSetDevice(dev + 1)); + test_kernel_used = IsGfx11() ? 
test_kernel_gfx11 : test_kernel; + hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1], + loops, dev_array[1]); + for (int i = 0; i < 2; i++) { + HIP_CHECK(hipSetDevice(dev + i)); + HIP_CHECK(hipDeviceSynchronize()); + } + end_time[1] = std::chrono::system_clock::now(); + std::chrono::duration post_overlapped_time = (end_time[1] - start_time[1]); + INFO("Multiple kernels with post-overlap allowed took: " << post_overlapped_time.count() + << " seconds"); + + verify_time(single_kernel_time.count(), post_overlapped_time.count(), 1.7f, 2.3f); + } + + SECTION( + "GPU1 - Standard/Long Coop, GPU2 - Long Coop/Standard - regular and coop kernel overlap") { + INFO("GPU " << dev << ": Standard/Long Coop with multi device no pre or post sync"); + INFO("GPU " << (dev + 1) << ": Long Coop/Standard with multi device no pre or post sync"); + start_time[1] = std::chrono::system_clock::now(); + HIP_CHECK(hipSetDevice(dev)); + auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; + hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[0], + loops, dev_array[0]); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipLaunchCooperativeKernelMultiDevice( + md_params, 2, + hipCooperativeLaunchMultiDeviceNoPreSync | hipCooperativeLaunchMultiDeviceNoPostSync)); + HIP_CHECK(hipSetDevice(dev + 1)); + test_kernel_used = IsGfx11() ? 
test_kernel_gfx11 : test_kernel; + hipLaunchKernelGGL(test_kernel_used, dim3(desired_blocks), dim3(warp_size), 0, streams[1], + loops, dev_array[1]); + HIP_CHECK(hipGetLastError()); + for (int i = 0; i < 2; i++) { + HIP_CHECK(hipSetDevice(dev + i)); + HIP_CHECK(hipDeviceSynchronize()); + } + end_time[1] = std::chrono::system_clock::now(); + std::chrono::duration overlapped_time = (end_time[1] - start_time[1]); + INFO("Multiple kernels with overlap allowed took: " << overlapped_time.count() << " seconds"); + + verify_time(single_kernel_time.count(), overlapped_time.count(), 1.8f, 2.2f); + } + + for (int k = 0; k < 2; ++k) { + HIP_CHECK(hipFree(dev_array[k])); + HIP_CHECK(hipStreamDestroy(streams[k])); + } + } +} + +TEST_CASE("Unit_hipLaunchCooperativeKernelMultiDevice_Basic") { + constexpr uint num_kernel_args = 4; + + int device_num = 0; + HIP_CHECK(hipGetDeviceCount(&device_num)); + + size_t buffer_size = kBufferLen * sizeof(int); + + int* A_h = reinterpret_cast(malloc(buffer_size * device_num)); + for (uint32_t i = 0; i < kBufferLen * device_num; ++i) { + A_h[i] = static_cast(i); + } + + std::vector A_d(device_num); + std::vector B_d(device_num); + long* C_d; + std::vector stream(device_num); + + std::vector device_properties(device_num); + + for (int i = 0; i < device_num; i++) { + HIP_CHECK(hipSetDevice(i)); + + // Calculate the device occupancy to know how many blocks can be run concurrently + HIP_CHECK(hipGetDeviceProperties(&device_properties[i], 0)); + if (!device_properties[i].cooperativeMultiDeviceLaunch) { + HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!"); + return; + } + + HIP_CHECK(hipMalloc(&A_d[i], buffer_size)); + HIP_CHECK(hipMemcpy(A_d[i], &A_h[i * kBufferLen], buffer_size, hipMemcpyHostToDevice)); + if (i == 0) { + HIP_CHECK(hipHostMalloc(&C_d, (device_num + 1) * sizeof(long))); + } + + HIP_CHECK(hipStreamCreate(&stream[i])); + HIP_CHECK(hipDeviceSynchronize()); + } + + dim3 dimBlock; + dim3 dimGrid; + dimGrid.x = 1; + 
dimGrid.y = 1; + dimGrid.z = 1; + dimBlock.x = 64; + dimBlock.y = 1; + dimBlock.z = 1; + + int num_blocks = 0; + uint workgroup = GENERATE(64, 128, 256); + + hipLaunchParams* launch_params_list = new hipLaunchParams[device_num]; + std::vector args(device_num * num_kernel_args); + + for (int i = 0; i < device_num; i++) { + HIP_CHECK(hipSetDevice(i)); + + dimBlock.x = workgroup; + HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor( + &num_blocks, test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long))); + + INFO("GPU" << i << " has block size = " << dimBlock.x << " and num blocks per CU " << num_blocks + << "\n"); + + dimGrid.x = device_properties[i].multiProcessorCount * std::min(num_blocks, 32); + + HIP_CHECK(hipMalloc(&B_d[i], dimGrid.x * sizeof(long))); + + args[i * num_kernel_args] = (void*)&A_d[i]; + args[i * num_kernel_args + 1] = (void*)&kBufferLen; + args[i * num_kernel_args + 2] = (void*)&B_d[i]; + args[i * num_kernel_args + 3] = (void*)&C_d; + + launch_params_list[i].func = reinterpret_cast(test_gws); + launch_params_list[i].gridDim = dimGrid; + launch_params_list[i].blockDim = dimBlock; + launch_params_list[i].sharedMem = dimBlock.x * sizeof(long); + launch_params_list[i].stream = stream[i]; + launch_params_list[i].args = &args[i * num_kernel_args]; + } + + HIP_CHECK(hipLaunchCooperativeKernelMultiDevice(launch_params_list, device_num, 0)); + for (int i = 0; i < device_num; i++) { + HIP_CHECK(hipStreamSynchronize(stream[i])); + } + + size_t processed_Dwords = kBufferLen * device_num; + REQUIRE(*C_d == (((long)(processed_Dwords) * (processed_Dwords - 1)) / 2)); + + delete[] launch_params_list; + + HIP_CHECK(hipSetDevice(0)); + HIP_CHECK(hipHostFree(C_d)); + for (int i = 0; i < device_num; i++) { + HIP_CHECK(hipSetDevice(i)); + HIP_CHECK(hipFree(A_d[i])); + HIP_CHECK(hipFree(B_d[i])); + HIP_CHECK(hipStreamDestroy(stream[i])); + } + + free(A_h); +} + +TEST_CASE("Unit_hipLaunchCooperativeKernelMultiDevice_Streams") { + int device_num = 
0; + HIP_CHECK(hipGetDeviceCount(&device_num)); + + if (device_num < 2) { + HipTest::HIP_SKIP_TEST("Skipping because devices < 2"); + return; + } + + hipDeviceProp_t device_properties; + for (int i = 0; i < device_num; i++) { + HIP_CHECK(hipGetDeviceProperties(&device_properties, i)); + if (!device_properties.cooperativeMultiDeviceLaunch) { + HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!"); + return; + } + } + + test_multigrid_streams(device_num); +} diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/hipLaunchCooperativeKernel_old.cc b/projects/hip-tests/catch/unit/cooperativeGrps/hipLaunchCooperativeKernel_old.cc new file mode 100644 index 0000000000..3c4be35662 --- /dev/null +++ b/projects/hip-tests/catch/unit/cooperativeGrps/hipLaunchCooperativeKernel_old.cc @@ -0,0 +1,364 @@ +/* +Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include +#include + +namespace cg = cooperative_groups; + +static constexpr size_t kBufferLen = 1024 * 1024; + +__global__ void test_gws(int* buf, size_t buf_size, long* tmp_buf, long* result) { + extern __shared__ long tmp[]; + uint offset = blockIdx.x * blockDim.x + threadIdx.x; + uint stride = gridDim.x * blockDim.x; + cg::grid_group gg = cg::this_grid(); + + long sum = 0; + for (uint i = offset; i < buf_size; i += stride) { + sum += buf[i]; + } + tmp[threadIdx.x] = sum; + + __syncthreads(); + + if (threadIdx.x == 0) { + sum = 0; + for (uint i = 0; i < blockDim.x; i++) { + sum += tmp[i]; + } + tmp_buf[blockIdx.x] = sum; + } + + gg.sync(); + + if (offset == 0) { + for (uint i = 1; i < gridDim.x; ++i) { + sum += tmp_buf[i]; + } + *result = sum; + } +} + +__global__ void test_kernel(uint32_t loops, unsigned long long* array, long long totalTicks) { + cg::thread_block tb = cg::this_thread_block(); + unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x; + + for (int i = 0; i < loops; i++) { + long long time_diff = 0; + long long last_clock = clock64(); + do { + long long cur_clock = clock64(); + if (cur_clock > last_clock) { + time_diff += (cur_clock - last_clock); + } + // If it rolls over, we don't know how much to add to catch up. + // So just ignore those slipped cycles. + last_clock = cur_clock; + } while (time_diff < totalTicks); + tb.sync(); + array[rank] += clock64(); + } +} + +__global__ void test_kernel_gfx11(uint32_t loops, unsigned long long* array, long long totalTicks) { +#if HT_AMD + cg::thread_block tb = cg::this_thread_block(); + unsigned int rank = blockIdx.x * blockDim.x + threadIdx.x; + + for (int i = 0; i < loops; i++) { + long long time_diff = 0; + long long last_clock = wall_clock64(); + do { + long long cur_clock = wall_clock64(); + if (cur_clock > last_clock) { + time_diff += (cur_clock - last_clock); + } + // If it rolls over, we don't know how much to add to catch up. + // So just ignore those slipped cycles. 
+ last_clock = cur_clock; + } while (time_diff < totalTicks); + tb.sync(); + array[rank] += wall_clock64(); + } +#endif +} + +template +static void verifyLeastCapacity(T& single_kernel_time, T& double_kernel_time, + T& triple_kernel_time) { +#if HT_AMD + // hipLaunchCooperativeKernel() follows serialization policy on AMD devices + // Test that the two cooperative kernels took roughly twice as long as the one + REQUIRE(double_kernel_time.count() >= 1.8 * single_kernel_time.count()); + REQUIRE(double_kernel_time.count() <= 2.2 * single_kernel_time.count()); +#else + // hipLaunchCooperativeKernel() doesn't follow serialization policy on NV devices + // Test that the two cooperative kernels took roughly as long as the one + REQUIRE(double_kernel_time.count() >= 0.8 * single_kernel_time.count()); + REQUIRE(double_kernel_time.count() <= 1.2 * single_kernel_time.count()); +#endif + + // Test that the three kernels together took roughly as long as the two + // cooperative kernels. + REQUIRE(triple_kernel_time.count() <= 1.1 * double_kernel_time.count()); +} + +template +static void verifyHalfCapacity(T& single_kernel_time, T& double_kernel_time, + T& triple_kernel_time) { + // Test that the two cooperative kernels took roughly twice as long as the one + REQUIRE(double_kernel_time.count() >= 1.8 * single_kernel_time.count()); + REQUIRE(double_kernel_time.count() <= 2.2 * single_kernel_time.count()); + + // Test that the three kernels together took roughly as long as the two + // cooperative kernels. 
+ REQUIRE(triple_kernel_time.count() <= 1.1 * double_kernel_time.count()); +} + +template +static void verifyFullCapacity(T& single_kernel_time, T& double_kernel_time, + T& triple_kernel_time) { + // Test that the two cooperative kernels took roughly twice as long as the one + REQUIRE(double_kernel_time.count() >= 1.8 * single_kernel_time.count()); + REQUIRE(double_kernel_time.count() <= 2.2 * single_kernel_time.count()); + + // Test that the three kernels together took roughly 1.6 times as long as the two + // cooperative kernels. If the first 2 kernels run very fast, the third + // won't share much time with the second kernel. + REQUIRE(triple_kernel_time.count() <= 1.7 * double_kernel_time.count()); +} + +template +static void verify(int tests, T& single_kernel_time, T& double_kernel_time, T& triple_kernel_time) { + switch (tests) { + case 0: + verifyLeastCapacity(single_kernel_time, double_kernel_time, triple_kernel_time); + break; + case 1: + verifyHalfCapacity(single_kernel_time, double_kernel_time, triple_kernel_time); + break; + case 2: + verifyFullCapacity(single_kernel_time, double_kernel_time, triple_kernel_time); + break; + default: + break; + } +} + +static void test_cooperative_streams(int dev, int p_tests) { + hipStream_t streams[3]; + unsigned long long* dev_array[3]; + int loops = 1000; + + HIP_CHECK(hipSetDevice(dev)); + hipDeviceProp_t device_properties; + HIP_CHECK(hipGetDeviceProperties(&device_properties, dev)); + + // Test whether target device supports cooperative groups + if (device_properties.cooperativeLaunch == 0) { + std::cout << "Cooperative group support not available in device " << dev << std::endl; + return; + } + + // We will launch enough waves to fill up all of the GPU + int warp_size = device_properties.warpSize; + int num_sms = device_properties.multiProcessorCount; + long long totalTicks = device_properties.clockRate; + int max_blocks_per_sm = 0; + // Calculate the device occupancy to know how many blocks can be run. 
+ auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel; + HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, test_kernel_used, + warp_size, 0)); + int max_active_blocks = max_blocks_per_sm * num_sms; + int coop_blocks = 0; + int reg_blocks = 0; + + switch (p_tests) { + case 0: + // 1 block + coop_blocks = 1; + reg_blocks = 1; + break; + case 1: + // Half capacity + // To make sure the second kernel launched by hipLaunchCooperativeKernel + // is invoked after the first kernel finished + coop_blocks = max_active_blocks / 2 + 1; + // To make sure the third kernel launched by hipLaunchKernelGGL is invoked + // concurrently with the second kernel + reg_blocks = max_active_blocks - coop_blocks; + break; + case 2: + // Full capacity + coop_blocks = max_active_blocks; + reg_blocks = max_active_blocks; + break; + default: + break; + } + + for (int i = 0; i < 3; i++) { + HIP_CHECK(hipStreamCreate(&streams[i])); + } + + // Set up data to pass into the kernel + + for (int i = 0; i < 3; i++) { + HIP_CHECK(hipMalloc(reinterpret_cast(&dev_array[i]), warp_size * sizeof(long long))); + HIP_CHECK(hipMemsetAsync(dev_array[i], 0, warp_size * sizeof(long long), streams[i])); + } + + HIP_CHECK(hipDeviceSynchronize()); + + // Launch the kernels + void* coop_params[3][3]; + for (int i = 0; i < 3; i++) { + coop_params[i][0] = reinterpret_cast(&loops); + coop_params[i][1] = reinterpret_cast(&dev_array[i]); + coop_params[i][2] = reinterpret_cast(&totalTicks); + } + + // We need exclude the the initial launching as it will need time to load code obj. 
+ HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast(test_kernel_used), max_active_blocks, + warp_size, coop_params[0], 0, streams[0])); + HIP_CHECK(hipDeviceSynchronize()); + + // Launching a single cooperative kernel + auto single_start = std::chrono::system_clock::now(); + HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast(test_kernel_used), max_active_blocks, + warp_size, coop_params[0], 0, streams[0])); + HIP_CHECK(hipDeviceSynchronize()); + auto single_end = std::chrono::system_clock::now(); + + std::chrono::duration single_kernel_time = (single_end - single_start); + + // Launching 2 cooperative kernels to different streams + auto double_start = std::chrono::system_clock::now(); + HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast(test_kernel_used), coop_blocks, + warp_size, coop_params[0], 0, streams[0])); + HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast(test_kernel_used), coop_blocks, + warp_size, coop_params[1], 0, streams[1])); + + HIP_CHECK(hipDeviceSynchronize()); + auto double_end = std::chrono::system_clock::now(); + + // Launching 2 cooperative kernels and 1 normal kernel + std::chrono::duration double_kernel_time = (double_end - double_start); + + auto triple_start = std::chrono::system_clock::now(); + HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast(test_kernel_used), coop_blocks, + warp_size, coop_params[0], 0, streams[0])); + HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast(test_kernel_used), coop_blocks, + warp_size, coop_params[1], 0, streams[1])); + hipLaunchKernelGGL(test_kernel_used, dim3(reg_blocks), dim3(warp_size), 0, streams[2], loops, + dev_array[2], totalTicks); + + HIP_CHECK(hipDeviceSynchronize()); + auto triple_end = std::chrono::system_clock::now(); + std::chrono::duration triple_kernel_time = (triple_end - triple_start); + + for (int k = 0; k < 3; ++k) { + HIP_CHECK(hipFree(dev_array[k])); + HIP_CHECK(hipStreamDestroy(streams[k])); + } + + + INFO("A single kernel took : " << single_kernel_time.count() << 
" seconds"); + INFO("Two cooperative kernels took: " << double_kernel_time.count() << " seconds"); + INFO("Two coop kernels and a third regular kernel took: " << triple_kernel_time.count() + << " seconds"); + + verify(p_tests, single_kernel_time, double_kernel_time, triple_kernel_time); +} + +TEST_CASE("Unit_hipLaunchCooperativeKernel_Basic") { + // Use default device for validating the test + int device; + int *A_h, *A_d; + long* B_d; + long* C_d; + hipDeviceProp_t device_properties; + HIP_CHECK(hipGetDevice(&device)); + HIP_CHECK(hipGetDeviceProperties(&device_properties, device)); + + if (!device_properties.cooperativeLaunch) { + HipTest::HIP_SKIP_TEST("Device doesn't support cooperative launch!"); + return; + } + + size_t buffer_size = kBufferLen * sizeof(int); + + A_h = reinterpret_cast(malloc(buffer_size)); + for (uint32_t i = 0; i < kBufferLen; ++i) { + A_h[i] = static_cast(i); + } + + HIP_CHECK(hipMalloc(&A_d, buffer_size)); + HIP_CHECK(hipMemcpy(A_d, A_h, buffer_size, hipMemcpyHostToDevice)); + HIP_CHECK(hipHostMalloc(&C_d, sizeof(long))); + + hipStream_t stream; + HIPCHECK(hipStreamCreate(&stream)); + + dim3 dimBlock = dim3(1); + dim3 dimGrid = dim3(1); + int numBlocks = 0; + + uint32_t workgroup = GENERATE(32, 64, 128, 256); + + dimBlock.x = workgroup; + + // Calculate the device occupancy to know how many blocks can be run concurrently + HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor( + &numBlocks, test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long))); + + dimGrid.x = device_properties.multiProcessorCount * std::min(numBlocks, 32); + HIP_CHECK(hipMalloc(&B_d, dimGrid.x * sizeof(long))); + + void* params[4]; + params[0] = (void*)&A_d; + params[1] = (void*)&kBufferLen; + params[2] = (void*)&B_d; + params[3] = (void*)&C_d; + + INFO("Testing with grid size = " << dimGrid.x << " and block size = " << dimBlock.x << "\n"); + HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast(test_gws), dimGrid, dimBlock, params, + dimBlock.x * 
sizeof(long), stream)); + + HIP_CHECK(hipStreamSynchronize(stream)); + + REQUIRE(((unsigned long long)*C_d) == (((unsigned long long)(kBufferLen) * (kBufferLen - 1)) / 2)); + + HIP_CHECK(hipStreamDestroy(stream)); + HIP_CHECK(hipHostFree(C_d)); + HIP_CHECK(hipFree(B_d)); + HIP_CHECK(hipFree(A_d)); + free(A_h); +} + +TEST_CASE("Unit_hipLaunchCooperativeKernel_Streams") { + const auto device = GENERATE(range(0, HipTest::getDeviceCount())); + int p_tests = GENERATE(0, 1, 2); + + test_cooperative_streams(device, p_tests); +} diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/hip_cg_common.hh b/projects/hip-tests/catch/unit/cooperativeGrps/hip_cg_common.hh new file mode 100644 index 0000000000..a041c3d673 --- /dev/null +++ b/projects/hip-tests/catch/unit/cooperativeGrps/hip_cg_common.hh @@ -0,0 +1,68 @@ +/* +Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#pragma once + +#include +#include + +#define ASSERT_EQUAL(lhs, rhs) HIP_ASSERT(lhs == rhs) +#define ASSERT_LE(lhs, rhs) HIPASSERT(lhs <= rhs) +#define ASSERT_GE(lhs, rhs) HIPASSERT(lhs >= rhs) + +constexpr int MaxGPUs = 8; + +template +void printResults(T* ptr, int size) { + for (int i = 0; i < size; i++) { + std::cout << ptr[i] << " "; + } + std::cout << '\n'; +} + +template +void compareResults(T* cpu, T* gpu, int size) { + for (unsigned int i = 0; i < size / sizeof(T); i++) { + if (cpu[i] != gpu[i]) { + INFO("Results do not match at index " << i); + REQUIRE(cpu[i] == gpu[i]); + } + } +} + + +// Search if the sum exists in the expected results array +template +void verifyResults(T* hPtr, T* dPtr, int size) { + int i = 0, j = 0; + for (i = 0; i < size; i++) { + for (j = 0; j < size; j++) { + if (hPtr[i] == dPtr[j]) { + break; + } + } + if (j == size) { + INFO("Result verification failed!"); + REQUIRE(j != size); + } + } +} \ No newline at end of file diff --git a/projects/hip-tests/catch/unit/gl_interop/CMakeLists.txt b/projects/hip-tests/catch/unit/gl_interop/CMakeLists.txt new file mode 100644 index 0000000000..a270a5a6ab --- /dev/null +++ b/projects/hip-tests/catch/unit/gl_interop/CMakeLists.txt @@ -0,0 +1,30 @@ +set(TEST_SRC + hipGLGetDevices.cc + hipGraphicsGLRegisterBuffer.cc + hipGraphicsGLRegisterImage.cc + hipGraphicsMapResources.cc + hipGraphicsSubResourceGetMappedArray.cc + hipGraphicsResourceGetMappedPointer.cc + hipGraphicsUnmapResources.cc + hipGraphicsUnregisterResource.cc +) + +find_package(OpenGL COMPONENTS OpenGL EGL) +message(STATUS "OpenGL_FOUND: ${OpenGL_FOUND}") +if(NOT OpenGL_FOUND) + message(STATUS "OpenGL not found, OpenGL interop tests not enabled.") + return() +endif() + +find_package(GLUT) +message(STATUS "GLUT_FOUND: ${GLUT_FOUND}") +if(NOT GLUT_FOUND) + message(STATUS "GLUT not found, OpenGL interop tests not enabled.") + return() +endif() + +hip_add_exe_to_target(NAME GLInteropTest + TEST_SRC ${TEST_SRC} + TEST_TARGET_NAME 
build_tests + COMPILE_OPTIONS -std=c++17) +target_link_libraries(GLInteropTest OpenGL::GL OpenGL::EGL GLUT::GLUT) \ No newline at end of file diff --git a/projects/hip-tests/catch/unit/gl_interop/gl_interop_common.hh b/projects/hip-tests/catch/unit/gl_interop/gl_interop_common.hh new file mode 100644 index 0000000000..9cb31a848e --- /dev/null +++ b/projects/hip-tests/catch/unit/gl_interop/gl_interop_common.hh @@ -0,0 +1,219 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#pragma once + +#include + +#define GL_GLEXT_PROTOTYPES +#include +#include + +#include +#include + +#include + +class GLBufferObject { + public: + static constexpr size_t kSize = 512 * 512 * 4 * sizeof(float); + + GLBufferObject() { + glGenBuffers(1, &vbo_); + glBindBuffer(GL_ARRAY_BUFFER, vbo_); + glBufferData(GL_ARRAY_BUFFER, kSize, 0, GL_DYNAMIC_DRAW); + glBindBuffer(GL_ARRAY_BUFFER, 0); + REQUIRE(glGetError() == GL_NO_ERROR); + } + + ~GLBufferObject() { glDeleteBuffers(1, &vbo_); } + + operator GLuint() const { return vbo_; } + + private: + GLuint vbo_; +}; + +class GLImageObject { + public: + static constexpr size_t kWidth = 512, kHeight = 512; + + GLImageObject() { + glGenTextures(1, &tex_); + glBindTexture(GL_TEXTURE_2D, tex_); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8UI_EXT, kWidth, kHeight, 0, GL_RGBA_INTEGER_EXT, + GL_UNSIGNED_BYTE, NULL); + REQUIRE(glGetError() == GL_NO_ERROR); + } + + ~GLImageObject() { glDeleteTextures(1, &tex_); } + + operator GLuint() const { return tex_; } + + private: + GLuint tex_; +}; + +static std::once_flag glut_init_flag; + +class GLUTContextScopeGuard { + public: + GLUTContextScopeGuard() { + std::call_once(glut_init_flag, &GLUTContextScopeGuard::init); + glut_window_ = glutCreateWindow(""); + } + + ~GLUTContextScopeGuard() { glutDestroyWindow(glut_window_); } + + GLUTContextScopeGuard(const GLUTContextScopeGuard&) = delete; + GLUTContextScopeGuard& operator=(const GLUTContextScopeGuard&) = delete; + + GLUTContextScopeGuard(GLUTContextScopeGuard&&) = delete; + GLUTContextScopeGuard& operator=(GLUTContextScopeGuard&&) = delete; + + private: + int glut_window_; + + static void init() { + static char proc_name[] = ""; + static std::array 
glut_argv = {proc_name, nullptr}; + static int glut_argc = 1; + + glutInit(&glut_argc, glut_argv.data()); + glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE | GLUT_DEPTH); + glutInitWindowSize(512, 512); + } +}; + +class EGLContextScopeGuard { + public: + EGLContextScopeGuard() { + // 1. Initialize EGL + PFNEGLQUERYDEVICESEXTPROC eglQueryDevicesEXT = + (PFNEGLQUERYDEVICESEXTPROC)eglGetProcAddress("eglQueryDevicesEXT"); + + eglQueryDevicesEXT(egl_devices_.max_size(), egl_devices_.data(), &num_devices_); + + INFO("Detected " << num_devices_ << " devices"); + + PFNEGLGETPLATFORMDISPLAYEXTPROC eglGetPlatformDisplayEXT = + (PFNEGLGETPLATFORMDISPLAYEXTPROC)eglGetProcAddress("eglGetPlatformDisplayEXT"); + + egl_display_ = eglGetPlatformDisplayEXT(EGL_PLATFORM_DEVICE_EXT, egl_devices_.at(0), 0); + + REQUIRE(eglInitialize(egl_display_, &major_, &minor_)); + + // 2. Select an appropriate configuration + REQUIRE(eglChooseConfig(egl_display_, kConfigAttribs, &egl_config_, 1, &num_configs_)); + + // 3. Create a surface + egl_surface_ = eglCreatePbufferSurface(egl_display_, egl_config_, kPbufferAttribs); + + // 4. Bind the API + REQUIRE(eglBindAPI(EGL_OPENGL_API)); + + // 5. Create a context and make it current + egl_context_ = eglCreateContext(egl_display_, egl_config_, EGL_NO_CONTEXT, NULL); + + REQUIRE(eglMakeCurrent(egl_display_, egl_surface_, egl_surface_, egl_context_)); + } + + ~EGLContextScopeGuard() { + // 6. 
Terminate EGL when finished + eglTerminate(egl_display_); + } + + EGLContextScopeGuard(const EGLContextScopeGuard&) = delete; + EGLContextScopeGuard& operator=(const EGLContextScopeGuard&) = delete; + + EGLContextScopeGuard(EGLContextScopeGuard&&) = delete; + EGLContextScopeGuard& operator=(EGLContextScopeGuard&&) = delete; + + private: + // clang-format off + static constexpr EGLint kConfigAttribs[] = { + EGL_SURFACE_TYPE, + EGL_PBUFFER_BIT, + EGL_BLUE_SIZE, 8, + EGL_GREEN_SIZE, 8, + EGL_RED_SIZE, 8, + EGL_DEPTH_SIZE, 8, + EGL_RENDERABLE_TYPE, + EGL_OPENGL_BIT, + EGL_NONE + }; + // clang-format on + + static constexpr int kPbufferWidth = 9; + static constexpr int kPbufferHeight = 9; + + static constexpr EGLint kPbufferAttribs[] = { + EGL_WIDTH, kPbufferWidth, EGL_HEIGHT, kPbufferHeight, EGL_NONE, + }; + + std::array egl_devices_; + EGLint num_devices_; + EGLDisplay egl_display_; + EGLint major_, minor_; + EGLint num_configs_; + EGLConfig egl_config_; + EGLSurface egl_surface_; + EGLContext egl_context_; +}; + +class GLContextScopeGuard { + public: + using GLUTContextScopeGuardPtr = std::unique_ptr; + using EGLContextScopeGuardPtr = std::unique_ptr; + using GLContextScopeGuardVariant = + std::variant; + + static constexpr char kEnvarName[] = "GL_CONTEXT_TYPE"; + + GLContextScopeGuard() { + char* val = std::getenv(kEnvarName); + std::string val_str = val == NULL ? 
"" : val; + + if (val_str.empty() || val_str == "GLUT") { + gl_context_ = std::make_unique(); + } else if (val_str == "EGL") { + gl_context_ = std::make_unique(); + } else { + INFO("Unsupported " << kEnvarName << " value '" << val_str << "'"); + INFO("Supported values are ['GLUT', 'EGL']"); + REQUIRE(false); + } + } + + GLContextScopeGuard(const GLContextScopeGuard&) = delete; + GLContextScopeGuard& operator=(const GLContextScopeGuard&) = delete; + + GLContextScopeGuard(GLContextScopeGuard&&) = delete; + GLContextScopeGuard& operator=(GLContextScopeGuard&&) = delete; + + private: + GLContextScopeGuardVariant gl_context_; +}; \ No newline at end of file diff --git a/projects/hip-tests/catch/unit/gl_interop/hipGLGetDevices.cc b/projects/hip-tests/catch/unit/gl_interop/hipGLGetDevices.cc new file mode 100644 index 0000000000..e9ab9c8854 --- /dev/null +++ b/projects/hip-tests/catch/unit/gl_interop/hipGLGetDevices.cc @@ -0,0 +1,90 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include +#include + +#include "gl_interop_common.hh" + +namespace { +constexpr std::array kDeviceLists{ + hipGLDeviceListAll, hipGLDeviceListCurrentFrame, hipGLDeviceListNextFrame}; +} // anonymous namespace + +TEST_CASE("Unit_hipGLGetDevices_Positive_Basic") { + GLContextScopeGuard gl_context; + + const auto device_list = GENERATE(from_range(begin(kDeviceLists), end(kDeviceLists))); + + const int device_count = HipTest::getDeviceCount(); + + unsigned int gl_device_count = 0; + std::vector gl_devices(device_count, -1); + + HIP_CHECK(hipGLGetDevices(&gl_device_count, gl_devices.data(), device_count, device_list)); + + REQUIRE(gl_device_count == 1); + REQUIRE(gl_devices.at(0) == 0); +} + +TEST_CASE("Unit_hipGLGetDevices_Positive_Parameters") { + GLContextScopeGuard gl_context; + + const int device_count = HipTest::getDeviceCount(); + + unsigned int gl_device_count = 0; + std::vector gl_devices(device_count, -1); + + SECTION("pHipDeviceCount == nullptr") { + HIP_CHECK(hipGLGetDevices(nullptr, gl_devices.data(), device_count, hipGLDeviceListAll)); + REQUIRE(gl_devices.at(0) == 0); + } + + SECTION("pHipDevices == nullptr") { + HIP_CHECK(hipGLGetDevices(&gl_device_count, nullptr, device_count, hipGLDeviceListAll)); + REQUIRE(gl_device_count == 1); + } + + SECTION("hipDeviceCount == 0") { + HIP_CHECK(hipGLGetDevices(&gl_device_count, gl_devices.data(), 0, hipGLDeviceListAll)); + REQUIRE(gl_device_count == 1); + REQUIRE(gl_devices.at(0) == -1); + } +} + +TEST_CASE("Unit_hipGLGetDevices_Negative_Parameters") { + GLContextScopeGuard gl_context; + + const int device_count = HipTest::getDeviceCount(); + + unsigned int gl_device_count = 0; + std::vector gl_devices(device_count, -1); + + 
SECTION("invalid deviceList") { + HIP_CHECK_ERROR(hipGLGetDevices(&gl_device_count, gl_devices.data(), device_count, + static_cast(-1)), + hipErrorInvalidValue); + REQUIRE(gl_device_count == 0); + REQUIRE(gl_devices.at(0) == -1); + } +} \ No newline at end of file diff --git a/projects/hip-tests/catch/unit/gl_interop/hipGraphicsGLRegisterBuffer.cc b/projects/hip-tests/catch/unit/gl_interop/hipGraphicsGLRegisterBuffer.cc new file mode 100644 index 0000000000..0022a88cff --- /dev/null +++ b/projects/hip-tests/catch/unit/gl_interop/hipGraphicsGLRegisterBuffer.cc @@ -0,0 +1,98 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include +#include +#include + +#include "gl_interop_common.hh" + +namespace { +constexpr std::array kFlags{hipGraphicsRegisterFlagsNone, + hipGraphicsRegisterFlagsReadOnly, + hipGraphicsRegisterFlagsWriteDiscard}; +} // anonymous namespace + +TEST_CASE("Unit_hipGraphicsGLRegisterBuffer_Positive_Basic") { + GLContextScopeGuard gl_context; + + const auto flags = GENERATE(from_range(begin(kFlags), end(kFlags))); + + GLBufferObject vbo; + + hipGraphicsResource* vbo_resource; + + HIP_CHECK(hipGraphicsGLRegisterBuffer(&vbo_resource, vbo, flags)); + + HIP_CHECK(hipGraphicsUnregisterResource(vbo_resource)); +} + +TEST_CASE("Unit_hipGraphicsGLRegisterBuffer_Positive_Register_Twice") { + GLContextScopeGuard gl_context; + + GLBufferObject vbo; + + hipGraphicsResource *vbo_resource_1, *vbo_resource_2; + + HIP_CHECK(hipGraphicsGLRegisterBuffer(&vbo_resource_1, vbo, hipGraphicsRegisterFlagsNone)); + HIP_CHECK(hipGraphicsGLRegisterBuffer(&vbo_resource_2, vbo, hipGraphicsRegisterFlagsNone)); + + HIP_CHECK(hipGraphicsUnregisterResource(vbo_resource_1)); + HIP_CHECK(hipGraphicsUnregisterResource(vbo_resource_2)); +} + +TEST_CASE("Unit_hipGraphicsGLRegisterBuffer_Negative_Parameters") { + GLContextScopeGuard gl_context; + + GLBufferObject vbo; + + hipGraphicsResource* vbo_resource; + + SECTION("resource == nullptr") { + HIP_CHECK_ERROR(hipGraphicsGLRegisterBuffer(nullptr, vbo, hipGraphicsRegisterFlagsNone), + hipErrorInvalidValue); + } + + SECTION("invalid buffer") { + HIP_CHECK_ERROR( + hipGraphicsGLRegisterBuffer(&vbo_resource, GLuint{}, hipGraphicsRegisterFlagsNone), + hipErrorInvalidValue); + } + + SECTION("invalid flags") { + HIP_CHECK_ERROR( + hipGraphicsGLRegisterBuffer(&vbo_resource, vbo, std::numeric_limits::max()), + hipErrorInvalidValue); + } + + SECTION("flags == hipGraphicsRegisterFlagsSurfaceLoadStore") { + HIP_CHECK_ERROR( + hipGraphicsGLRegisterBuffer(&vbo_resource, vbo, hipGraphicsRegisterFlagsSurfaceLoadStore), + hipErrorInvalidValue); + } + + SECTION("flags 
== hipGraphicsRegisterFlagsTextureGather") { + HIP_CHECK_ERROR( + hipGraphicsGLRegisterBuffer(&vbo_resource, vbo, hipGraphicsRegisterFlagsTextureGather), + hipErrorInvalidValue); + } +} \ No newline at end of file diff --git a/projects/hip-tests/catch/unit/gl_interop/hipGraphicsGLRegisterImage.cc b/projects/hip-tests/catch/unit/gl_interop/hipGraphicsGLRegisterImage.cc new file mode 100644 index 0000000000..0f0546d71b --- /dev/null +++ b/projects/hip-tests/catch/unit/gl_interop/hipGraphicsGLRegisterImage.cc @@ -0,0 +1,102 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include +#include +#include + +#include "gl_interop_common.hh" + +namespace { +constexpr std::array kFlags{ + hipGraphicsRegisterFlagsNone, hipGraphicsRegisterFlagsReadOnly, + hipGraphicsRegisterFlagsWriteDiscard, hipGraphicsRegisterFlagsSurfaceLoadStore, + hipGraphicsRegisterFlagsTextureGather}; +} // anonymous namespace + +TEST_CASE("Unit_hipGraphicsGLRegisterImage_Positive_Basic") { + GLContextScopeGuard gl_context; + + const auto flags = GENERATE(from_range(begin(kFlags), end(kFlags))); + + GLImageObject tex; + + hipGraphicsResource* tex_resource; + + HIP_CHECK(hipGraphicsGLRegisterImage(&tex_resource, tex, GL_TEXTURE_2D, flags)); + + HIP_CHECK(hipGraphicsUnregisterResource(tex_resource)); +} + +TEST_CASE("Unit_hipGraphicsGLRegisterImage_Positive_Register_Twice") { + GLContextScopeGuard gl_context; + + GLImageObject tex; + + hipGraphicsResource *tex_resource_1, *tex_resource_2; + + HIP_CHECK(hipGraphicsGLRegisterImage(&tex_resource_1, tex, GL_TEXTURE_2D, + hipGraphicsRegisterFlagsNone)); + HIP_CHECK(hipGraphicsGLRegisterImage(&tex_resource_2, tex, GL_TEXTURE_2D, + hipGraphicsRegisterFlagsNone)); + + HIP_CHECK(hipGraphicsUnregisterResource(tex_resource_1)); + HIP_CHECK(hipGraphicsUnregisterResource(tex_resource_2)); +} + +TEST_CASE("Unit_hipGraphicsGLRegisterImage_Negative_Parameters") { + GLContextScopeGuard gl_context; + + GLImageObject tex; + + hipGraphicsResource* tex_resource; + + SECTION("resource == nullptr") { + HIP_CHECK_ERROR( + hipGraphicsGLRegisterImage(nullptr, tex, GL_TEXTURE_2D, hipGraphicsRegisterFlagsNone), + hipErrorInvalidValue); + } + + SECTION("invalid image") { + HIP_CHECK_ERROR(hipGraphicsGLRegisterImage(&tex_resource, GLuint{}, GL_TEXTURE_2D, + hipGraphicsRegisterFlagsNone), + hipErrorInvalidValue); + } + + SECTION("invalid target") { + HIP_CHECK_ERROR( + hipGraphicsGLRegisterImage(&tex_resource, tex, GL_BUFFER, hipGraphicsRegisterFlagsNone), + hipErrorInvalidValue); + } + + SECTION("target does not match the object") { + 
HIP_CHECK_ERROR(hipGraphicsGLRegisterImage(&tex_resource, tex, GL_RENDERBUFFER, + hipGraphicsRegisterFlagsNone), + hipErrorInvalidValue); + } + + SECTION("invalid flags") { + HIP_CHECK_ERROR(hipGraphicsGLRegisterImage(&tex_resource, tex, GL_TEXTURE_2D, + std::numeric_limits::max()), + hipErrorInvalidValue); + } +} \ No newline at end of file diff --git a/projects/hip-tests/catch/unit/gl_interop/hipGraphicsMapResources.cc b/projects/hip-tests/catch/unit/gl_interop/hipGraphicsMapResources.cc new file mode 100644 index 0000000000..26babfaf1c --- /dev/null +++ b/projects/hip-tests/catch/unit/gl_interop/hipGraphicsMapResources.cc @@ -0,0 +1,93 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include +#include +#include + +#include "gl_interop_common.hh" + +TEST_CASE("Unit_hipGraphicsMapResources_Positive_Basic") { + GLContextScopeGuard gl_context; + + GLBufferObject vbo; + GLImageObject tex; + + std::array resources; + + HIP_CHECK(hipGraphicsGLRegisterBuffer(&resources.at(0), vbo, hipGraphicsRegisterFlagsNone)); + HIP_CHECK(hipGraphicsGLRegisterImage(&resources.at(1), tex, GL_TEXTURE_2D, + hipGraphicsRegisterFlagsNone)); + + hipStream_t stream; + HIP_CHECK(hipStreamCreate(&stream)); + + HIP_CHECK(hipGraphicsMapResources(resources.size(), resources.data(), stream)); + + HIP_CHECK(hipGraphicsUnmapResources(resources.size(), resources.data(), stream)); + + HIP_CHECK(hipStreamDestroy(stream)); + + HIP_CHECK(hipGraphicsUnregisterResource(resources.at(0))); + HIP_CHECK(hipGraphicsUnregisterResource(resources.at(1))); +} + +TEST_CASE("Unit_hipGraphicsMapResources_Negative_Parameters") { + GLContextScopeGuard gl_context; + + GLBufferObject vbo; + + hipGraphicsResource* vbo_resource; + + HIP_CHECK(hipGraphicsGLRegisterBuffer(&vbo_resource, vbo, hipGraphicsRegisterFlagsNone)); + + SECTION("count == 0") { + HIP_CHECK_ERROR(hipGraphicsMapResources(0, &vbo_resource, 0), hipErrorInvalidValue); + } + + SECTION("resources == nullptr") { + HIP_CHECK_ERROR(hipGraphicsMapResources(1, nullptr, 0), hipErrorInvalidValue); + } + + SECTION("unregistered resource") { + hipGraphicsResource* unregistered_resource; + HIP_CHECK( + hipGraphicsGLRegisterBuffer(&unregistered_resource, vbo, hipGraphicsRegisterFlagsNone)); + HIP_CHECK(hipGraphicsUnregisterResource(unregistered_resource)); + HIP_CHECK_ERROR(hipGraphicsMapResources(1, &unregistered_resource, 0), hipErrorInvalidHandle); + } + + SECTION("already mapped resource") { + HIP_CHECK(hipGraphicsMapResources(1, &vbo_resource, 0)); + HIP_CHECK_ERROR(hipGraphicsMapResources(1, &vbo_resource, 0), hipErrorAlreadyMapped); + HIP_CHECK(hipGraphicsUnmapResources(1, &vbo_resource, 0)); + } + + SECTION("invalid stream") { + 
hipStream_t stream; + HIP_CHECK(hipStreamCreate(&stream)); + HIP_CHECK(hipStreamDestroy(stream)); + HIP_CHECK_ERROR(hipGraphicsMapResources(1, &vbo_resource, stream), hipErrorContextIsDestroyed); + } + + HIP_CHECK(hipGraphicsUnregisterResource(vbo_resource)); +} \ No newline at end of file diff --git a/projects/hip-tests/catch/unit/gl_interop/hipGraphicsResourceGetMappedPointer.cc b/projects/hip-tests/catch/unit/gl_interop/hipGraphicsResourceGetMappedPointer.cc new file mode 100644 index 0000000000..8bdd5c16c8 --- /dev/null +++ b/projects/hip-tests/catch/unit/gl_interop/hipGraphicsResourceGetMappedPointer.cc @@ -0,0 +1,151 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include +#include +#include + +#include "gl_interop_common.hh" + +TEST_CASE("Unit_hipGraphicsResourceGetMappedPointer_Positive_Basic") { + GLContextScopeGuard gl_context; + + GLBufferObject vbo; + + hipGraphicsResource* vbo_resource; + + HIP_CHECK(hipGraphicsGLRegisterBuffer(&vbo_resource, vbo, hipGraphicsRegisterFlagsNone)); + + HIP_CHECK(hipGraphicsMapResources(1, &vbo_resource, 0)); + + float* buffer_devptr = nullptr; + size_t size = 0; + + HIP_CHECK(hipGraphicsResourceGetMappedPointer(reinterpret_cast(&buffer_devptr), &size, + vbo_resource)); + + REQUIRE(buffer_devptr != nullptr); + REQUIRE(size == vbo.kSize); + + HIP_CHECK(hipGraphicsUnmapResources(1, &vbo_resource, 0)); + + HIP_CHECK(hipGraphicsUnregisterResource(vbo_resource)); +} + +TEST_CASE("Unit_hipGraphicsResourceGetMappedPointer_Positive_Parameters") { + GLContextScopeGuard gl_context; + + GLBufferObject vbo; + + hipGraphicsResource* vbo_resource; + + HIP_CHECK(hipGraphicsGLRegisterBuffer(&vbo_resource, vbo, hipGraphicsRegisterFlagsNone)); + + HIP_CHECK(hipGraphicsMapResources(1, &vbo_resource, 0)); + + float* buffer_devptr = nullptr; + size_t size = 0; + + SECTION("devPtr == nullptr") { + HIP_CHECK(hipGraphicsResourceGetMappedPointer(nullptr, &size, vbo_resource)); + REQUIRE(size == vbo.kSize); + } + + SECTION("size == nullptr") { + HIP_CHECK(hipGraphicsResourceGetMappedPointer(reinterpret_cast(&buffer_devptr), nullptr, + vbo_resource)); + REQUIRE(buffer_devptr != nullptr); + } + + HIP_CHECK(hipGraphicsUnmapResources(1, &vbo_resource, 0)); + + HIP_CHECK(hipGraphicsUnregisterResource(vbo_resource)); +} + +TEST_CASE("Unit_hipGraphicsResourceGetMappedPointer_Negative_Parameters") { + GLContextScopeGuard gl_context; + + GLBufferObject vbo; + + hipGraphicsResource* vbo_resource; + + HIP_CHECK(hipGraphicsGLRegisterBuffer(&vbo_resource, vbo, hipGraphicsRegisterFlagsNone)); + + HIP_CHECK(hipGraphicsMapResources(1, &vbo_resource, 0)); + + float* buffer_devptr = nullptr; + size_t size = 0; + + 
SECTION("non-pointer resource") { + GLImageObject tex; + hipGraphicsResource* tex_resource; + + HIP_CHECK(hipGraphicsGLRegisterImage(&tex_resource, tex, GL_TEXTURE_2D, + hipGraphicsRegisterFlagsNone)); + HIP_CHECK(hipGraphicsMapResources(1, &tex_resource, 0)); + + HIP_CHECK_ERROR(hipGraphicsResourceGetMappedPointer(reinterpret_cast(&buffer_devptr), + &size, tex_resource), + hipErrorNotMappedAsPointer); + + HIP_CHECK(hipGraphicsUnmapResources(1, &tex_resource, 0)); + HIP_CHECK(hipGraphicsUnregisterResource(tex_resource)); + } + + SECTION("unregistered resource") { + hipGraphicsResource* unregistered_resource; + HIP_CHECK( + hipGraphicsGLRegisterBuffer(&unregistered_resource, vbo, hipGraphicsRegisterFlagsNone)); + HIP_CHECK(hipGraphicsUnregisterResource(unregistered_resource)); + HIP_CHECK_ERROR(hipGraphicsResourceGetMappedPointer(reinterpret_cast(&buffer_devptr), + &size, unregistered_resource), + hipErrorContextIsDestroyed); + } + + SECTION("not mapped resource") { + hipGraphicsResource* not_mapped_resource; + HIP_CHECK(hipGraphicsGLRegisterBuffer(&not_mapped_resource, vbo, hipGraphicsRegisterFlagsNone)); + HIP_CHECK_ERROR(hipGraphicsResourceGetMappedPointer(reinterpret_cast(&buffer_devptr), + &size, not_mapped_resource), + hipErrorNotMapped); + HIP_CHECK(hipGraphicsUnregisterResource(not_mapped_resource)); + } + + SECTION("unmapped resource") { + hipGraphicsResource* unmapped_resource; + + HIP_CHECK(hipGraphicsGLRegisterBuffer(&unmapped_resource, vbo, hipGraphicsRegisterFlagsNone)); + + HIP_CHECK(hipGraphicsMapResources(1, &unmapped_resource, 0)); + HIP_CHECK(hipGraphicsUnmapResources(1, &unmapped_resource, 0)); + + HIP_CHECK_ERROR(hipGraphicsResourceGetMappedPointer(reinterpret_cast(&buffer_devptr), + &size, unmapped_resource), + hipErrorNotMapped); + + HIP_CHECK(hipGraphicsUnregisterResource(unmapped_resource)); + } + + HIP_CHECK(hipGraphicsUnmapResources(1, &vbo_resource, 0)); + + HIP_CHECK(hipGraphicsUnregisterResource(vbo_resource)); +} \ No newline at end of
file diff --git a/projects/hip-tests/catch/unit/gl_interop/hipGraphicsSubResourceGetMappedArray.cc b/projects/hip-tests/catch/unit/gl_interop/hipGraphicsSubResourceGetMappedArray.cc new file mode 100644 index 0000000000..cbd165a1ae --- /dev/null +++ b/projects/hip-tests/catch/unit/gl_interop/hipGraphicsSubResourceGetMappedArray.cc @@ -0,0 +1,132 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include +#include +#include + +#include "gl_interop_common.hh" + +TEST_CASE("Unit_hipGraphicsSubResourceGetMappedArray_Positive_Basic") { + GLContextScopeGuard gl_context; + + GLImageObject tex; + + hipGraphicsResource* tex_resource; + + HIP_CHECK( + hipGraphicsGLRegisterImage(&tex_resource, tex, GL_TEXTURE_2D, hipGraphicsRegisterFlagsNone)); + + HIP_CHECK(hipGraphicsMapResources(1, &tex_resource, 0)); + + hipArray* image_devptr = nullptr; + HIP_CHECK(hipGraphicsSubResourceGetMappedArray(&image_devptr, tex_resource, 0, 0)); + + REQUIRE(image_devptr != nullptr); + + HIP_CHECK(hipGraphicsUnmapResources(1, &tex_resource, 0)); + + HIP_CHECK(hipGraphicsUnregisterResource(tex_resource)); +} + +TEST_CASE("Unit_hipGraphicsSubResourceGetMappedArray_Negative_Parameters") { + GLContextScopeGuard gl_context; + + GLImageObject tex; + + hipGraphicsResource* tex_resource; + + HIP_CHECK( + hipGraphicsGLRegisterImage(&tex_resource, tex, GL_TEXTURE_2D, hipGraphicsRegisterFlagsNone)); + + HIP_CHECK(hipGraphicsMapResources(1, &tex_resource, 0)); + + hipArray* image_devptr = nullptr; + + SECTION("array == nullptr") { + HIP_CHECK(hipGraphicsSubResourceGetMappedArray(nullptr, tex_resource, 0, 0)); + } + + SECTION("non-texture resource") { + GLBufferObject vbo; + hipGraphicsResource* vbo_resource; + + HIP_CHECK(hipGraphicsGLRegisterBuffer(&vbo_resource, vbo, hipGraphicsRegisterFlagsNone)); + HIP_CHECK(hipGraphicsMapResources(1, &vbo_resource, 0)); + + HIP_CHECK_ERROR(hipGraphicsSubResourceGetMappedArray(&image_devptr, vbo_resource, 0, 0), + hipErrorNotMappedAsArray); + + HIP_CHECK(hipGraphicsUnmapResources(1, &vbo_resource, 0)); + HIP_CHECK(hipGraphicsUnregisterResource(vbo_resource)); + } + + SECTION("unregistered resource") { + hipGraphicsResource* unregistered_resource; + HIP_CHECK(hipGraphicsGLRegisterImage(&unregistered_resource, tex, GL_TEXTURE_2D, + hipGraphicsRegisterFlagsNone)); + HIP_CHECK(hipGraphicsUnregisterResource(unregistered_resource)); + HIP_CHECK_ERROR( + 
hipGraphicsSubResourceGetMappedArray(&image_devptr, unregistered_resource, 0, 0), + hipErrorContextIsDestroyed); + } + + SECTION("not mapped resource") { + hipGraphicsResource* not_mapped_resource; + HIP_CHECK(hipGraphicsGLRegisterImage(&not_mapped_resource, tex, GL_TEXTURE_2D, + hipGraphicsRegisterFlagsNone)); + HIP_CHECK_ERROR(hipGraphicsSubResourceGetMappedArray(&image_devptr, not_mapped_resource, 0, 0), + hipErrorNotMapped); + HIP_CHECK(hipGraphicsUnregisterResource(not_mapped_resource)); + } + + SECTION("unmapped resource") { + hipGraphicsResource* unmapped_resource; + + HIP_CHECK(hipGraphicsGLRegisterImage(&unmapped_resource, tex, GL_TEXTURE_2D, + hipGraphicsRegisterFlagsNone)); + + HIP_CHECK(hipGraphicsMapResources(1, &unmapped_resource, 0)); + HIP_CHECK(hipGraphicsUnmapResources(1, &unmapped_resource, 0)); + + HIP_CHECK_ERROR(hipGraphicsSubResourceGetMappedArray(&image_devptr, unmapped_resource, 0, 0), + hipErrorNotMapped); + + HIP_CHECK(hipGraphicsUnregisterResource(unmapped_resource)); + } + + SECTION("invalid arrayIndex") { + HIP_CHECK_ERROR(hipGraphicsSubResourceGetMappedArray(&image_devptr, tex_resource, + std::numeric_limits::max(), 0), + hipErrorInvalidValue); + } + + SECTION("invalid mipLevel") { + HIP_CHECK_ERROR(hipGraphicsSubResourceGetMappedArray(&image_devptr, tex_resource, 0, + std::numeric_limits::max()), + hipErrorInvalidValue); + } + + HIP_CHECK(hipGraphicsUnmapResources(1, &tex_resource, 0)); + + HIP_CHECK(hipGraphicsUnregisterResource(tex_resource)); +} \ No newline at end of file diff --git a/projects/hip-tests/catch/unit/gl_interop/hipGraphicsUnmapResources.cc b/projects/hip-tests/catch/unit/gl_interop/hipGraphicsUnmapResources.cc new file mode 100644 index 0000000000..529e8d597f --- /dev/null +++ b/projects/hip-tests/catch/unit/gl_interop/hipGraphicsUnmapResources.cc @@ -0,0 +1,66 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include +#include +#include + +#include "gl_interop_common.hh" + +TEST_CASE("Unit_hipGraphicsUnmapResources_Negative_Parameters") { + GLContextScopeGuard gl_context; + + GLBufferObject vbo; + + hipGraphicsResource* vbo_resource; + + HIP_CHECK(hipGraphicsGLRegisterBuffer(&vbo_resource, vbo, hipGraphicsRegisterFlagsNone)); + + HIP_CHECK(hipGraphicsMapResources(1, &vbo_resource, 0)); + + SECTION("count == 0") { + HIP_CHECK_ERROR(hipGraphicsUnmapResources(0, &vbo_resource, 0), hipErrorInvalidValue); + } + + SECTION("resources == nullptr") { + HIP_CHECK_ERROR(hipGraphicsUnmapResources(1, nullptr, 0), hipErrorInvalidValue); + } + + SECTION("not mapped resource") { + hipGraphicsResource* not_mapped_resource; + HIP_CHECK(hipGraphicsGLRegisterBuffer(&not_mapped_resource, vbo, hipGraphicsRegisterFlagsNone)); + HIP_CHECK_ERROR(hipGraphicsUnmapResources(1, &not_mapped_resource, 0), hipErrorNotMapped); + HIP_CHECK(hipGraphicsUnregisterResource(not_mapped_resource)); + } + + SECTION("invalid stream") { + hipStream_t stream; + HIP_CHECK(hipStreamCreate(&stream)); + HIP_CHECK(hipStreamDestroy(stream)); + HIP_CHECK_ERROR(hipGraphicsUnmapResources(1, &vbo_resource, stream), + hipErrorContextIsDestroyed); + } + + HIP_CHECK(hipGraphicsUnmapResources(1, &vbo_resource, 0)); + + HIP_CHECK(hipGraphicsUnregisterResource(vbo_resource)); +} \ No newline at end of file diff --git a/projects/hip-tests/catch/unit/gl_interop/hipGraphicsUnregisterResource.cc b/projects/hip-tests/catch/unit/gl_interop/hipGraphicsUnregisterResource.cc new file mode 100644 index 0000000000..10fc1f92b3 --- /dev/null +++ b/projects/hip-tests/catch/unit/gl_interop/hipGraphicsUnregisterResource.cc @@ -0,0 +1,48 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include +#include +#include + +#include "gl_interop_common.hh" + +TEST_CASE("Unit_hipGraphicsUnregisterResource_Negative_Parameters") { + GLContextScopeGuard gl_context; + + GLBufferObject vbo; + + SECTION("already unregistered resource") { + hipGraphicsResource* unregistered_resource; + HIP_CHECK( + hipGraphicsGLRegisterBuffer(&unregistered_resource, vbo, hipGraphicsRegisterFlagsNone)); + HIP_CHECK(hipGraphicsUnregisterResource(unregistered_resource)); + HIP_CHECK_ERROR(hipGraphicsUnregisterResource(unregistered_resource), hipErrorInvalidContext); + } + + SECTION("mapped resource") { + hipGraphicsResource* mapped_resource; + HIP_CHECK(hipGraphicsGLRegisterBuffer(&mapped_resource, vbo, hipGraphicsRegisterFlagsNone)); + HIP_CHECK(hipGraphicsMapResources(1, &mapped_resource, 0)); + HIP_CHECK_ERROR(hipGraphicsUnregisterResource(mapped_resource), hipErrorAlreadyMapped); + } +} \ No newline at end of file diff --git a/projects/hip-tests/catch/unit/graph/CMakeLists.txt b/projects/hip-tests/catch/unit/graph/CMakeLists.txt index 148d92bb88..cc6a0d7935 100644 --- a/projects/hip-tests/catch/unit/graph/CMakeLists.txt +++ b/projects/hip-tests/catch/unit/graph/CMakeLists.txt @@ -103,6 +103,7 @@ set(TEST_SRC hipGraphKernelNodeSetParams.cc hipGraphExecKernelNodeSetParams.cc hipGraphLaunch.cc + hipGraphLaunch_old.cc hipGraphMemcpyNodeSetParams1D.cc hipGraphExecMemcpyNodeSetParamsToSymbol_old.cc hipGraphExecMemcpyNodeSetParamsToSymbol.cc diff --git a/projects/hip-tests/catch/unit/graph/hipGraphAddEventRecordNode.cc b/projects/hip-tests/catch/unit/graph/hipGraphAddEventRecordNode.cc index 44b2895cb0..4f5b3b0daa 100644 --- a/projects/hip-tests/catch/unit/graph/hipGraphAddEventRecordNode.cc +++ b/projects/hip-tests/catch/unit/graph/hipGraphAddEventRecordNode.cc @@ -40,19 +40,26 @@ end. Instantiate and Launch the Graph. Wait for the event to complete. Verify that hipEventElapsedTime() returns error. 
6) Validate scenario 2 by running the graph multiple times in a loop (100 times) after instantiation. - 7) Negative Scenarios + 7) Validate that no error is reported when numDeps <= dependencies length + 8) Negative Scenarios - Output node is a nullptr. - Input graph is a nullptr. - Input dependencies is a nullptr. + - Node in dependency is from different graph + - Invalid numNodes + - Duplicate node in dependencies - Input event is a nullptr. - Input graph is uninitialized. - Input event is uninitialized. */ +#include -#include #include +#include #include +#include "graph_tests_common.hh" + /** * Scenario 1: Create s simple graph with just one event record * node and instantiate and launch the graph. @@ -66,8 +73,7 @@ TEST_CASE("Unit_hipGraphAddEventRecordNode_Functional_Simple") { hipEvent_t event; HIP_CHECK(hipEventCreate(&event)); hipGraphNode_t eventrec; - HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, - event)); + HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event)); // Instantiate and launch the graph HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph)); @@ -82,8 +88,8 @@ TEST_CASE("Unit_hipGraphAddEventRecordNode_Functional_Simple") { /** * Local test function */ -static void validateAddEventRecordNode(bool measureTime, bool withFlags, - int nstep, unsigned flag = 0) { +static void validateAddEventRecordNode(bool measureTime, bool withFlags, int nstep, + unsigned flag = 0) { constexpr size_t N = 1024; constexpr size_t Nbytes = N * sizeof(int); constexpr auto blocksPerCU = 6; // to hide latency @@ -111,8 +117,7 @@ static void validateAddEventRecordNode(bool measureTime, bool withFlags, memsetParams.elementSize = sizeof(char); memsetParams.width = Nbytes; memsetParams.height = 1; - HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, - &memsetParams)); + HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, &memsetParams)); 
memset(&memsetParams, 0, sizeof(memsetParams)); memsetParams.dst = reinterpret_cast(B_d); memsetParams.value = 0; @@ -120,38 +125,34 @@ static void validateAddEventRecordNode(bool measureTime, bool withFlags, memsetParams.elementSize = sizeof(char); memsetParams.width = Nbytes; memsetParams.height = 1; - HIP_CHECK(hipGraphAddMemsetNode(&memset_B, graph, nullptr, 0, - &memsetParams)); + HIP_CHECK(hipGraphAddMemsetNode(&memset_B, graph, nullptr, 0, &memsetParams)); - void* kernelArgs1[] = {&C_d, &memsetVal, reinterpret_cast(&NElem)}; - kernelNodeParams.func = - reinterpret_cast(HipTest::memsetReverse); + void* kernelArgs1[] = {&C_d, &memsetVal, reinterpret_cast(&NElem)}; + kernelNodeParams.func = reinterpret_cast(HipTest::memsetReverse); kernelNodeParams.gridDim = dim3(blocks); kernelNodeParams.blockDim = dim3(threadsPerBlock); kernelNodeParams.sharedMemBytes = 0; kernelNodeParams.kernelParams = reinterpret_cast(kernelArgs1); kernelNodeParams.extra = nullptr; - HIP_CHECK(hipGraphAddKernelNode(&memsetKer_C, graph, nullptr, 0, - &kernelNodeParams)); + HIP_CHECK(hipGraphAddKernelNode(&memsetKer_C, graph, nullptr, 0, &kernelNodeParams)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D_A, graph, nullptr, 0, A_d, - A_h, Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D_A, graph, nullptr, 0, A_d, A_h, Nbytes, + hipMemcpyHostToDevice)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D_B, graph, nullptr, 0, B_d, - B_h, Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D_B, graph, nullptr, 0, B_d, B_h, Nbytes, + hipMemcpyHostToDevice)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_C, graph, nullptr, 0, C_h, - C_d, Nbytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_C, graph, nullptr, 0, C_h, C_d, Nbytes, + hipMemcpyDeviceToHost)); - void* kernelArgs2[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; - kernelNodeParams.func = reinterpret_cast(HipTest::vectorADD); + void* kernelArgs2[] = {&A_d, 
&B_d, &C_d, reinterpret_cast(&NElem)}; + kernelNodeParams.func = reinterpret_cast(HipTest::vectorADD); kernelNodeParams.gridDim = dim3(blocks); kernelNodeParams.blockDim = dim3(threadsPerBlock); kernelNodeParams.sharedMemBytes = 0; kernelNodeParams.kernelParams = reinterpret_cast(kernelArgs2); kernelNodeParams.extra = nullptr; - HIP_CHECK(hipGraphAddKernelNode(&ker_vecAdd, graph, nullptr, 0, - &kernelNodeParams)); + HIP_CHECK(hipGraphAddKernelNode(&ker_vecAdd, graph, nullptr, 0, &kernelNodeParams)); hipEvent_t eventstart, eventend; if (withFlags) { HIP_CHECK(hipEventCreateWithFlags(&eventstart, flag)); @@ -161,10 +162,8 @@ static void validateAddEventRecordNode(bool measureTime, bool withFlags, HIP_CHECK(hipEventCreate(&eventend)); } hipGraphNode_t event_start, event_final; - HIP_CHECK(hipGraphAddEventRecordNode(&event_start, graph, nullptr, 0, - eventstart)); - HIP_CHECK(hipGraphAddEventRecordNode(&event_final, graph, nullptr, 0, - eventend)); + HIP_CHECK(hipGraphAddEventRecordNode(&event_start, graph, nullptr, 0, eventstart)); + HIP_CHECK(hipGraphAddEventRecordNode(&event_final, graph, nullptr, 0, eventend)); // Create dependencies HIP_CHECK(hipGraphAddDependencies(graph, &event_start, &memset_A, 1)); HIP_CHECK(hipGraphAddDependencies(graph, &event_start, &memset_B, 1)); @@ -260,7 +259,7 @@ TEST_CASE("Unit_hipGraphAddEventRecordNode_Functional_TimingDisabled") { HIP_CHECK(hipEventCreateWithFlags(&event_start, hipEventDisableTiming)); HIP_CHECK(hipEventCreateWithFlags(&event_end, hipEventDisableTiming)); // memset node - char *A_d; + char* A_d; hipGraphNode_t memset_A; hipMemsetParams memsetParams{}; HIP_CHECK(hipMalloc(&A_d, Nbytes)); @@ -271,14 +270,11 @@ TEST_CASE("Unit_hipGraphAddEventRecordNode_Functional_TimingDisabled") { memsetParams.elementSize = sizeof(char); memsetParams.width = Nbytes; memsetParams.height = 1; - HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, - &memsetParams)); + HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 
0, &memsetParams)); hipGraphNode_t event_node_start, event_node_end; - HIP_CHECK(hipGraphAddEventRecordNode(&event_node_start, graph, nullptr, 0, - event_start)); - HIP_CHECK(hipGraphAddEventRecordNode(&event_node_end, graph, nullptr, 0, - event_end)); + HIP_CHECK(hipGraphAddEventRecordNode(&event_node_start, graph, nullptr, 0, event_start)); + HIP_CHECK(hipGraphAddEventRecordNode(&event_node_end, graph, nullptr, 0, event_end)); // Add dependencies between nodes HIP_CHECK(hipGraphAddDependencies(graph, &event_node_start, &memset_A, 1)); HIP_CHECK(hipGraphAddDependencies(graph, &memset_A, &event_node_end, 1)); @@ -290,7 +286,7 @@ TEST_CASE("Unit_hipGraphAddEventRecordNode_Functional_TimingDisabled") { // Validate hipEventElapsedTime returns error code because timing is // disabled for start and end event nodes. float t; - REQUIRE(hipSuccess != hipEventElapsedTime(&t, event_start, event_end)); + HIP_CHECK_ERROR(hipEventElapsedTime(&t, event_start, event_end), hipErrorInvalidHandle); HIP_CHECK(hipGraphExecDestroy(graphExec)); HIP_CHECK(hipFree(A_d)); @@ -301,44 +297,73 @@ TEST_CASE("Unit_hipGraphAddEventRecordNode_Functional_TimingDisabled") { } /** - * Scenario 7: All negative tests + * Scenario 7: Positive parameter tests */ -TEST_CASE("Unit_hipGraphAddEventRecordNode_Negative") { +TEST_CASE("Unit_hipGraphAddEventRecordNode_Positive_Parameters") { hipGraph_t graph; HIP_CHECK(hipGraphCreate(&graph, 0)); hipEvent_t event; HIP_CHECK(hipEventCreate(&event)); - hipGraphNode_t eventwait; - SECTION("pGraphNode = nullptr") { - REQUIRE(hipErrorInvalidValue == hipGraphAddEventRecordNode(nullptr, - graph, nullptr, 0, event)); + hipGraphNode_t eventrec; + + hipGraphNode_t dep_node = nullptr; + hipGraphNode_t dep_node2 = nullptr; + HIP_CHECK(hipGraphAddEmptyNode(&dep_node, graph, nullptr, 0)); + HIP_CHECK(hipGraphAddEmptyNode(&dep_node2, graph, nullptr, 0)); + hipGraphNode_t dep_nodes[] = {dep_node, dep_node2}; + + size_t numDeps = 0; + SECTION("numDependencies is zero, 
dependencies is not nullptr") { + HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, dep_nodes, 0, event)); + HIP_CHECK(hipGraphNodeGetDependencies(eventrec, nullptr, &numDeps)); + REQUIRE(numDeps == 0); } - SECTION("graph = nullptr") { - REQUIRE(hipErrorInvalidValue == hipGraphAddEventRecordNode(&eventwait, - nullptr, nullptr, 0, event)); + SECTION("numDependencies < dependencies length") { + HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, dep_nodes, 1, event)); + HIP_CHECK(hipGraphNodeGetDependencies(eventrec, nullptr, &numDeps)); + REQUIRE(numDeps == 1); } - SECTION("pDependencies = nullptr and numDependencies != 0") { - REQUIRE(hipErrorInvalidValue == hipGraphAddEventRecordNode(&eventwait, - graph, nullptr, 1, event)); - } - - SECTION("event = nullptr") { - REQUIRE(hipErrorInvalidValue == hipGraphAddEventRecordNode(&eventwait, - graph, nullptr, 0, nullptr)); - } - - SECTION("graph is uninitialized") { - hipGraph_t graph_uninit{}; - REQUIRE(hipErrorInvalidValue == hipGraphAddEventRecordNode(&eventwait, - graph_uninit, nullptr, 0, nullptr)); - } - - SECTION("event is uninitialized") { - hipEvent_t event_uninit{}; - REQUIRE(hipErrorInvalidValue == hipGraphAddEventRecordNode(&eventwait, - graph, nullptr, 0, event_uninit)); + SECTION("numDependencies == dependencies length") { + HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, dep_nodes, 2, event)); + HIP_CHECK(hipGraphNodeGetDependencies(eventrec, nullptr, &numDeps)); + REQUIRE(numDeps == 2); + } + + HIP_CHECK(hipGraphDestroy(graph)); + HIP_CHECK(hipEventDestroy(event)); +} + +/** + * Scenario 8: All negative tests + */ +TEST_CASE("Unit_hipGraphAddEventRecordNode_Negative") { + using namespace std::placeholders; + hipGraph_t graph; + HIP_CHECK(hipGraphCreate(&graph, 0)); + hipEvent_t event; + HIP_CHECK(hipEventCreate(&event)); + hipGraphNode_t eventrec; + + GraphAddNodeCommonNegativeTests(std::bind(hipGraphAddEventRecordNode, _1, _2, _3, _4, event), + graph); + + SECTION("event = nullptr") { + 
HIP_CHECK_ERROR(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, nullptr), + hipErrorInvalidValue); + } + + SECTION("graph is uninitialized") { + hipGraph_t graph_uninit{}; + HIP_CHECK_ERROR(hipGraphAddEventRecordNode(&eventrec, graph_uninit, nullptr, 0, event), + hipErrorInvalidValue); + } + + SECTION("event is uninitialized") { + hipEvent_t event_uninit{}; + HIP_CHECK_ERROR(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event_uninit), + hipErrorInvalidValue); } HIP_CHECK(hipGraphDestroy(graph)); diff --git a/projects/hip-tests/catch/unit/graph/hipGraphAddEventWaitNode.cc b/projects/hip-tests/catch/unit/graph/hipGraphAddEventWaitNode.cc index a196ac7428..89e7e03d7e 100644 --- a/projects/hip-tests/catch/unit/graph/hipGraphAddEventWaitNode.cc +++ b/projects/hip-tests/catch/unit/graph/hipGraphAddEventWaitNode.cc @@ -32,20 +32,25 @@ both graphs. (100 times). 4) Execute scenario 2 with stream1 = stream2. 5) Repeat scenario 2 for different event flags. - 6) Negative Scenarios + 6) Validate that no error is reported when numDeps <= dependencies length + 7) Negative Scenarios - Pass input node parameter as nullptr. - Pass input graph parameter as nullptr. - Pass input dependency parameter as nullptr. + - Node in dependency is from different graph + - Invalid numNodes + - Duplicate node in dependencies - Pass input event parameter as nullptr. - Pass uninitialized input graph parameter. - Pass uninitialized input event parameter. 
*/ +#include -#include #include +#include #include -#define LEN 512 +#include "graph_tests_common.hh" /** * Scenario 1 @@ -60,13 +65,10 @@ TEST_CASE("Unit_hipGraphAddEventWaitNode_Functional_Simple") { HIP_CHECK(hipEventCreate(&event)); hipGraphNode_t event_rec_node, event_wait_node; // Create a event record node in graph - HIP_CHECK(hipGraphAddEventRecordNode(&event_rec_node, graph, nullptr, 0, - event)); + HIP_CHECK(hipGraphAddEventRecordNode(&event_rec_node, graph, nullptr, 0, event)); // Create a event wait node in graph - HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph, nullptr, 0, - event)); - HIP_CHECK(hipGraphAddDependencies(graph, &event_rec_node, - &event_wait_node, 1)); + HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph, nullptr, 0, event)); + HIP_CHECK(hipGraphAddDependencies(graph, &event_rec_node, &event_wait_node, 1)); // Instantiate and launch the graph HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph)); @@ -80,13 +82,14 @@ TEST_CASE("Unit_hipGraphAddEventWaitNode_Functional_Simple") { /** * Local Function */ -static void validate_hipGraphAddEventWaitNode_internodedep(int test, - int nstep, unsigned flag = hipEventDefault) { - size_t memsize = LEN * sizeof(int); +static void validate_hipGraphAddEventWaitNode_internodedep(int test, int nstep, + unsigned flag = hipEventDefault) { + constexpr size_t N = 1024; + size_t memsize = N * sizeof(int); constexpr auto blocksPerCU = 6; // to hide latency constexpr auto threadsPerBlock = 256; - unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, LEN); - size_t NElem{LEN}; + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + size_t NElem{N}; hipGraph_t graph1, graph2; hipStream_t streamForGraph1, streamForGraph2; hipGraphExec_t graphExec1, graphExec2; @@ -114,68 +117,57 @@ static void validate_hipGraphAddEventWaitNode_internodedep(int test, HIP_CHECK(hipMalloc(&out_d_g1, 
memsize)); HIP_CHECK(hipMalloc(&out_d_g2, memsize)); // Initialize host buffer - for (uint32_t i = 0; i < LEN; i++) { + for (uint32_t i = 0; i < N; i++) { inp_h[i] = i; out_h_g1[i] = 0; out_h_g2[i] = 0; } // Graph1 creation ........... // Create event1 record node in graph1 - HIP_CHECK(hipGraphAddEventRecordNode(&event_rec_node, graph1, nullptr, 0, - event1)); + HIP_CHECK(hipGraphAddEventRecordNode(&event_rec_node, graph1, nullptr, 0, event1)); // Create memcpy and kernel nodes for graph1 hipGraphNode_t memcpyH2D, memcpyD2H_1, kernelnode_1; hipKernelNodeParams kernelNodeParams1{}; - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph1, nullptr, 0, inp_d, - inp_h, memsize, hipMemcpyHostToDevice)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_1, graph1, nullptr, 0, - out_h_g1, out_d_g1, memsize, hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph1, nullptr, 0, inp_d, inp_h, memsize, + hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_1, graph1, nullptr, 0, out_h_g1, out_d_g1, memsize, + hipMemcpyDeviceToHost)); - void* kernelArgs1[] = {&inp_d, &out_d_g1, reinterpret_cast(&NElem)}; - kernelNodeParams1.func = - reinterpret_cast(HipTest::vector_square); + void* kernelArgs1[] = {&inp_d, &out_d_g1, reinterpret_cast(&NElem)}; + kernelNodeParams1.func = reinterpret_cast(HipTest::vector_square); kernelNodeParams1.gridDim = dim3(blocks); kernelNodeParams1.blockDim = dim3(threadsPerBlock); kernelNodeParams1.sharedMemBytes = 0; kernelNodeParams1.kernelParams = reinterpret_cast(kernelArgs1); kernelNodeParams1.extra = nullptr; - HIP_CHECK(hipGraphAddKernelNode(&kernelnode_1, graph1, nullptr, 0, - &kernelNodeParams1)); + HIP_CHECK(hipGraphAddKernelNode(&kernelnode_1, graph1, nullptr, 0, &kernelNodeParams1)); // Create dependencies for graph1 - HIP_CHECK(hipGraphAddDependencies(graph1, &memcpyH2D, - &event_rec_node, 1)); - HIP_CHECK(hipGraphAddDependencies(graph1, &event_rec_node, - &kernelnode_1, 1)); - 
HIP_CHECK(hipGraphAddDependencies(graph1, &kernelnode_1, - &memcpyD2H_1, 1)); + HIP_CHECK(hipGraphAddDependencies(graph1, &memcpyH2D, &event_rec_node, 1)); + HIP_CHECK(hipGraphAddDependencies(graph1, &event_rec_node, &kernelnode_1, 1)); + HIP_CHECK(hipGraphAddDependencies(graph1, &kernelnode_1, &memcpyD2H_1, 1)); // Graph2 creation ........... // Create event1 record node in graph2 - HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph2, nullptr, 0, - event1)); + HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph2, nullptr, 0, event1)); // Create memcpy and kernel nodes for graph2 hipGraphNode_t memcpyD2H_2, kernelnode_2; hipKernelNodeParams kernelNodeParams2{}; - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_2, graph2, nullptr, 0, - out_h_g2, out_d_g2, memsize, hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_2, graph2, nullptr, 0, out_h_g2, out_d_g2, memsize, + hipMemcpyDeviceToHost)); - void* kernelArgs2[] = {&inp_d, &out_d_g2, reinterpret_cast(&NElem)}; - kernelNodeParams2.func = - reinterpret_cast(HipTest::vector_cubic); + void* kernelArgs2[] = {&inp_d, &out_d_g2, reinterpret_cast(&NElem)}; + kernelNodeParams2.func = reinterpret_cast(HipTest::vector_cubic); kernelNodeParams2.gridDim = dim3(blocks); kernelNodeParams2.blockDim = dim3(threadsPerBlock); kernelNodeParams2.sharedMemBytes = 0; kernelNodeParams2.kernelParams = reinterpret_cast(kernelArgs2); kernelNodeParams2.extra = nullptr; - HIP_CHECK(hipGraphAddKernelNode(&kernelnode_2, graph2, nullptr, 0, - &kernelNodeParams2)); + HIP_CHECK(hipGraphAddKernelNode(&kernelnode_2, graph2, nullptr, 0, &kernelNodeParams2)); // Create dependencies for graph2 - HIP_CHECK(hipGraphAddDependencies(graph2, &event_wait_node, - &kernelnode_2, 1)); - HIP_CHECK(hipGraphAddDependencies(graph2, &kernelnode_2, - &memcpyD2H_2, 1)); + HIP_CHECK(hipGraphAddDependencies(graph2, &event_wait_node, &kernelnode_2, 1)); + HIP_CHECK(hipGraphAddDependencies(graph2, &kernelnode_2, &memcpyD2H_2, 1)); // 
Instantiate and launch the graphs HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0)); @@ -187,16 +179,16 @@ static void validate_hipGraphAddEventWaitNode_internodedep(int test, HIP_CHECK(hipStreamSynchronize(streamForGraph2)); // Validate output bool btestPassed1 = true; - for (uint32_t i = 0; i < LEN; i++) { - if (out_h_g1[i] != (inp_h[i]*inp_h[i])) { + for (uint32_t i = 0; i < N; i++) { + if (out_h_g1[i] != (inp_h[i] * inp_h[i])) { btestPassed1 = false; break; } } REQUIRE(btestPassed1 == true); bool btestPassed2 = true; - for (uint32_t i = 0; i < LEN; i++) { - if (out_h_g2[i] != (inp_h[i]*inp_h[i]*inp_h[i])) { + for (uint32_t i = 0; i < N; i++) { + if (out_h_g2[i] != (inp_h[i] * inp_h[i] * inp_h[i])) { btestPassed2 = false; break; } @@ -247,55 +239,81 @@ TEST_CASE("Unit_hipGraphAddEventWaitNode_MultGraphOneStrmDependency") { */ TEST_CASE("Unit_hipGraphAddEventWaitNode_differentFlags") { SECTION("flag = hipEventBlockingSync") { - validate_hipGraphAddEventWaitNode_internodedep(0, 1, - hipEventBlockingSync); + validate_hipGraphAddEventWaitNode_internodedep(0, 1, hipEventBlockingSync); } SECTION("graph = hipEventDisableTiming") { - validate_hipGraphAddEventWaitNode_internodedep(0, 1, - hipEventDisableTiming); + validate_hipGraphAddEventWaitNode_internodedep(0, 1, hipEventDisableTiming); } } /** - * Scenario 6 + * Scenario 6: Positive parameter tests */ -TEST_CASE("Unit_hipGraphAddEventWaitNode_Negative") { +TEST_CASE("Unit_hipGraphAddEventWaitNode_Positive_Parameters") { hipGraph_t graph; HIP_CHECK(hipGraphCreate(&graph, 0)); hipEvent_t event; HIP_CHECK(hipEventCreate(&event)); hipGraphNode_t eventwait; - SECTION("pGraphNode = nullptr") { - REQUIRE(hipErrorInvalidValue == hipGraphAddEventWaitNode(nullptr, - graph, nullptr, 0, event)); + hipGraphNode_t dep_node = nullptr; + hipGraphNode_t dep_node2 = nullptr; + HIP_CHECK(hipGraphAddEmptyNode(&dep_node, graph, nullptr, 0)); + HIP_CHECK(hipGraphAddEmptyNode(&dep_node2, graph, nullptr, 0)); + 
hipGraphNode_t dep_nodes[] = {dep_node, dep_node2}; + + size_t numDeps = 0; + SECTION("numDependencies is zero, dependencies is not nullptr") { + HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, dep_nodes, 0, event)); + HIP_CHECK(hipGraphNodeGetDependencies(eventwait, nullptr, &numDeps)); + REQUIRE(numDeps == 0); } - SECTION("graph = nullptr") { - REQUIRE(hipErrorInvalidValue == hipGraphAddEventWaitNode(&eventwait, - nullptr, nullptr, 0, event)); + SECTION("numDependencies < dependencies length") { + HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, dep_nodes, 1, event)); + HIP_CHECK(hipGraphNodeGetDependencies(eventwait, nullptr, &numDeps)); + REQUIRE(numDeps == 1); } - SECTION("pDependencies = nullptr") { - REQUIRE(hipErrorInvalidValue == hipGraphAddEventWaitNode(&eventwait, - graph, nullptr, 1, event)); - } - - SECTION("event = nullptr") { - REQUIRE(hipErrorInvalidValue == hipGraphAddEventWaitNode(&eventwait, - graph, nullptr, 0, nullptr)); - } - - SECTION("graph is uninitialized") { - hipGraph_t graph_uninit{}; - REQUIRE(hipErrorInvalidValue == hipGraphAddEventWaitNode(&eventwait, - graph_uninit, nullptr, 0, event)); - } - - SECTION("event is uninitialized") { - hipEvent_t event_uninit{}; - REQUIRE(hipErrorInvalidValue == hipGraphAddEventWaitNode(&eventwait, - graph, nullptr, 0, event_uninit)); + SECTION("numDependencies == dependencies length") { + HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, dep_nodes, 2, event)); + HIP_CHECK(hipGraphNodeGetDependencies(eventwait, nullptr, &numDeps)); + REQUIRE(numDeps == 2); + } + + HIP_CHECK(hipGraphDestroy(graph)); + HIP_CHECK(hipEventDestroy(event)); +} + +/** + * Scenario 7 + */ +TEST_CASE("Unit_hipGraphAddEventWaitNode_Negative") { + using namespace std::placeholders; + hipGraph_t graph; + HIP_CHECK(hipGraphCreate(&graph, 0)); + hipEvent_t event; + HIP_CHECK(hipEventCreate(&event)); + hipGraphNode_t eventwait; + + GraphAddNodeCommonNegativeTests(std::bind(hipGraphAddEventWaitNode, _1, _2, _3, _4, 
event), + graph); + + SECTION("event = nullptr") { + HIP_CHECK_ERROR(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, nullptr), + hipErrorInvalidValue); + } + + SECTION("graph is uninitialized") { + hipGraph_t graph_uninit{}; + HIP_CHECK_ERROR(hipGraphAddEventWaitNode(&eventwait, graph_uninit, nullptr, 0, event), + hipErrorInvalidValue); + } + + SECTION("event is uninitialized") { + hipEvent_t event_uninit{}; + HIP_CHECK_ERROR(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event_uninit), + hipErrorInvalidValue); } HIP_CHECK(hipGraphDestroy(graph)); diff --git a/projects/hip-tests/catch/unit/graph/hipGraphEventRecordNodeGetEvent.cc b/projects/hip-tests/catch/unit/graph/hipGraphEventRecordNodeGetEvent.cc index 799841b6cb..49fab00121 100644 --- a/projects/hip-tests/catch/unit/graph/hipGraphEventRecordNodeGetEvent.cc +++ b/projects/hip-tests/catch/unit/graph/hipGraphEventRecordNodeGetEvent.cc @@ -26,11 +26,12 @@ with the event set in hipGraphAddEventRecordNode. - Output event is a nullptr. - Input node is an empty node. - Input node is a memset node. + - Input node is event wait node - Input node is an uninitialized node. 
*/ -#include #include +#include #include /** @@ -42,8 +43,7 @@ static void validateEventRecordNodeGetEvent(unsigned flag) { hipEvent_t event, event_out; HIP_CHECK(hipEventCreateWithFlags(&event, flag)); hipGraphNode_t eventrec; - HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, - event)); + HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event)); HIP_CHECK(hipGraphEventRecordNodeGetEvent(eventrec, &event_out)); // validate set event and get event are same REQUIRE(event == event_out); @@ -77,31 +77,32 @@ TEST_CASE("Unit_hipGraphEventRecordNodeGetEvent_Functional") { TEST_CASE("Unit_hipGraphEventRecordNodeGetEvent_Negative") { hipGraph_t graph; HIP_CHECK(hipGraphCreate(&graph, 0)); - hipEvent_t event, event_out; - HIP_CHECK(hipEventCreate(&event)); - hipGraphNode_t eventrec; - HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, - event)); + hipEvent_t event_out; + hipEvent_t event1, event2; + HIP_CHECK(hipEventCreate(&event1)); + HIP_CHECK(hipEventCreate(&event2)); + hipGraphNode_t eventrec, eventwait; + HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1)); + HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event2)); + SECTION("node = nullptr") { - REQUIRE(hipErrorInvalidValue == hipGraphEventRecordNodeGetEvent(nullptr, - &event_out)); + HIP_CHECK_ERROR(hipGraphEventRecordNodeGetEvent(nullptr, &event_out), hipErrorInvalidValue); } SECTION("event_out = nullptr") { - REQUIRE(hipErrorInvalidValue == hipGraphEventRecordNodeGetEvent(eventrec, - nullptr)); + HIP_CHECK_ERROR(hipGraphEventRecordNodeGetEvent(eventrec, nullptr), hipErrorInvalidValue); } SECTION("input node is empty node") { hipGraphNode_t EmptyGraphNode; HIP_CHECK(hipGraphAddEmptyNode(&EmptyGraphNode, graph, nullptr, 0)); - REQUIRE(hipErrorInvalidValue == - hipGraphEventRecordNodeGetEvent(EmptyGraphNode, &event_out)); + HIP_CHECK_ERROR(hipGraphEventRecordNodeGetEvent(EmptyGraphNode, &event_out), + hipErrorInvalidValue); 
} SECTION("input node is memset node") { constexpr size_t Nbytes = 1024; - char *A_d; + char* A_d; hipGraphNode_t memset_A; hipMemsetParams memsetParams{}; HIP_CHECK(hipMalloc(&A_d, Nbytes)); @@ -112,19 +113,21 @@ TEST_CASE("Unit_hipGraphEventRecordNodeGetEvent_Negative") { memsetParams.elementSize = sizeof(char); memsetParams.width = Nbytes; memsetParams.height = 1; - HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, - &memsetParams)); - REQUIRE(hipErrorInvalidValue == - hipGraphEventRecordNodeGetEvent(memset_A, &event_out)); + HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, &memsetParams)); + HIP_CHECK_ERROR(hipGraphEventRecordNodeGetEvent(memset_A, &event_out), hipErrorInvalidValue); HIP_CHECK(hipFree(A_d)); } + SECTION("input node is event wait node") { + HIP_CHECK_ERROR(hipGraphEventRecordNodeGetEvent(eventwait, &event_out), hipErrorInvalidValue); + } + SECTION("input node is uninitialized node") { hipGraphNode_t node_unit{}; - REQUIRE(hipErrorInvalidValue == - hipGraphEventRecordNodeGetEvent(node_unit, &event_out)); + HIP_CHECK_ERROR(hipGraphEventRecordNodeGetEvent(node_unit, &event_out), hipErrorInvalidValue); } HIP_CHECK(hipGraphDestroy(graph)); - HIP_CHECK(hipEventDestroy(event)); + HIP_CHECK(hipEventDestroy(event1)); + HIP_CHECK(hipEventDestroy(event2)); } diff --git a/projects/hip-tests/catch/unit/graph/hipGraphEventRecordNodeSetEvent.cc b/projects/hip-tests/catch/unit/graph/hipGraphEventRecordNodeSetEvent.cc index 9c0df129d6..93bb0418ff 100644 --- a/projects/hip-tests/catch/unit/graph/hipGraphEventRecordNodeSetEvent.cc +++ b/projects/hip-tests/catch/unit/graph/hipGraphEventRecordNodeSetEvent.cc @@ -30,14 +30,16 @@ Testcase Scenarios : - Input event parameter is nullptr. - Empty node is passed as input node. - Memset node is passed as input node. + - Event wait node is passed as input node. - Input node is an uninitialized node. - Input event is an uninitialized event. 
*/ -#include #include +#include #include + /** * Local Function: Set Get test */ @@ -49,8 +51,7 @@ static void validateEventRecordNodeSetEvent(unsigned flag) { HIP_CHECK(hipEventCreate(&event1)); HIP_CHECK(hipEventCreateWithFlags(&event2, flag)); hipGraphNode_t eventrec; - HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, - event1)); + HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1)); // Set a different event HIP_CHECK(hipGraphEventRecordNodeSetEvent(eventrec, event2)); HIP_CHECK(hipGraphEventRecordNodeGetEvent(eventrec, &event_out)); @@ -73,11 +74,9 @@ static void setEventWaitNode() { HIP_CHECK(hipEventCreate(&event1)); HIP_CHECK(hipEventCreate(&event2)); hipGraphNode_t eventwait; - HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, - event1)); + HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event1)); // Set a different event eventwait using hipGraphEventRecordNodeSetEvent - REQUIRE(hipErrorInvalidValue == - hipGraphEventRecordNodeSetEvent(eventwait, event2)); + HIP_CHECK_ERROR(hipGraphEventRecordNodeSetEvent(eventwait, event2), hipErrorInvalidValue); // Free resources HIP_CHECK(hipGraphDestroy(graph)); HIP_CHECK(hipEventDestroy(event1)); @@ -98,13 +97,11 @@ TEST_CASE("Unit_hipGraphEventRecordNodeSetEvent_SetEventProperty") { HIP_CHECK(hipEventCreateWithFlags(&event2_end, hipEventDisableTiming)); // Create nodes hipGraphNode_t event_start_rec, event_end_rec; - HIP_CHECK(hipGraphAddEventRecordNode(&event_start_rec, graph, nullptr, 0, - event1_start)); - HIP_CHECK(hipGraphAddEventRecordNode(&event_end_rec, graph, nullptr, 0, - event1_end)); + HIP_CHECK(hipGraphAddEventRecordNode(&event_start_rec, graph, nullptr, 0, event1_start)); + HIP_CHECK(hipGraphAddEventRecordNode(&event_end_rec, graph, nullptr, 0, event1_end)); // Create memset node constexpr size_t Nbytes = 1024; - char *A_d; + char* A_d; hipGraphNode_t memset_A; hipMemsetParams memsetParams{}; HIP_CHECK(hipMalloc(&A_d, Nbytes)); 
@@ -115,8 +112,7 @@ TEST_CASE("Unit_hipGraphEventRecordNodeSetEvent_SetEventProperty") { memsetParams.elementSize = sizeof(char); memsetParams.width = Nbytes; memsetParams.height = 1; - HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, - &memsetParams)); + HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, &memsetParams)); // Create dependencies // event_start_rec --> memset_A --> event_end_rec HIP_CHECK(hipGraphAddDependencies(graph, &event_start_rec, &memset_A, 1)); @@ -132,8 +128,7 @@ TEST_CASE("Unit_hipGraphEventRecordNodeSetEvent_SetEventProperty") { // Validate by measuring time difference between event_end_rec & // event_start_rec float t = 0.0f; - REQUIRE(hipSuccess == hipEventElapsedTime(&t, event1_start, - event1_end)); + REQUIRE(hipSuccess == hipEventElapsedTime(&t, event1_start, event1_end)); REQUIRE(t > 0.0f); // Change the event property after instantiation HIP_CHECK(hipGraphEventRecordNodeSetEvent(event_start_rec, event2_start)); @@ -145,8 +140,7 @@ TEST_CASE("Unit_hipGraphEventRecordNodeSetEvent_SetEventProperty") { // hipErrorInvalidHandle when events are created using // hipEventDisableTiming flag. 
t = 0.0f; - REQUIRE(hipErrorInvalidHandle == - hipEventElapsedTime(&t, event2_start, event2_end)); + HIP_CHECK_ERROR(hipEventElapsedTime(&t, event2_start, event2_end), hipErrorInvalidHandle); // Free resources HIP_CHECK(hipGraphExecDestroy(graphExec)); HIP_CHECK(hipStreamDestroy(streamForGraph)); @@ -185,28 +179,24 @@ TEST_CASE("Unit_hipGraphEventRecordNodeSetEvent_Negative") { HIP_CHECK(hipEventCreate(&event1)); HIP_CHECK(hipEventCreate(&event2)); hipGraphNode_t eventrec; - HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, - event1)); + HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1)); SECTION("node = nullptr") { - REQUIRE(hipErrorInvalidValue == hipGraphEventRecordNodeSetEvent(nullptr, - event2)); + HIP_CHECK_ERROR(hipGraphEventRecordNodeSetEvent(nullptr, event2), hipErrorInvalidValue); } SECTION("event_out = nullptr") { - REQUIRE(hipErrorInvalidValue == hipGraphEventRecordNodeSetEvent(eventrec, - nullptr)); + HIP_CHECK_ERROR(hipGraphEventRecordNodeSetEvent(eventrec, nullptr), hipErrorInvalidValue); } SECTION("input node is empty node") { hipGraphNode_t EmptyGraphNode; HIP_CHECK(hipGraphAddEmptyNode(&EmptyGraphNode, graph, nullptr, 0)); - REQUIRE(hipErrorInvalidValue == - hipGraphEventRecordNodeSetEvent(EmptyGraphNode, event2)); + HIP_CHECK_ERROR(hipGraphEventRecordNodeSetEvent(EmptyGraphNode, event2), hipErrorInvalidValue); } SECTION("input node is memset node") { constexpr size_t Nbytes = 1024; - char *A_d; + char* A_d; hipGraphNode_t memset_A; hipMemsetParams memsetParams{}; HIP_CHECK(hipMalloc(&A_d, Nbytes)); @@ -217,10 +207,8 @@ TEST_CASE("Unit_hipGraphEventRecordNodeSetEvent_Negative") { memsetParams.elementSize = sizeof(char); memsetParams.width = Nbytes; memsetParams.height = 1; - HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, - &memsetParams)); - REQUIRE(hipErrorInvalidValue == - hipGraphEventRecordNodeSetEvent(memset_A, event2)); + HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, 
&memsetParams)); + HIP_CHECK_ERROR(hipGraphEventRecordNodeSetEvent(memset_A, event2), hipErrorInvalidValue); HIP_CHECK(hipFree(A_d)); } @@ -230,14 +218,12 @@ TEST_CASE("Unit_hipGraphEventRecordNodeSetEvent_Negative") { SECTION("input node is uninitialized node") { hipGraphNode_t node_uninit{}; - REQUIRE(hipErrorInvalidValue == - hipGraphEventRecordNodeSetEvent(node_uninit, event2)); + HIP_CHECK_ERROR(hipGraphEventRecordNodeSetEvent(node_uninit, event2), hipErrorInvalidValue); } SECTION("input event is uninitialized") { hipEvent_t event_uninit{}; - REQUIRE(hipErrorInvalidValue == - hipGraphEventRecordNodeSetEvent(eventrec, event_uninit)); + HIP_CHECK_ERROR(hipGraphEventRecordNodeSetEvent(eventrec, event_uninit), hipErrorInvalidValue); } HIP_CHECK(hipGraphDestroy(graph)); diff --git a/projects/hip-tests/catch/unit/graph/hipGraphEventWaitNodeGetEvent.cc b/projects/hip-tests/catch/unit/graph/hipGraphEventWaitNodeGetEvent.cc index 1dfefdc7cb..74bbf87c87 100644 --- a/projects/hip-tests/catch/unit/graph/hipGraphEventWaitNodeGetEvent.cc +++ b/projects/hip-tests/catch/unit/graph/hipGraphEventWaitNodeGetEvent.cc @@ -26,13 +26,15 @@ with the event set in hipGraphAddEventWaitNode. - Output event parameter is passed as nullptr. - Input node parameter is an empty node. - Input node parameter is a memset node. + - Input node parameter is a event record node. - Input node parameter is an uninitialized node. 
*/ -#include #include +#include #include + /** * Local Function */ @@ -42,8 +44,7 @@ static void validateEventWaitNodeGetEvent(unsigned flag) { hipEvent_t event, event_out; HIP_CHECK(hipEventCreateWithFlags(&event, flag)); hipGraphNode_t eventwait; - HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, - event)); + HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event)); HIP_CHECK(hipGraphEventWaitNodeGetEvent(eventwait, &event_out)); // validate set event and get event are same REQUIRE(event == event_out); @@ -77,31 +78,32 @@ TEST_CASE("Unit_hipGraphEventWaitNodeGetEvent_Functional") { TEST_CASE("Unit_hipGraphEventWaitNodeGetEvent_Negative") { hipGraph_t graph; HIP_CHECK(hipGraphCreate(&graph, 0)); - hipEvent_t event, event_out; - HIP_CHECK(hipEventCreate(&event)); - hipGraphNode_t eventwait; - HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, - event)); + hipEvent_t event_out; + hipEvent_t event1, event2; + HIP_CHECK(hipEventCreate(&event1)); + HIP_CHECK(hipEventCreate(&event2)); + hipGraphNode_t eventrec, eventwait; + HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1)); + HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event2)); + SECTION("node = nullptr") { - REQUIRE(hipErrorInvalidValue == hipGraphEventWaitNodeGetEvent(nullptr, - &event_out)); + HIP_CHECK_ERROR(hipGraphEventWaitNodeGetEvent(nullptr, &event_out), hipErrorInvalidValue); } SECTION("event_out = nullptr") { - REQUIRE(hipErrorInvalidValue == hipGraphEventWaitNodeGetEvent(eventwait, - nullptr)); + HIP_CHECK_ERROR(hipGraphEventWaitNodeGetEvent(eventwait, nullptr), hipErrorInvalidValue); } SECTION("input node is empty node") { hipGraphNode_t EmptyGraphNode; HIP_CHECK(hipGraphAddEmptyNode(&EmptyGraphNode, graph, nullptr, 0)); - REQUIRE(hipErrorInvalidValue == - hipGraphEventWaitNodeGetEvent(EmptyGraphNode, &event_out)); + HIP_CHECK_ERROR(hipGraphEventWaitNodeGetEvent(EmptyGraphNode, &event_out), + 
hipErrorInvalidValue); } SECTION("input node is memset node") { constexpr size_t Nbytes = 1024; - char *A_d; + char* A_d; hipGraphNode_t memset_A; hipMemsetParams memsetParams{}; HIP_CHECK(hipMalloc(&A_d, Nbytes)); @@ -112,19 +114,21 @@ TEST_CASE("Unit_hipGraphEventWaitNodeGetEvent_Negative") { memsetParams.elementSize = sizeof(char); memsetParams.width = Nbytes; memsetParams.height = 1; - HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, - &memsetParams)); - REQUIRE(hipErrorInvalidValue == - hipGraphEventWaitNodeGetEvent(memset_A, &event_out)); + HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, &memsetParams)); + HIP_CHECK_ERROR(hipGraphEventWaitNodeGetEvent(memset_A, &event_out), hipErrorInvalidValue); HIP_CHECK(hipFree(A_d)); } + SECTION("input node is event record node") { + HIP_CHECK_ERROR(hipGraphEventWaitNodeGetEvent(eventrec, &event_out), hipErrorInvalidValue); + } + SECTION("input node is uninitialized") { hipGraphNode_t node_uninit{}; - REQUIRE(hipErrorInvalidValue == - hipGraphEventWaitNodeGetEvent(node_uninit, &event_out)); + HIP_CHECK_ERROR(hipGraphEventWaitNodeGetEvent(node_uninit, &event_out), hipErrorInvalidValue); } HIP_CHECK(hipGraphDestroy(graph)); - HIP_CHECK(hipEventDestroy(event)); + HIP_CHECK(hipEventDestroy(event1)); + HIP_CHECK(hipEventDestroy(event2)); } diff --git a/projects/hip-tests/catch/unit/graph/hipGraphEventWaitNodeSetEvent.cc b/projects/hip-tests/catch/unit/graph/hipGraphEventWaitNodeSetEvent.cc index 8751ffe8ed..6c3132262d 100644 --- a/projects/hip-tests/catch/unit/graph/hipGraphEventWaitNodeSetEvent.cc +++ b/projects/hip-tests/catch/unit/graph/hipGraphEventWaitNodeSetEvent.cc @@ -37,11 +37,10 @@ Testcase Scenarios : - Input event is an uninitialized node. 
*/ -#include #include +#include #include -#define LEN 512 /** * Local Function @@ -54,8 +53,7 @@ static void validateEventWaitNodeSetEvent(unsigned flag) { HIP_CHECK(hipEventCreate(&event1)); HIP_CHECK(hipEventCreateWithFlags(&event2, flag)); hipGraphNode_t eventwait; - HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, - event1)); + HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event1)); // Set a different event HIP_CHECK(hipGraphEventWaitNodeSetEvent(eventwait, event2)); HIP_CHECK(hipGraphEventWaitNodeGetEvent(eventwait, &event_out)); @@ -78,11 +76,9 @@ static void setEventRecordNode() { HIP_CHECK(hipEventCreate(&event1)); HIP_CHECK(hipEventCreate(&event2)); hipGraphNode_t eventrec; - HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, - event1)); + HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1)); // Set a different event eventrec using hipGraphEventWaitNodeSetEvent - REQUIRE(hipErrorInvalidValue == - hipGraphEventWaitNodeSetEvent(eventrec, event2)); + HIP_CHECK_ERROR(hipGraphEventWaitNodeSetEvent(eventrec, event2), hipErrorInvalidValue); // Free resources HIP_CHECK(hipGraphDestroy(graph)); HIP_CHECK(hipEventDestroy(event1)); @@ -93,11 +89,12 @@ static void setEventRecordNode() { * Scenario 2 */ TEST_CASE("Unit_hipGraphEventWaitNodeSetEvent_SetProp") { - size_t memsize = LEN * sizeof(int); + constexpr size_t N = 512; + size_t memsize = N * sizeof(int); constexpr auto blocksPerCU = 6; // to hide latency constexpr auto threadsPerBlock = 256; - unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, LEN); - size_t NElem{LEN}; + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + size_t NElem{N}; hipGraph_t graph1, graph2; hipStream_t streamForGraph1, streamForGraph2; hipGraphExec_t graphExec1, graphExec2; @@ -123,67 +120,56 @@ TEST_CASE("Unit_hipGraphEventWaitNodeSetEvent_SetProp") { HIP_CHECK(hipMalloc(&out_d_g1, memsize)); 
HIP_CHECK(hipMalloc(&out_d_g2, memsize)); // Initialize host buffer - for (uint32_t i = 0; i < LEN; i++) { + for (uint32_t i = 0; i < N; i++) { inp_h[i] = i; out_h_g1[i] = 0; out_h_g2[i] = 0; } // Graph1 creation ........... // Create event1 record node in graph1 - HIP_CHECK(hipGraphAddEventRecordNode(&event_rec_node, graph1, nullptr, 0, - event1)); + HIP_CHECK(hipGraphAddEventRecordNode(&event_rec_node, graph1, nullptr, 0, event1)); // Create memcpy and kernel nodes for graph1 hipGraphNode_t memcpyH2D, memcpyD2H_1, kernelnode_1; hipKernelNodeParams kernelNodeParams1{}; - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph1, nullptr, 0, inp_d, - inp_h, memsize, hipMemcpyHostToDevice)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_1, graph1, nullptr, 0, - out_h_g1, out_d_g1, memsize, hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph1, nullptr, 0, inp_d, inp_h, memsize, + hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_1, graph1, nullptr, 0, out_h_g1, out_d_g1, memsize, + hipMemcpyDeviceToHost)); - void* kernelArgs1[] = {&inp_d, &out_d_g1, reinterpret_cast(&NElem)}; - kernelNodeParams1.func = - reinterpret_cast(HipTest::vector_square); + void* kernelArgs1[] = {&inp_d, &out_d_g1, reinterpret_cast(&NElem)}; + kernelNodeParams1.func = reinterpret_cast(HipTest::vector_square); kernelNodeParams1.gridDim = dim3(blocks); kernelNodeParams1.blockDim = dim3(threadsPerBlock); kernelNodeParams1.sharedMemBytes = 0; kernelNodeParams1.kernelParams = reinterpret_cast(kernelArgs1); kernelNodeParams1.extra = nullptr; - HIP_CHECK(hipGraphAddKernelNode(&kernelnode_1, graph1, nullptr, 0, - &kernelNodeParams1)); + HIP_CHECK(hipGraphAddKernelNode(&kernelnode_1, graph1, nullptr, 0, &kernelNodeParams1)); // Create dependencies for graph1 - HIP_CHECK(hipGraphAddDependencies(graph1, &memcpyH2D, - &event_rec_node, 1)); - HIP_CHECK(hipGraphAddDependencies(graph1, &event_rec_node, - &kernelnode_1, 1)); - 
HIP_CHECK(hipGraphAddDependencies(graph1, &kernelnode_1, - &memcpyD2H_1, 1)); + HIP_CHECK(hipGraphAddDependencies(graph1, &memcpyH2D, &event_rec_node, 1)); + HIP_CHECK(hipGraphAddDependencies(graph1, &event_rec_node, &kernelnode_1, 1)); + HIP_CHECK(hipGraphAddDependencies(graph1, &kernelnode_1, &memcpyD2H_1, 1)); // Graph2 creation ........... // Create event1 record node in graph2 - HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph2, nullptr, 0, - event1)); + HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph2, nullptr, 0, event1)); // Create memcpy and kernel nodes for graph2 hipGraphNode_t memcpyD2H_2, kernelnode_2; hipKernelNodeParams kernelNodeParams2{}; - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_2, graph2, nullptr, 0, - out_h_g2, out_d_g2, memsize, hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H_2, graph2, nullptr, 0, out_h_g2, out_d_g2, memsize, + hipMemcpyDeviceToHost)); - void* kernelArgs2[] = {&inp_d, &out_d_g2, reinterpret_cast(&NElem)}; - kernelNodeParams2.func = - reinterpret_cast(HipTest::vector_cubic); + void* kernelArgs2[] = {&inp_d, &out_d_g2, reinterpret_cast(&NElem)}; + kernelNodeParams2.func = reinterpret_cast(HipTest::vector_cubic); kernelNodeParams2.gridDim = dim3(blocks); kernelNodeParams2.blockDim = dim3(threadsPerBlock); kernelNodeParams2.sharedMemBytes = 0; kernelNodeParams2.kernelParams = reinterpret_cast(kernelArgs2); kernelNodeParams2.extra = nullptr; - HIP_CHECK(hipGraphAddKernelNode(&kernelnode_2, graph2, nullptr, 0, - &kernelNodeParams2)); + HIP_CHECK(hipGraphAddKernelNode(&kernelnode_2, graph2, nullptr, 0, &kernelNodeParams2)); // Create dependencies for graph2 - HIP_CHECK(hipGraphAddDependencies(graph2, &event_wait_node, - &kernelnode_2, 1)); - HIP_CHECK(hipGraphAddDependencies(graph2, &kernelnode_2, - &memcpyD2H_2, 1)); + HIP_CHECK(hipGraphAddDependencies(graph2, &event_wait_node, &kernelnode_2, 1)); + HIP_CHECK(hipGraphAddDependencies(graph2, &kernelnode_2, &memcpyD2H_2, 1)); // 
Instantiate and launch the graphs HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0)); @@ -198,16 +184,16 @@ TEST_CASE("Unit_hipGraphEventWaitNodeSetEvent_SetProp") { HIP_CHECK(hipStreamSynchronize(streamForGraph2)); // Validate output bool btestPassed1 = true; - for (uint32_t i = 0; i < LEN; i++) { - if (out_h_g1[i] != (inp_h[i]*inp_h[i])) { + for (uint32_t i = 0; i < N; i++) { + if (out_h_g1[i] != (inp_h[i] * inp_h[i])) { btestPassed1 = false; break; } } REQUIRE(btestPassed1 == true); bool btestPassed2 = true; - for (uint32_t i = 0; i < LEN; i++) { - if (out_h_g2[i] != (inp_h[i]*inp_h[i]*inp_h[i])) { + for (uint32_t i = 0; i < N; i++) { + if (out_h_g2[i] != (inp_h[i] * inp_h[i] * inp_h[i])) { btestPassed2 = false; break; } @@ -256,28 +242,24 @@ TEST_CASE("Unit_hipGraphEventWaitNodeSetEvent_Negative") { HIP_CHECK(hipEventCreate(&event1)); HIP_CHECK(hipEventCreate(&event2)); hipGraphNode_t eventwait; - HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, - event1)); + HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event1)); SECTION("node = nullptr") { - REQUIRE(hipErrorInvalidValue == hipGraphEventWaitNodeSetEvent( - nullptr, event2)); + HIP_CHECK_ERROR(hipGraphEventWaitNodeSetEvent(nullptr, event2), hipErrorInvalidValue); } SECTION("event = nullptr") { - REQUIRE(hipErrorInvalidValue == hipGraphEventWaitNodeSetEvent( - eventwait, nullptr)); + HIP_CHECK_ERROR(hipGraphEventWaitNodeSetEvent(eventwait, nullptr), hipErrorInvalidValue); } SECTION("input node is empty node") { hipGraphNode_t EmptyGraphNode; HIP_CHECK(hipGraphAddEmptyNode(&EmptyGraphNode, graph, nullptr, 0)); - REQUIRE(hipErrorInvalidValue == - hipGraphEventWaitNodeSetEvent(EmptyGraphNode, event2)); + HIP_CHECK_ERROR(hipGraphEventWaitNodeSetEvent(EmptyGraphNode, event2), hipErrorInvalidValue); } SECTION("input node is memset node") { constexpr size_t Nbytes = 1024; - char *A_d; + char* A_d; hipGraphNode_t memset_A; hipMemsetParams memsetParams{}; 
HIP_CHECK(hipMalloc(&A_d, Nbytes)); @@ -288,10 +270,8 @@ TEST_CASE("Unit_hipGraphEventWaitNodeSetEvent_Negative") { memsetParams.elementSize = sizeof(char); memsetParams.width = Nbytes; memsetParams.height = 1; - HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, - &memsetParams)); - REQUIRE(hipErrorInvalidValue == - hipGraphEventWaitNodeSetEvent(memset_A, event2)); + HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, &memsetParams)); + HIP_CHECK_ERROR(hipGraphEventWaitNodeSetEvent(memset_A, event2), hipErrorInvalidValue); HIP_CHECK(hipFree(A_d)); } @@ -301,14 +281,12 @@ TEST_CASE("Unit_hipGraphEventWaitNodeSetEvent_Negative") { SECTION("input node is uninitialized node") { hipGraphNode_t node_uninit{}; - REQUIRE(hipErrorInvalidValue == - hipGraphEventWaitNodeSetEvent(node_uninit, event2)); + HIP_CHECK_ERROR(hipGraphEventWaitNodeSetEvent(node_uninit, event2), hipErrorInvalidValue); } SECTION("input event is uninitialized") { hipEvent_t event_uninit{}; - REQUIRE(hipErrorInvalidValue == hipGraphEventWaitNodeSetEvent( - eventwait, event_uninit)); + HIP_CHECK_ERROR(hipGraphEventWaitNodeSetEvent(eventwait, event_uninit), hipErrorInvalidValue); } HIP_CHECK(hipGraphDestroy(graph)); diff --git a/projects/hip-tests/catch/unit/graph/hipGraphExecDestroy.cc b/projects/hip-tests/catch/unit/graph/hipGraphExecDestroy.cc index 7ed2e0d718..debeb32a52 100644 --- a/projects/hip-tests/catch/unit/graph/hipGraphExecDestroy.cc +++ b/projects/hip-tests/catch/unit/graph/hipGraphExecDestroy.cc @@ -20,26 +20,51 @@ THE SOFTWARE. #include /** -Negative Testcase Scenarios : -1) Pass hipGraphExecDestroy with nullptr. -2) Pass hipGraphExecDestroy with un-initilze structure. -3) Destroy graph before exec-graph destroyed and verify no crash occurs. 
-*/ + * @addtogroup hipGraphExecDestroy hipGraphExecDestroy + * @{ + * @ingroup GraphTest + * `hipGraphExecDestroy(hipGraphExec_t graphExec)` - + * Destroys an executable graph + */ + +/** + * Test Description + * ------------------------ + * - Test to verify API behavior with invalid arguments: + * -# GraphExec is nullptr + * -# GraphExec is uninitialized + * Test source + * ------------------------ + * - unit/graph/hipGraphExecDestroy.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEST_CASE("Unit_hipGraphExecDestroy_Negative_Parameters") { -TEST_CASE("Unit_hipGraphExecDestroy_Negative") { - hipError_t ret; SECTION("Pass hipGraphExecDestroy with nullptr") { - ret = hipGraphExecDestroy(nullptr); - REQUIRE(hipErrorInvalidValue == ret); + HIP_CHECK_ERROR(hipGraphExecDestroy(nullptr), hipErrorInvalidValue); } + SECTION("Pass hipGraphExecDestroy with un-initilze structure") { - hipGraphExec_t graphExec{}; - ret = hipGraphExecDestroy(graphExec); - REQUIRE(hipErrorInvalidValue == ret); + hipGraphExec_t graph_exec{}; + HIP_CHECK_ERROR(hipGraphExecDestroy(graph_exec), hipErrorInvalidValue); } } -TEST_CASE("Unit_hipGraphExecDestroy_Sequence") { +/** + * Test Description + * ------------------------ + * - Basic positive test for hipGraphExecDestroy + * - create an executable graph and then destroy it + * Test source + * ------------------------ + * - unit/graph/hipGraphExecDestroy.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEST_CASE("Unit_hipGraphExecDestroy_Positive_Basic") { hipGraph_t graph; hipGraphExec_t graphExec; hipStream_t streamForGraph; @@ -70,4 +95,3 @@ TEST_CASE("Unit_hipGraphExecDestroy_Sequence") { HIP_CHECK(hipGraphExecDestroy(graphExec)); HIP_CHECK(hipStreamDestroy(streamForGraph)); } - diff --git a/projects/hip-tests/catch/unit/graph/hipGraphExecEventRecordNodeSetEvent.cc b/projects/hip-tests/catch/unit/graph/hipGraphExecEventRecordNodeSetEvent.cc index 5a1cbe9997..e820dc1cdd 
100644 --- a/projects/hip-tests/catch/unit/graph/hipGraphExecEventRecordNodeSetEvent.cc +++ b/projects/hip-tests/catch/unit/graph/hipGraphExecEventRecordNodeSetEvent.cc @@ -33,7 +33,12 @@ Testcase Scenarios : the graph to create an executable graph. Change the event in the executable graph to event2. Verify that the event record node still contains event1. - 3) Negative Scenarios + 3) Scenario to verify that hipGraphExecEventRecordNodeSetEvent can set event + created on different device. Create an event record node with event1 and add it to graph. + Instantiate the graph to create an executable graph. Call the API to change the event in the + executable graph to event2 which has been created on different device. Verify that graph can be + launched and no error is reported. + 4) Negative Scenarios - Input executable graph is a nullptr. - Input node is a nullptr. - Input event to set is a nullptr. @@ -45,27 +50,26 @@ Testcase Scenarios : - Input node is a event wait node. */ -#include #include +#include #include -#define GRID_DIM 512 -#define BLK_DIM 512 -#define LEN (GRID_DIM * BLK_DIM) - /** * Kernel Functions to copy. 
*/ -static __global__ void copy_ker_func(int* a, int* b) { - int tx = blockIdx.x*blockDim.x + threadIdx.x; - if (tx < LEN) b[tx] = a[tx]; +static __global__ void copy_ker_func(int* a, int* b, size_t N) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + if (tx < N) b[tx] = a[tx]; } /** * Scenario 1: Functional scenario (See description Above) */ TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Functional") { - size_t memsize = LEN*sizeof(int); + constexpr size_t gridSize = 512; + constexpr size_t blockSize = 512; + constexpr size_t N = gridSize * blockSize; + size_t memsize = N * sizeof(int); hipGraph_t graph; HIP_CHECK(hipGraphCreate(&graph, 0)); // Create events @@ -75,10 +79,8 @@ TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Functional") { HIP_CHECK(hipEventCreate(&event2_end)); // Create nodes with event_start and event1_end hipGraphNode_t event_start_rec, event_end_rec; - HIP_CHECK(hipGraphAddEventRecordNode(&event_start_rec, graph, nullptr, 0, - event_start)); - HIP_CHECK(hipGraphAddEventRecordNode(&event_end_rec, graph, nullptr, 0, - event1_end)); + HIP_CHECK(hipGraphAddEventRecordNode(&event_start_rec, graph, nullptr, 0, event_start)); + HIP_CHECK(hipGraphAddEventRecordNode(&event_end_rec, graph, nullptr, 0, event1_end)); int *inp_h, *inp_d, *out_h, *out_d; // Allocate host buffers inp_h = reinterpret_cast(malloc(memsize)); @@ -89,7 +91,7 @@ TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Functional") { HIP_CHECK(hipMalloc(&inp_d, memsize)); HIP_CHECK(hipMalloc(&out_d, memsize)); // Initialize host buffer - for (uint32_t i = 0; i < LEN; i++) { + for (uint32_t i = 0; i < N; i++) { inp_h[i] = i; out_h[i] = 0; } @@ -97,44 +99,39 @@ TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Functional") { // Create memcpy and kernel nodes for graph hipGraphNode_t memcpyH2D, memcpyD2H, kernelnode; hipKernelNodeParams kernelNodeParams{}; - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph, nullptr, 0, inp_d, - inp_h, memsize, hipMemcpyHostToDevice)); - 
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H, graph, nullptr, 0, - out_h, out_d, memsize, hipMemcpyDeviceToHost)); - void* kernelArgs1[] = {&inp_d, &out_d}; - kernelNodeParams.func = reinterpret_cast(copy_ker_func); - kernelNodeParams.gridDim = dim3(GRID_DIM); - kernelNodeParams.blockDim = dim3(BLK_DIM); + size_t NElem{N}; + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph, nullptr, 0, inp_d, inp_h, memsize, + hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H, graph, nullptr, 0, out_h, out_d, memsize, + hipMemcpyDeviceToHost)); + void* kernelArgs1[] = {&inp_d, &out_d, reinterpret_cast(&NElem)}; + kernelNodeParams.func = reinterpret_cast(copy_ker_func); + kernelNodeParams.gridDim = dim3(gridSize); + kernelNodeParams.blockDim = dim3(blockSize); kernelNodeParams.sharedMemBytes = 0; kernelNodeParams.kernelParams = reinterpret_cast(kernelArgs1); kernelNodeParams.extra = nullptr; - HIP_CHECK(hipGraphAddKernelNode(&kernelnode, graph, nullptr, 0, - &kernelNodeParams)); + HIP_CHECK(hipGraphAddKernelNode(&kernelnode, graph, nullptr, 0, &kernelNodeParams)); // Create dependencies for graph - HIP_CHECK(hipGraphAddDependencies(graph, &event_start_rec, - &memcpyH2D, 1)); - HIP_CHECK(hipGraphAddDependencies(graph, &memcpyH2D, - &kernelnode, 1)); - HIP_CHECK(hipGraphAddDependencies(graph, &kernelnode, - &memcpyD2H, 1)); - HIP_CHECK(hipGraphAddDependencies(graph, &memcpyD2H, - &event_end_rec, 1)); + HIP_CHECK(hipGraphAddDependencies(graph, &event_start_rec, &memcpyH2D, 1)); + HIP_CHECK(hipGraphAddDependencies(graph, &memcpyH2D, &kernelnode, 1)); + HIP_CHECK(hipGraphAddDependencies(graph, &kernelnode, &memcpyD2H, 1)); + HIP_CHECK(hipGraphAddDependencies(graph, &memcpyD2H, &event_end_rec, 1)); // Instantiate and launch the graph hipStream_t streamForGraph; hipGraphExec_t graphExec; HIP_CHECK(hipStreamCreate(&streamForGraph)); HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); // Change the event at event_end_rec node to event2_end - 
HIP_CHECK(hipGraphExecEventRecordNodeSetEvent(graphExec, - event_end_rec, event2_end)); + HIP_CHECK(hipGraphExecEventRecordNodeSetEvent(graphExec, event_end_rec, event2_end)); HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph)); // Wait for graph to complete HIP_CHECK(hipStreamSynchronize(streamForGraph)); // Validate output bool btestPassed = true; - for (uint32_t i = 0; i < LEN; i++) { + for (uint32_t i = 0; i < N; i++) { if (out_h[i] != inp_h[i]) { btestPassed = false; break; @@ -147,8 +144,7 @@ TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Functional") { REQUIRE(t > 0.0f); // Since event1_end is never recorded, hipEventElapsedTime // should return error code. - REQUIRE(hipErrorInvalidResourceHandle == - hipEventElapsedTime(&t, event_start, event1_end)); + HIP_CHECK_ERROR(hipEventElapsedTime(&t, event_start, event1_end), hipErrorInvalidResourceHandle); // Free resources HIP_CHECK(hipGraphExecDestroy(graphExec)); HIP_CHECK(hipStreamDestroy(streamForGraph)); @@ -173,12 +169,10 @@ TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_VerifyEventNotChanged") { HIP_CHECK(hipEventCreate(&event1)); HIP_CHECK(hipEventCreate(&event2)); hipGraphNode_t eventrec; - HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, - event1)); + HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1)); hipGraphExec_t graphExec; HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); - HIP_CHECK(hipGraphExecEventRecordNodeSetEvent(graphExec, - eventrec, event2)); + HIP_CHECK(hipGraphExecEventRecordNodeSetEvent(graphExec, eventrec, event2)); HIP_CHECK(hipGraphEventRecordNodeGetEvent(eventrec, &event_out)); // validate set event and get event are same REQUIRE(event1 == event_out); @@ -190,7 +184,48 @@ TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_VerifyEventNotChanged") { } /** - * Scenario 3: Negative Tests + * Scenario 3: This test verifies event in node of the executable graph can be changed to event on + * different device + */ 
+TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Positive_DifferentDevices") { + const auto device_count = HipTest::getDeviceCount(); + if (device_count < 2) { + HipTest::HIP_SKIP_TEST("Skipping because devices < 2"); + return; + } + hipGraphExec_t graphExec; + hipStream_t streamForGraph; + hipGraph_t graph; + hipEvent_t event1, event2; + + HIP_CHECK(hipSetDevice(0)); + HIP_CHECK(hipEventCreate(&event1)); + HIP_CHECK(hipSetDevice(1)); + HIP_CHECK(hipEventCreate(&event2)); + + HIP_CHECK(hipSetDevice(0)); + hipGraphNode_t eventrec; + HIP_CHECK(hipGraphCreate(&graph, 0)); + HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1)); + + // Verify event on different device can be set in graphExec + // Instantiate and launch the graph + HIP_CHECK(hipStreamCreate(&streamForGraph)); + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + HIP_CHECK(hipGraphExecEventRecordNodeSetEvent(graphExec, eventrec, event2)); + HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph)); + // Wait for graph to complete + HIP_CHECK(hipStreamSynchronize(streamForGraph)); + // Free resources + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipStreamDestroy(streamForGraph)); + HIP_CHECK(hipGraphDestroy(graph)); + HIP_CHECK(hipEventDestroy(event2)); + HIP_CHECK(hipEventDestroy(event1)) +} + +/** + * Scenario 4: Negative Parameter Tests */ TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Negative") { hipGraph_t graph; @@ -199,11 +234,10 @@ TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Negative") { HIP_CHECK(hipEventCreate(&event1)); HIP_CHECK(hipEventCreate(&event2)); hipGraphNode_t eventrec; - HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, - event1)); + HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1)); // Create memset constexpr size_t Nbytes = 1024; - char *A_d; + char* A_d; hipGraphNode_t memset_A; hipMemsetParams memsetParams{}; HIP_CHECK(hipMalloc(&A_d, Nbytes)); @@ -219,66 +253,61 @@ 
TEST_CASE("Unit_hipGraphExecEventRecordNodeSetEvent_Negative") { HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); SECTION("hGraphExec = nullptr") { - REQUIRE(hipErrorInvalidValue == - hipGraphExecEventRecordNodeSetEvent(nullptr, eventrec, event2)); + HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(nullptr, eventrec, event2), + hipErrorInvalidValue); } SECTION("hNode = nullptr") { - REQUIRE(hipErrorInvalidValue == - hipGraphExecEventRecordNodeSetEvent(graphExec, nullptr, event2)); + HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(graphExec, nullptr, event2), + hipErrorInvalidValue); } SECTION("event = nullptr") { - REQUIRE(hipErrorInvalidValue == - hipGraphExecEventRecordNodeSetEvent(graphExec, eventrec, nullptr)); + HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(graphExec, eventrec, nullptr), + hipErrorInvalidValue); } SECTION("hGraphExec is uninitialized") { hipGraphExec_t graphExec1{}; - REQUIRE(hipErrorInvalidValue == - hipGraphExecEventRecordNodeSetEvent(graphExec1, eventrec, event2)); + HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(graphExec1, eventrec, event2), + hipErrorInvalidValue); } SECTION("hNode is uninitialized") { hipGraphNode_t dummy{}; - REQUIRE(hipErrorInvalidValue == - hipGraphExecEventRecordNodeSetEvent(graphExec, dummy, event2)); + HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(graphExec, dummy, event2), + hipErrorInvalidValue); } SECTION("event is uninitialized") { hipEvent_t event_dummy{}; - REQUIRE(hipErrorInvalidValue == - hipGraphExecEventRecordNodeSetEvent(graphExec, eventrec, - event_dummy)); + HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(graphExec, eventrec, event_dummy), + hipErrorInvalidValue); } SECTION("event record node does not exist") { hipGraph_t graph1; HIP_CHECK(hipGraphCreate(&graph1, 0)); - HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph1, nullptr, 0, - &memsetParams)); + HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph1, nullptr, 0, &memsetParams)); hipGraphExec_t 
graphExec1; HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0)); - REQUIRE(hipErrorInvalidValue == - hipGraphExecEventRecordNodeSetEvent(graphExec1, eventrec, event2)); + HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(graphExec1, eventrec, event2), + hipErrorInvalidValue); HIP_CHECK(hipGraphExecDestroy(graphExec1)); HIP_CHECK(hipGraphDestroy(graph1)); } SECTION("pass memset node as hNode") { - HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, - &memsetParams)); - REQUIRE(hipErrorInvalidValue == - hipGraphExecEventRecordNodeSetEvent(graphExec, memset_A, event2)); + HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, &memsetParams)); + HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(graphExec, memset_A, event2), + hipErrorInvalidValue); } SECTION("pass event wait node as hNode") { hipGraphNode_t event_wait_node; - HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph, nullptr, 0, - event1)); - REQUIRE(hipErrorInvalidValue == - hipGraphExecEventRecordNodeSetEvent(graphExec, event_wait_node, - event2)); + HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph, nullptr, 0, event1)); + HIP_CHECK_ERROR(hipGraphExecEventRecordNodeSetEvent(graphExec, event_wait_node, event2), + hipErrorInvalidValue); } HIP_CHECK(hipFree(A_d)); diff --git a/projects/hip-tests/catch/unit/graph/hipGraphExecEventWaitNodeSetEvent.cc b/projects/hip-tests/catch/unit/graph/hipGraphExecEventWaitNodeSetEvent.cc index f8ee63d6f8..529f8df3c1 100644 --- a/projects/hip-tests/catch/unit/graph/hipGraphExecEventWaitNodeSetEvent.cc +++ b/projects/hip-tests/catch/unit/graph/hipGraphExecEventWaitNodeSetEvent.cc @@ -47,33 +47,30 @@ Testcase Scenarios : - Pass event record node as input node. */ -#include #include +#include #include -#define GRID_DIM 64 -#define BLK_DIM 256 -#define LEN (GRID_DIM * BLK_DIM) -#define DELAY_IN_MS 2000 - /** * Kernel Functions to perform square and introduce delay in device. 
*/ -static __global__ void sqr_ker_func(int* a, int* b, int clockrate) { - int tx = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x; - if (tx < LEN) b[tx] = a[tx]*a[tx]; - uint64_t wait_t = DELAY_IN_MS, - start = clock64()/clockrate, cur; - do { cur = clock64()/clockrate - start;}while (cur < wait_t); +static __global__ void sqr_ker_func(int* a, int* b, size_t N, int clockrate, size_t delayMs) { + int tx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + if (tx < N) b[tx] = a[tx] * a[tx]; + uint64_t wait_t = delayMs, start = clock64() / clockrate, cur; + do { + cur = clock64() / clockrate - start; + } while (cur < wait_t); } -static __global__ void sqr_ker_func_gfx11(int* a, int* b, int clockrate) { +static __global__ void sqr_ker_func_gfx11(int* a, int* b, size_t N, int clockrate, size_t delayMs) { #if HT_AMD - int tx = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x; - if (tx < LEN) b[tx] = a[tx]*a[tx]; - uint64_t wait_t = DELAY_IN_MS, - start = wall_clock64()/clockrate, cur; - do { cur = wall_clock64()/clockrate - start;}while (cur < wait_t); + int tx = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + if (tx < N) b[tx] = a[tx] * a[tx]; + uint64_t wait_t = delayMs, start = wall_clock64() / clockrate, cur; + do { + cur = wall_clock64() / clockrate - start; + } while (cur < wait_t); #endif } @@ -81,7 +78,10 @@ static __global__ void sqr_ker_func_gfx11(int* a, int* b, int clockrate) { * Scenario 1: Test to validate setting different events in executable graph. 
*/ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") { - size_t memsize = LEN*sizeof(int); + constexpr size_t gridSize = 64; + constexpr size_t blockSize = 256; + constexpr size_t N = gridSize * blockSize; + size_t memsize = N * sizeof(int); hipGraph_t graph1, graph2; HIP_CHECK(hipGraphCreate(&graph1, 0)); HIP_CHECK(hipGraphCreate(&graph2, 0)); @@ -91,8 +91,7 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") { HIP_CHECK(hipEventCreate(&event2)); // Create nodes with event_start and event1_end hipGraphNode_t event_rec; - HIP_CHECK(hipGraphAddEventRecordNode(&event_rec, graph1, nullptr, 0, - event1)); + HIP_CHECK(hipGraphAddEventRecordNode(&event_rec, graph1, nullptr, 0, event1)); int *inp_h, *inp_d, *out_h, *out_d; // Allocate host buffers inp_h = reinterpret_cast(malloc(memsize)); @@ -103,7 +102,7 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") { HIP_CHECK(hipMalloc(&inp_d, memsize)); HIP_CHECK(hipMalloc(&out_d, memsize)); // Initialize host buffer - for (uint32_t i = 0; i < LEN; i++) { + for (uint32_t i = 0; i < N; i++) { inp_h[i] = i; out_h[i] = 0; } @@ -112,10 +111,12 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") { // MemcpyH2D -> kernel1 -> event_rec hipGraphNode_t memcpyH2D, kernelnode1; hipKernelNodeParams kernelNodeParams1{}; - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph1, nullptr, 0, inp_d, - inp_h, memsize, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph1, nullptr, 0, inp_d, inp_h, memsize, + hipMemcpyHostToDevice)); // Get device clock rate int clkRate = 0; + size_t NElem{N}; + size_t delayMs{2000}; if (IsGfx11()) { HIPCHECK(hipDeviceGetAttribute(&clkRate, hipDeviceAttributeWallClockRate, 0)); } else { @@ -123,29 +124,25 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") { } // kernel1 auto sqr_ker_func_used = IsGfx11() ? 
sqr_ker_func_gfx11 : sqr_ker_func; - void* kernelArgs[] = {&inp_d, &out_d, reinterpret_cast(&clkRate)}; - kernelNodeParams1.func = reinterpret_cast(sqr_ker_func_used); - kernelNodeParams1.gridDim = dim3(GRID_DIM); - kernelNodeParams1.blockDim = dim3(BLK_DIM); + void* kernelArgs[] = {&inp_d, &out_d, reinterpret_cast(&NElem), + reinterpret_cast(&clkRate), reinterpret_cast(&delayMs)}; + kernelNodeParams1.func = reinterpret_cast(sqr_ker_func_used); + kernelNodeParams1.gridDim = dim3(gridSize); + kernelNodeParams1.blockDim = dim3(blockSize); kernelNodeParams1.sharedMemBytes = 0; kernelNodeParams1.kernelParams = reinterpret_cast(kernelArgs); kernelNodeParams1.extra = nullptr; - HIP_CHECK(hipGraphAddKernelNode(&kernelnode1, graph1, nullptr, 0, - &kernelNodeParams1)); + HIP_CHECK(hipGraphAddKernelNode(&kernelnode1, graph1, nullptr, 0, &kernelNodeParams1)); // Create dependencies for graph1 - HIP_CHECK(hipGraphAddDependencies(graph1, &memcpyH2D, - &kernelnode1, 1)); - HIP_CHECK(hipGraphAddDependencies(graph1, &kernelnode1, - &event_rec, 1)); + HIP_CHECK(hipGraphAddDependencies(graph1, &memcpyH2D, &kernelnode1, 1)); + HIP_CHECK(hipGraphAddDependencies(graph1, &kernelnode1, &event_rec, 1)); // graph2 creation ........... 
// waitnode(event1) -> MemcpyD2H hipGraphNode_t event_wait_node, memcpyD2H; - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H, graph2, nullptr, 0, - out_h, out_d, memsize, hipMemcpyDeviceToHost)); - HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph2, nullptr, 0, - event1)); - HIP_CHECK(hipGraphAddDependencies(graph2, &event_wait_node, - &memcpyD2H, 1)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H, graph2, nullptr, 0, out_h, out_d, memsize, + hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddEventWaitNode(&event_wait_node, graph2, nullptr, 0, event1)); + HIP_CHECK(hipGraphAddDependencies(graph2, &event_wait_node, &memcpyD2H, 1)); // Instantiate graph1 and graph2 hipStream_t streamForGraph1, streamForGraph2; hipGraphExec_t graphExec1, graphExec2; @@ -160,8 +157,8 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") { HIP_CHECK(hipStreamSynchronize(streamForGraph2)); // Validate output bool btestPassed = true; - for (uint32_t i = 0; i < LEN; i++) { - if (out_h[i] != (inp_h[i]*inp_h[i])) { + for (uint32_t i = 0; i < N; i++) { + if (out_h[i] != (inp_h[i] * inp_h[i])) { btestPassed = false; break; } @@ -170,10 +167,8 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") { // hipGraphExecEventWaitNodeSetEvent() TEST // Change the event at event_wait_node node to event2 and // the event at event_rec node to event2. 
- HIP_CHECK(hipGraphExecEventRecordNodeSetEvent(graphExec1, - event_rec, event2)); - HIP_CHECK(hipGraphExecEventWaitNodeSetEvent(graphExec2, - event_wait_node, event2)); + HIP_CHECK(hipGraphExecEventRecordNodeSetEvent(graphExec1, event_rec, event2)); + HIP_CHECK(hipGraphExecEventWaitNodeSetEvent(graphExec2, event_wait_node, event2)); // Launch graph1 and graph2 HIP_CHECK(hipGraphLaunch(graphExec1, streamForGraph1)); HIP_CHECK(hipGraphLaunch(graphExec2, streamForGraph2)); @@ -181,8 +176,8 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_SetAndVerifyMemory") { HIP_CHECK(hipStreamSynchronize(streamForGraph2)); // Validate output btestPassed = true; - for (uint32_t i = 0; i < LEN; i++) { - if (out_h[i] != (inp_h[i]*inp_h[i])) { + for (uint32_t i = 0; i < N; i++) { + if (out_h[i] != (inp_h[i] * inp_h[i])) { btestPassed = false; break; } @@ -214,12 +209,10 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_VerifyEventNotChanged") { HIP_CHECK(hipEventCreate(&event1)); HIP_CHECK(hipEventCreate(&event2)); hipGraphNode_t eventwait; - HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, - event1)); + HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event1)); hipGraphExec_t graphExec; HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); - HIP_CHECK(hipGraphExecEventWaitNodeSetEvent(graphExec, - eventwait, event2)); + HIP_CHECK(hipGraphExecEventWaitNodeSetEvent(graphExec, eventwait, event2)); HIP_CHECK(hipGraphEventWaitNodeGetEvent(eventwait, &event_out)); // validate set event and get event are same REQUIRE(event1 == event_out); @@ -240,13 +233,11 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_Negative") { HIP_CHECK(hipEventCreate(&event1)); HIP_CHECK(hipEventCreate(&event2)); hipGraphNode_t eventrec, eventwait; - HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, - event1)); - HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, - event1)); + 
HIP_CHECK(hipGraphAddEventRecordNode(&eventrec, graph, nullptr, 0, event1)); + HIP_CHECK(hipGraphAddEventWaitNode(&eventwait, graph, nullptr, 0, event1)); // Create memset constexpr size_t Nbytes = 1024; - char *A_d; + char* A_d; hipGraphNode_t memset_A; hipMemsetParams memsetParams{}; HIP_CHECK(hipMalloc(&A_d, Nbytes)); @@ -262,62 +253,59 @@ TEST_CASE("Unit_hipGraphExecEventWaitNodeSetEvent_Negative") { HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); SECTION("hGraphExec = nullptr") { - REQUIRE(hipErrorInvalidValue == - hipGraphExecEventWaitNodeSetEvent(nullptr, eventwait, event2)); + HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(nullptr, eventwait, event2), + hipErrorInvalidValue); } SECTION("hNode = nullptr") { - REQUIRE(hipErrorInvalidValue == - hipGraphExecEventWaitNodeSetEvent(graphExec, nullptr, event2)); + HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(graphExec, nullptr, event2), + hipErrorInvalidValue); } SECTION("event = nullptr") { - REQUIRE(hipErrorInvalidValue == - hipGraphExecEventWaitNodeSetEvent(graphExec, eventwait, nullptr)); + HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(graphExec, eventwait, nullptr), + hipErrorInvalidValue); } SECTION("hGraphExec is uninitialized") { hipGraphExec_t graphExec1{}; - REQUIRE(hipErrorInvalidValue == - hipGraphExecEventWaitNodeSetEvent(graphExec1, eventwait, event2)); + HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(graphExec1, eventwait, event2), + hipErrorInvalidValue); } SECTION("hNode is uninitialized") { hipGraphNode_t dummy{}; - REQUIRE(hipErrorInvalidValue == - hipGraphExecEventWaitNodeSetEvent(graphExec, dummy, event2)); + HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(graphExec, dummy, event2), + hipErrorInvalidValue); } SECTION("event is uninitialized") { hipEvent_t event_dummy{}; - REQUIRE(hipErrorInvalidValue == - hipGraphExecEventWaitNodeSetEvent(graphExec, eventwait, - event_dummy)); + HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(graphExec, eventwait, 
event_dummy), + hipErrorInvalidValue); } SECTION("event wait node does not exist") { hipGraph_t graph1; HIP_CHECK(hipGraphCreate(&graph1, 0)); - HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph1, nullptr, 0, - &memsetParams)); + HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph1, nullptr, 0, &memsetParams)); hipGraphExec_t graphExec1; HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0)); - REQUIRE(hipErrorInvalidValue == - hipGraphExecEventWaitNodeSetEvent(graphExec1, eventwait, event2)); + HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(graphExec1, eventwait, event2), + hipErrorInvalidValue); HIP_CHECK(hipGraphExecDestroy(graphExec1)); HIP_CHECK(hipGraphDestroy(graph1)); } SECTION("pass memset node as hNode") { - HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, - &memsetParams)); - REQUIRE(hipErrorInvalidValue == - hipGraphExecEventWaitNodeSetEvent(graphExec, memset_A, event2)); + HIP_CHECK(hipGraphAddMemsetNode(&memset_A, graph, nullptr, 0, &memsetParams)); + HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(graphExec, memset_A, event2), + hipErrorInvalidValue); } SECTION("pass event record node as hNode") { - REQUIRE(hipErrorInvalidValue == - hipGraphExecEventWaitNodeSetEvent(graphExec, eventrec, event2)); + HIP_CHECK_ERROR(hipGraphExecEventWaitNodeSetEvent(graphExec, eventrec, event2), + hipErrorInvalidValue); } HIP_CHECK(hipFree(A_d)); diff --git a/projects/hip-tests/catch/unit/graph/hipGraphExecUpdate.cc b/projects/hip-tests/catch/unit/graph/hipGraphExecUpdate.cc index ecd7a19f40..bfc7b2c43e 100644 --- a/projects/hip-tests/catch/unit/graph/hipGraphExecUpdate.cc +++ b/projects/hip-tests/catch/unit/graph/hipGraphExecUpdate.cc @@ -27,22 +27,6 @@ THE SOFTWARE. * and perform the update if possible. */ -/** -Testcase Scenarios : -Functional- -1) Make a clone of the created graph and update the executable-graph from a clone or same graph again. -2) Update the executable-graph from a graph and make sure they are taking effect. 
-Negative- -1) When Pass hGraphExec as nullptr and verify api returns error code. -2) When Pass hGraph as nullptr and verify api returns error code. -3) When Pass hErrorNode_out as nullptr and verify api returns error code. -4) When Pass updateResult_out as nullptr and verify api returns error code. -5) When the a graphExec was updated with with different type of node and verify api returns error code. -6) When a node is deleted in hGraph but not its pair from hGraphExec and verify api returns error code. -7) When a node is deleted in hGraphExec but not its pair from hGraph and verify api returns error code. -8) When grpah dependencies differ but graph have same node and verify api returns error code. -*/ - #include #include #include @@ -65,13 +49,11 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_Basic") { hipGraphNode_t hErrorNode_out{}; hipGraphExecUpdateResult updateResult_out{}; SECTION("Pass hGraphExec as nullptr") { - ret = hipGraphExecUpdate(nullptr, graph, &hErrorNode_out, - &updateResult_out); + ret = hipGraphExecUpdate(nullptr, graph, &hErrorNode_out, &updateResult_out); REQUIRE(hipErrorInvalidValue == ret); } SECTION("Pass hGraph as nullptr") { - ret = hipGraphExecUpdate(graphExec, nullptr, &hErrorNode_out, - &updateResult_out); + ret = hipGraphExecUpdate(graphExec, nullptr, &hErrorNode_out, &updateResult_out); REQUIRE(hipErrorInvalidValue == ret); } SECTION("Pass hErrorNode_out as nullptr") { @@ -101,10 +83,9 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_TypeChange") { constexpr size_t N = 1024; constexpr size_t Nbytes = N * sizeof(char); constexpr size_t val = 0; - char *devData; + char* devData; int *A_d, *A_h; - HipTest::initArrays(&A_d, nullptr, nullptr, - &A_h, nullptr, nullptr, N, false); + HipTest::initArrays(&A_d, nullptr, nullptr, &A_h, nullptr, nullptr, N, false); HIP_CHECK(hipMalloc(&devData, Nbytes)); hipGraph_t graph, graph2; hipGraphExec_t graphExec; @@ -122,18 +103,16 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_TypeChange") { 
memsetParams.elementSize = sizeof(char); memsetParams.width = Nbytes; memsetParams.height = 1; - HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0, - &memsetParams)); + HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0, &memsetParams)); std::vector dependencies; dependencies.push_back(memsetNode); HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); HIP_CHECK(hipGraphCreate(&graph2, 0)); HIP_CHECK(hipStreamCreate(&streamForGraph)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h, - Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h, Nbytes, + hipMemcpyHostToDevice)); // graphExec was created before memcpyTemp was added to graph. - ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, - &updateResult_out); + ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, &updateResult_out); REQUIRE(hipGraphExecUpdateErrorNodeTypeChanged == updateResult_out); REQUIRE(hipErrorGraphExecUpdateFailure == ret); HIP_CHECK(hipFree(devData)); @@ -164,7 +143,7 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_CountDiffer") { int *A_d, *B_d, *C_d; int *A_h, *B_h, *C_h; size_t NElem{N}; - int *hData = reinterpret_cast(malloc(Nbytes)); + int* hData = reinterpret_cast(malloc(Nbytes)); REQUIRE(hData != nullptr); memset(hData, 0, Nbytes); hipGraphNode_t memcpy_A, memcpy_B, memcpy_C, memcpyTemp; @@ -180,57 +159,52 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_CountDiffer") { unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); HIP_CHECK(hipGraphCreate(&graph1, 0)); HIP_CHECK(hipStreamCreate(&streamForGraph)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, - Nbytes, hipMemcpyHostToDevice)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, - Nbytes, hipMemcpyHostToDevice)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph1, nullptr, 0, C_h, C_d, - Nbytes, 
hipMemcpyDeviceToHost)); - void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; - kernelNodeParams.func = reinterpret_cast(HipTest::vectorADD); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, Nbytes, + hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, Nbytes, + hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph1, nullptr, 0, C_h, C_d, Nbytes, + hipMemcpyDeviceToHost)); + void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; + kernelNodeParams.func = reinterpret_cast(HipTest::vectorADD); kernelNodeParams.gridDim = dim3(blocks); kernelNodeParams.blockDim = dim3(threadsPerBlock); kernelNodeParams.sharedMemBytes = 0; kernelNodeParams.kernelParams = reinterpret_cast(kernelArgs); kernelNodeParams.extra = nullptr; - HIP_CHECK(hipGraphAddKernelNode(&kernel_vecAdd, graph1, nullptr, 0, - &kernelNodeParams)); + HIP_CHECK(hipGraphAddKernelNode(&kernel_vecAdd, graph1, nullptr, 0, &kernelNodeParams)); // Create dependencies HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_A, &kernel_vecAdd, 1)); HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_B, &kernel_vecAdd, 1)); HIP_CHECK(hipGraphAddDependencies(graph1, &kernel_vecAdd, &memcpy_C, 1)); // Create a cloned graph and added extra node to it HIP_CHECK(hipGraphClone(&graph2, graph1)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyTemp, graph2, nullptr, 0, - C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyTemp, graph2, nullptr, 0, C_h, C_d, Nbytes, + hipMemcpyDeviceToHost)); HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0)); HIP_CHECK(hipGraphInstantiate(&graphExec2, graph2, nullptr, nullptr, 0)); SECTION("When a node deleted from Graph but not from its pair GraphExec") { - ret = hipGraphExecUpdate(graphExec2, graph1, &hErrorNode_out, - &updateResult_out); + ret = hipGraphExecUpdate(graphExec2, graph1, &hErrorNode_out, &updateResult_out); 
REQUIRE(hipErrorGraphExecUpdateFailure == ret); } SECTION("When a node deleted from GraphExec but not from its pair Graph") { - ret = hipGraphExecUpdate(graphExec1, graph2, &hErrorNode_out, - &updateResult_out); + ret = hipGraphExecUpdate(graphExec1, graph2, &hErrorNode_out, &updateResult_out); REQUIRE(hipErrorGraphExecUpdateFailure == ret); } SECTION("When the dependent nodes of a pair differ") { HIP_CHECK(hipGraphCreate(&graph3, 0)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph3, nullptr, 0, A_d, A_h, - Nbytes, hipMemcpyHostToDevice)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph3, nullptr, 0, B_d, B_h, - Nbytes, hipMemcpyHostToDevice)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph3, nullptr, 0, C_h, C_d, - Nbytes, hipMemcpyDeviceToHost)); - HIP_CHECK(hipGraphAddKernelNode(&kernel_vecAdd, graph3, nullptr, 0, - &kernelNodeParams)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph3, nullptr, 0, A_d, A_h, Nbytes, + hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph3, nullptr, 0, B_d, B_h, Nbytes, + hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph3, nullptr, 0, C_h, C_d, Nbytes, + hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddKernelNode(&kernel_vecAdd, graph3, nullptr, 0, &kernelNodeParams)); // Create dependencies HIP_CHECK(hipGraphAddDependencies(graph3, &memcpy_A, &kernel_vecAdd, 1)); HIP_CHECK(hipGraphAddDependencies(graph3, &memcpy_B, &kernel_vecAdd, 1)); HIP_CHECK(hipGraphAddDependencies(graph3, &memcpy_C, &kernel_vecAdd, 1)); - ret = hipGraphExecUpdate(graphExec1, graph3, &hErrorNode_out, - &updateResult_out); + ret = hipGraphExecUpdate(graphExec1, graph3, &hErrorNode_out, &updateResult_out); REQUIRE(hipErrorGraphExecUpdateFailure == ret); HIP_CHECK(hipGraphDestroy(graph3)); } @@ -265,7 +239,7 @@ TEST_CASE("Unit_hipGraphExecUpdate_Functional") { int *A_d, *B_d, *C_d; int *A_h, *B_h, *C_h; size_t NElem{N}; - int *hData = reinterpret_cast(malloc(Nbytes)); + int* hData = 
reinterpret_cast(malloc(Nbytes)); REQUIRE(hData != nullptr); memset(hData, 0, Nbytes); hipGraphNode_t memcpy_A, memcpy_B, memcpy_C; @@ -280,22 +254,20 @@ TEST_CASE("Unit_hipGraphExecUpdate_Functional") { unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); HIP_CHECK(hipGraphCreate(&graph, 0)); HIP_CHECK(hipStreamCreate(&streamForGraph)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph, nullptr, 0, A_d, A_h, - Nbytes, hipMemcpyHostToDevice)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph, nullptr, 0, B_d, B_h, - Nbytes, hipMemcpyHostToDevice)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph, nullptr, 0, C_h, C_d, - Nbytes, hipMemcpyDeviceToHost)); - void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; - kernelNodeParams.func = - reinterpret_cast(HipTest::vector_square); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph, nullptr, 0, A_d, A_h, Nbytes, + hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph, nullptr, 0, B_d, B_h, Nbytes, + hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph, nullptr, 0, C_h, C_d, Nbytes, + hipMemcpyDeviceToHost)); + void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; + kernelNodeParams.func = reinterpret_cast(HipTest::vector_square); kernelNodeParams.gridDim = dim3(blocks); kernelNodeParams.blockDim = dim3(threadsPerBlock); kernelNodeParams.sharedMemBytes = 0; kernelNodeParams.kernelParams = reinterpret_cast(kernelArgs); kernelNodeParams.extra = nullptr; - HIP_CHECK(hipGraphAddKernelNode(&kernel_vecSquare, graph, nullptr, 0, - &kernelNodeParams)); + HIP_CHECK(hipGraphAddKernelNode(&kernel_vecSquare, graph, nullptr, 0, &kernelNodeParams)); // Create dependencies HIP_CHECK(hipGraphAddDependencies(graph, &memcpy_A, &kernel_vecSquare, 1)); HIP_CHECK(hipGraphAddDependencies(graph, &memcpy_B, &kernel_vecSquare, 1)); @@ -304,36 +276,32 @@ TEST_CASE("Unit_hipGraphExecUpdate_Functional") { 
HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); SECTION("Update graphExec with clone graph") { HIP_CHECK(hipGraphClone(&clonedgraph, graph)); - HIP_CHECK(hipGraphExecUpdate(graphExec, clonedgraph, &hErrorNode_out, - &updateResult_out)); + HIP_CHECK(hipGraphExecUpdate(graphExec, clonedgraph, &hErrorNode_out, &updateResult_out)); } // Code for new graph creation with samilar node setup HIP_CHECK(hipGraphCreate(&graph2, 0)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h, - Nbytes, hipMemcpyHostToDevice)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h, - Nbytes, hipMemcpyHostToDevice)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_h, C_d, - Nbytes, hipMemcpyDeviceToHost)); - HIP_CHECK(hipGraphMemcpyNodeSetParams1D(memcpy_C, hData, C_d, Nbytes, - hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h, Nbytes, + hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h, Nbytes, + hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_h, C_d, Nbytes, + hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphMemcpyNodeSetParams1D(memcpy_C, hData, C_d, Nbytes, hipMemcpyDeviceToHost)); memset(&kernelNodeParams, 0, sizeof(hipKernelNodeParams)); - void* kernelArgs2[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; - kernelNodeParams.func = reinterpret_cast(HipTest::vectorADD); + void* kernelArgs2[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; + kernelNodeParams.func = reinterpret_cast(HipTest::vectorADD); kernelNodeParams.gridDim = dim3(blocks); kernelNodeParams.blockDim = dim3(threadsPerBlock); kernelNodeParams.sharedMemBytes = 0; kernelNodeParams.kernelParams = reinterpret_cast(kernelArgs2); kernelNodeParams.extra = nullptr; - HIP_CHECK(hipGraphAddKernelNode(&kernel_vecAdd, graph2, nullptr, 0, - &kernelNodeParams)); + 
HIP_CHECK(hipGraphAddKernelNode(&kernel_vecAdd, graph2, nullptr, 0, &kernelNodeParams)); // Create dependencies HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_A, &kernel_vecAdd, 1)); HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_B, &kernel_vecAdd, 1)); HIP_CHECK(hipGraphAddDependencies(graph2, &kernel_vecAdd, &memcpy_C, 1)); // Update the graphExec graph from graph -> graph2 - HIP_CHECK(hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, - &updateResult_out)); + HIP_CHECK(hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, &updateResult_out)); REQUIRE(updateResult_out == hipGraphExecUpdateSuccess); HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph)); HIP_CHECK(hipStreamSynchronize(streamForGraph)); @@ -380,24 +348,22 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_Functional_ParametersChanged") { hipGraphExecUpdateResult updateResult_out; HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); HIP_CHECK(hipGraphCreate(&graph1, 0)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, - Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, Nbytes, + hipMemcpyHostToDevice)); HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0)); SECTION("Update graphExec with similar graph and verify") { HIP_CHECK(hipGraphCreate(&graph2, 0)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h, - Nbytes, hipMemcpyHostToDevice)); - ret = hipGraphExecUpdate(graphExec1, graph2, &hErrorNode_out, - &updateResult_out); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h, Nbytes, + hipMemcpyHostToDevice)); + ret = hipGraphExecUpdate(graphExec1, graph2, &hErrorNode_out, &updateResult_out); REQUIRE(hipSuccess == ret); HIP_CHECK(hipGraphDestroy(graph2)); } SECTION("Update graphExec with similar graph and verify") { HIP_CHECK(hipGraphCreate(&graph3, 0)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph3, nullptr, 0, B_h, 
B_d, - Nbytes, hipMemcpyDeviceToHost)); - ret = hipGraphExecUpdate(graphExec1, graph3, &hErrorNode_out, - &updateResult_out); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph3, nullptr, 0, B_h, B_d, Nbytes, + hipMemcpyDeviceToHost)); + ret = hipGraphExecUpdate(graphExec1, graph3, &hErrorNode_out, &updateResult_out); REQUIRE(hipErrorGraphExecUpdateFailure == ret); REQUIRE(hipGraphExecUpdateErrorParametersChanged == updateResult_out); @@ -437,16 +403,15 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_Functional_CountDiffer_1") { HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); HIP_CHECK(hipGraphCreate(&graph1, 0)); HIP_CHECK(hipGraphCreate(&graph2, 0)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, - Nbytes, hipMemcpyHostToDevice)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, - Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, Nbytes, + hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, Nbytes, + hipMemcpyHostToDevice)); HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0)); // When count of nodes directly differ in graphExec1 and graph2 - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_h, C_d, - Nbytes, hipMemcpyDeviceToHost)); - ret = hipGraphExecUpdate(graphExec1, graph2, &hErrorNode_out, - &updateResult_out); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_h, C_d, Nbytes, + hipMemcpyDeviceToHost)); + ret = hipGraphExecUpdate(graphExec1, graph2, &hErrorNode_out, &updateResult_out); REQUIRE(hipErrorGraphExecUpdateFailure == ret); #if HT_NVIDIA @@ -495,16 +460,15 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_Functional_CountDiffer_2") { hipGraphExecUpdateResult updateResult_out; HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); HIP_CHECK(hipGraphCreate(&graph1, 0)); - 
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, - Nbytes, hipMemcpyHostToDevice)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, - Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, Nbytes, + hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, Nbytes, + hipMemcpyHostToDevice)); HIP_CHECK(hipGraphInstantiate(&graphExec1, graph1, nullptr, nullptr, 0)); // Delete a node from the graph HIP_CHECK(hipGraphDestroyNode(memcpy_B)); SECTION("When a node deleted from Graph but not from its pair GraphExec") { - ret = hipGraphExecUpdate(graphExec1, graph1, &hErrorNode_out, - &updateResult_out); + ret = hipGraphExecUpdate(graphExec1, graph1, &hErrorNode_out, &updateResult_out); REQUIRE(hipErrorGraphExecUpdateFailure == ret); REQUIRE(hipGraphExecUpdateErrorTopologyChanged == updateResult_out); #if HT_NVIDIA @@ -513,11 +477,10 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_Functional_CountDiffer_2") { } SECTION("Update the GraphExec with similar graph where a node get deleted") { HIP_CHECK(hipGraphCreate(&graph2, 0)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_d, C_h, - Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_d, C_h, Nbytes, + hipMemcpyHostToDevice)); HIP_CHECK(hipGraphInstantiate(&graphExec2, graph2, nullptr, nullptr, 0)); - ret = hipGraphExecUpdate(graphExec2, graph1, &hErrorNode_out, - &updateResult_out); + ret = hipGraphExecUpdate(graphExec2, graph1, &hErrorNode_out, &updateResult_out); #if HT_NVIDIA REQUIRE(hipErrorGraphExecUpdateFailure == ret); REQUIRE(hipGraphExecUpdateErrorNotSupported == updateResult_out); @@ -529,13 +492,12 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_Functional_CountDiffer_2") { } SECTION("When A node is deleted in GraphExec but not its pair from Graph") { HIP_CHECK(hipGraphCreate(&graph3, 0)); - 
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph3, nullptr, 0, A_d, A_h, - Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph3, nullptr, 0, A_d, A_h, Nbytes, + hipMemcpyHostToDevice)); HIP_CHECK(hipGraphInstantiate(&graphExec3, graph3, nullptr, nullptr, 0)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph3, nullptr, 0, B_d, B_h, - Nbytes, hipMemcpyHostToDevice)); - ret = hipGraphExecUpdate(graphExec3, graph3, &hErrorNode_out, - &updateResult_out); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph3, nullptr, 0, B_d, B_h, Nbytes, + hipMemcpyHostToDevice)); + ret = hipGraphExecUpdate(graphExec3, graph3, &hErrorNode_out, &updateResult_out); REQUIRE(hipErrorGraphExecUpdateFailure == ret); #if HT_NVIDIA REQUIRE(hipGraphExecUpdateErrorNotSupported == updateResult_out); @@ -581,27 +543,26 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_Dependent_NodesDiffer") { hipGraphExecUpdateResult updateResult_out; HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); HIP_CHECK(hipGraphCreate(&graph1, 0)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, - Nbytes, hipMemcpyHostToDevice)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, - Nbytes, hipMemcpyHostToDevice)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph1, nullptr, 0, C_d, C_h, - Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, Nbytes, + hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, Nbytes, + hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph1, nullptr, 0, C_d, C_h, Nbytes, + hipMemcpyHostToDevice)); HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_A, &memcpy_B, 1)); HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_B, &memcpy_C, 1)); HIP_CHECK(hipGraphInstantiate(&graphExec, graph1, nullptr, nullptr, 0)); HIP_CHECK(hipGraphCreate(&graph2, 0)); - 
HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h, - Nbytes, hipMemcpyHostToDevice)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h, - Nbytes, hipMemcpyHostToDevice)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_d, C_h, - Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h, Nbytes, + hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h, Nbytes, + hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_d, C_h, Nbytes, + hipMemcpyHostToDevice)); HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_A, &memcpy_C, 1)); HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_B, &memcpy_C, 1)); - ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, - &updateResult_out); + ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, &updateResult_out); REQUIRE(hipErrorGraphExecUpdateFailure == ret); REQUIRE(hipGraphExecUpdateErrorTopologyChanged == updateResult_out); @@ -642,10 +603,10 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_NodeType_Changed") { HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); HIP_CHECK(hipGraphCreate(&graph1, 0)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, - Nbytes, hipMemcpyHostToDevice)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, - Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, Nbytes, + hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, Nbytes, + hipMemcpyHostToDevice)); HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_A, &memcpy_B, 1)); HIP_CHECK(hipGraphInstantiate(&graphExec, graph1, nullptr, nullptr, 0)); @@ -658,13 +619,11 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_NodeType_Changed") { memsetParams.elementSize = 
sizeof(char); memsetParams.width = Nbytes; memsetParams.height = 1; - HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph2, nullptr, 0, - &memsetParams)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h, - Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph2, nullptr, 0, &memsetParams)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h, Nbytes, + hipMemcpyHostToDevice)); HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_A, &memsetNode, 1)); - ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, - &updateResult_out); + ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, &updateResult_out); REQUIRE(hipErrorGraphExecUpdateFailure == ret); #if HT_NVIDIA REQUIRE(hipGraphExecUpdateErrorTopologyChanged == updateResult_out); @@ -726,22 +685,21 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_MultiDevice_Context_Changed") { hipStream_t stream; HIP_CHECK(hipStreamCreate(&stream)); HIP_CHECK(hipGraphCreate(&graph1, 0)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, - Nbytes, hipMemcpyHostToDevice)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, - Nbytes, hipMemcpyHostToDevice)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph1, nullptr, 0, C_h, C_d, - Nbytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, Nbytes, + hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, Nbytes, + hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph1, nullptr, 0, C_h, C_d, Nbytes, + hipMemcpyDeviceToHost)); hipKernelNodeParams kernelNodeParams{}; - void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; - kernelNodeParams.func = reinterpret_cast(HipTest::vectorADD); + void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; + kernelNodeParams.func = 
reinterpret_cast(HipTest::vectorADD); kernelNodeParams.gridDim = dim3(blocks); kernelNodeParams.blockDim = dim3(threadsPerBlock); kernelNodeParams.sharedMemBytes = 0; kernelNodeParams.kernelParams = reinterpret_cast(kernelArgs); kernelNodeParams.extra = nullptr; - HIP_CHECK(hipGraphAddKernelNode(&kernel_vecADD, graph1, nullptr, 0, - &kernelNodeParams)); + HIP_CHECK(hipGraphAddKernelNode(&kernel_vecADD, graph1, nullptr, 0, &kernelNodeParams)); HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_A, &kernel_vecADD, 1)); HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_B, &kernel_vecADD, 1)); HIP_CHECK(hipGraphAddDependencies(graph1, &kernel_vecADD, &memcpy_C, 1)); @@ -750,27 +708,25 @@ TEST_CASE("Unit_hipGraphExecUpdate_Negative_MultiDevice_Context_Changed") { HIP_CHECK(hipSetDevice(1)); HIP_CHECK(hipGraphCreate(&graph2, 0)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h, - Nbytes, hipMemcpyHostToDevice)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h, - Nbytes, hipMemcpyHostToDevice)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_h, C_d, - Nbytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h, Nbytes, + hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h, Nbytes, + hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_h, C_d, Nbytes, + hipMemcpyDeviceToHost)); memset(&kernelNodeParams, 0x00, sizeof(hipKernelNodeParams)); - void* kernelArgs1[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; - kernelNodeParams.func = reinterpret_cast(HipTest::vectorSUB); + void* kernelArgs1[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; + kernelNodeParams.func = reinterpret_cast(HipTest::vectorSUB); kernelNodeParams.gridDim = dim3(blocks); kernelNodeParams.blockDim = dim3(threadsPerBlock); kernelNodeParams.sharedMemBytes = 0; kernelNodeParams.kernelParams = 
reinterpret_cast(kernelArgs1); kernelNodeParams.extra = nullptr; - HIP_CHECK(hipGraphAddKernelNode(&kernel_vecSUB, graph2, nullptr, 0, - &kernelNodeParams)); + HIP_CHECK(hipGraphAddKernelNode(&kernel_vecSUB, graph2, nullptr, 0, &kernelNodeParams)); HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_A, &kernel_vecSUB, 1)); HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_B, &kernel_vecSUB, 1)); HIP_CHECK(hipGraphAddDependencies(graph2, &kernel_vecSUB, &memcpy_C, 1)); - ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, - &updateResult_out); + ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, &updateResult_out); REQUIRE(hipErrorGraphExecUpdateFailure == ret); REQUIRE(hipGraphExecUpdateErrorUnsupportedFunctionChange == updateResult_out); @@ -819,49 +775,46 @@ TEST_CASE("Unit_hipGraphExecUpdate_Functional_KernelFunction_Changed") { hipStream_t stream; HIP_CHECK(hipStreamCreate(&stream)); HIP_CHECK(hipGraphCreate(&graph1, 0)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, - Nbytes, hipMemcpyHostToDevice)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, - Nbytes, hipMemcpyHostToDevice)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph1, nullptr, 0, C_h, C_d, - Nbytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph1, nullptr, 0, A_d, A_h, Nbytes, + hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph1, nullptr, 0, B_d, B_h, Nbytes, + hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph1, nullptr, 0, C_h, C_d, Nbytes, + hipMemcpyDeviceToHost)); hipKernelNodeParams kernelNodeParams{}; - void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; - kernelNodeParams.func = reinterpret_cast(HipTest::vectorADD); + void* kernelArgs[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; + kernelNodeParams.func = reinterpret_cast(HipTest::vectorADD); kernelNodeParams.gridDim = dim3(blocks); 
kernelNodeParams.blockDim = dim3(threadsPerBlock); kernelNodeParams.sharedMemBytes = 0; kernelNodeParams.kernelParams = reinterpret_cast(kernelArgs); kernelNodeParams.extra = nullptr; - HIP_CHECK(hipGraphAddKernelNode(&kernel_vecADD, graph1, nullptr, 0, - &kernelNodeParams)); + HIP_CHECK(hipGraphAddKernelNode(&kernel_vecADD, graph1, nullptr, 0, &kernelNodeParams)); HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_A, &kernel_vecADD, 1)); HIP_CHECK(hipGraphAddDependencies(graph1, &memcpy_B, &kernel_vecADD, 1)); HIP_CHECK(hipGraphAddDependencies(graph1, &kernel_vecADD, &memcpy_C, 1)); HIP_CHECK(hipGraphInstantiate(&graphExec, graph1, nullptr, nullptr, 0)); HIP_CHECK(hipGraphCreate(&graph2, 0)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h, - Nbytes, hipMemcpyHostToDevice)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h, - Nbytes, hipMemcpyHostToDevice)); - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_h, C_d, - Nbytes, hipMemcpyDeviceToHost)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_A, graph2, nullptr, 0, A_d, A_h, Nbytes, + hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_B, graph2, nullptr, 0, B_d, B_h, Nbytes, + hipMemcpyHostToDevice)); + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpy_C, graph2, nullptr, 0, C_h, C_d, Nbytes, + hipMemcpyDeviceToHost)); memset(&kernelNodeParams, 0x00, sizeof(hipKernelNodeParams)); - void* kernelArgs1[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; - kernelNodeParams.func = reinterpret_cast(HipTest::vectorSUB); + void* kernelArgs1[] = {&A_d, &B_d, &C_d, reinterpret_cast(&NElem)}; + kernelNodeParams.func = reinterpret_cast(HipTest::vectorSUB); kernelNodeParams.gridDim = dim3(blocks); kernelNodeParams.blockDim = dim3(threadsPerBlock); kernelNodeParams.sharedMemBytes = 0; kernelNodeParams.kernelParams = reinterpret_cast(kernelArgs1); kernelNodeParams.extra = nullptr; - HIP_CHECK(hipGraphAddKernelNode(&kernel_vecSUB, graph2, nullptr, 0, 
- &kernelNodeParams)); + HIP_CHECK(hipGraphAddKernelNode(&kernel_vecSUB, graph2, nullptr, 0, &kernelNodeParams)); HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_A, &kernel_vecSUB, 1)); HIP_CHECK(hipGraphAddDependencies(graph2, &memcpy_B, &kernel_vecSUB, 1)); HIP_CHECK(hipGraphAddDependencies(graph2, &kernel_vecSUB, &memcpy_C, 1)); - ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, - &updateResult_out); + ret = hipGraphExecUpdate(graphExec, graph2, &hErrorNode_out, &updateResult_out); REQUIRE(hipSuccess == ret); HIP_CHECK(hipGraphLaunch(graphExec, stream)); HIP_CHECK(hipStreamSynchronize(stream)); diff --git a/projects/hip-tests/catch/unit/graph/hipGraphLaunch.cc b/projects/hip-tests/catch/unit/graph/hipGraphLaunch.cc index 21afeffce8..e0513a28e0 100644 --- a/projects/hip-tests/catch/unit/graph/hipGraphLaunch.cc +++ b/projects/hip-tests/catch/unit/graph/hipGraphLaunch.cc @@ -19,394 +19,127 @@ THE SOFTWARE. #include #include -#include -/* Test verifies hipGraphLaunch API -Negative scenarios - -1) Pass graphExec as nullptr and verify api returns error code. -2) Pass pGraphExec as nullptr and stream as hipStreamPerThread and verify api returns error code. -3) Pass pGraphExec as empty object and verify api returns error code. -4) Destroy executable graph and try to launch it. Make sure api should not crash and it should returns error code. -5) Destroy stream and try to launch respective executable graph. Make sure api should not crash and it should returns error code. -6) Destroy actual graph created and try to launch respective executable graph. - Check api should execute properly without crash or error code. -Functional Scenario - -1) Check basic functionality with stream as hipStreamPerThread -2) Test hipGraphLaunch call on multiple devices. -3) Create a graph with multiple nodes. Create an executable graph. - Launch the executable graph 3 times in stream simultaneously. - Wait for stream. Validate the output. 
No issues should be observed -4) Create a graph with multiple nodes. Create an executable graph. - Verify if an executable graph be launched on null stream. -*/ -#define SIZE 1024 -#define TEST_LOOP_SIZE 3 +/** + * @addtogroup hipGraphLaunch hipGraphLaunch + * @{ + * @ingroup GraphTest + * `hipGraphLaunch(hipGraphExec_t graphExec, hipStream_t stream)` - + * Launches an executable graph in a stream + */ -TEST_CASE("Unit_hipGraphLaunch_Negative") { - hipError_t ret; - SECTION("Pass pGraphExec as nullptr") { - hipStream_t stream{}; +static void HostFunctionSetToZero(void* arg) { + int* test_number = (int*)arg; + (*test_number) = 0; +} + +static void HostFunctionAddOne(void* arg) { + int* test_number = (int*)arg; + (*test_number) += 1; +} + +/* create an executable graph that will set an integer pointed to by 'number' to one*/ +static void CreateTestExecutableGraph(hipGraphExec_t* graph_exec, int* number) { + hipGraph_t graph; + hipGraphNode_t node_error; + + hipGraphNode_t node_set_zero; + hipHostNodeParams params_set_to_zero = {HostFunctionSetToZero, number}; + + hipGraphNode_t node_add_one; + hipHostNodeParams params_set_add_one = {HostFunctionAddOne, number}; + + HIP_CHECK(hipGraphCreate(&graph, 0)); + + HIP_CHECK(hipGraphAddHostNode(&node_set_zero, graph, nullptr, 0, ¶ms_set_to_zero)); + HIP_CHECK(hipGraphAddHostNode(&node_add_one, graph, &node_set_zero, 1, ¶ms_set_add_one)); + + HIP_CHECK(hipGraphInstantiate(graph_exec, graph, &node_error, nullptr, 0)); + HIP_CHECK(hipGraphDestroy(graph)); +} + +static void HipGraphLaunch_Positive_Simple(hipStream_t stream) { + int number = 5; + + hipGraphExec_t graph_exec; + CreateTestExecutableGraph(&graph_exec, &number); + + HIP_CHECK(hipGraphLaunch(graph_exec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + REQUIRE(number == 1); + + HIP_CHECK(hipGraphExecDestroy(graph_exec)); +} + + +/** + * Test Description + * ------------------------ + * - Basic positive test for hipGraphLaunch + * -# stream as a created stream + * 
-# with stream as hipStreamPerThread + * Test source + * ------------------------ + * - unit/graph/hipGraphLaunch.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEST_CASE("Unit_hipGraphLaunch_Positive") { + SECTION("stream as a created stream") { + hipStream_t stream; + HIP_CHECK(hipStreamCreate(&stream)); + HipGraphLaunch_Positive_Simple(stream); + HIP_CHECK(hipStreamDestroy(stream)); + } + + SECTION("with stream as hipStreamPerThread") { + HipGraphLaunch_Positive_Simple(hipStreamPerThread); + } +} + +/** + * Test Description + * ------------------------ + * - Negative parameter test for hipGraphLaunch + * -# graphExec is nullptr and stream is a created stream + * -# graphExec is nullptr and stream is hipStreamPerThread + * -# graphExec is an empty object + * -# graphExec is destroyed before calling hipGraphLaunch + * Test source + * ------------------------ + * - unit/graph/hipGraphLaunch.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEST_CASE("Unit_hipGraphLaunch_Negative_Parameters") { + SECTION("graphExec is nullptr and stream is a created stream") { + hipStream_t stream; + hipError_t ret; + HIP_CHECK(hipStreamCreate(&stream)); ret = hipGraphLaunch(nullptr, stream); - REQUIRE(hipErrorInvalidValue == ret); - } - SECTION("Pass pGraphExec as nullptr and stream as hipStreamPerThread") { - ret = hipGraphLaunch(nullptr, hipStreamPerThread); - REQUIRE(hipErrorInvalidValue == ret); - } - SECTION("Pass pGraphExec as empty object") { - hipGraphExec_t graphExec{}; - hipStream_t stream{}; - ret = hipGraphLaunch(graphExec, stream); - REQUIRE(hipErrorInvalidValue == ret); - } - SECTION("Destroy executable graph and try to launch it") { - constexpr size_t Nbytes = 1024; - hipGraph_t graph; - hipGraphExec_t graphExec; - hipStream_t stream; - hipGraphNode_t memsetNode; - - char *devData; - HIP_CHECK(hipMalloc(&devData, Nbytes)); - - HIP_CHECK(hipGraphCreate(&graph, 0)); - 
HIP_CHECK(hipStreamCreate(&stream)); - - hipMemsetParams memsetParams{}; - memset(&memsetParams, 0, sizeof(memsetParams)); - memsetParams.dst = reinterpret_cast(devData); - memsetParams.value = 0; - memsetParams.pitch = 0; - memsetParams.elementSize = sizeof(char); - memsetParams.width = Nbytes; - memsetParams.height = 1; - HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0, - &memsetParams)); - HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); - HIP_CHECK(hipGraphLaunch(graphExec, stream)); - HIP_CHECK(hipStreamSynchronize(stream)); - - HIP_CHECK(hipGraphExecDestroy(graphExec)); - // Launch again after destroy graph exec object. - ret = hipGraphLaunch(graphExec, stream); - REQUIRE(hipErrorInvalidValue == ret); - - HIP_CHECK(hipFree(devData)); - HIP_CHECK(hipGraphDestroy(graph)); HIP_CHECK(hipStreamDestroy(stream)); + REQUIRE(ret == hipErrorInvalidValue); } -/* In this case in CUDA setup this api call is giving - unknown error (999) - So enabling this test for both AMD and CUDA by checking with hipSuccess */ - SECTION("Destroy stream and try to launch respective executable graph") { - constexpr size_t Nbytes = 1024; - hipGraph_t graph; - hipGraphExec_t graphExec; - hipStream_t stream; - hipGraphNode_t memsetNode; - char *devData; - HIP_CHECK(hipMalloc(&devData, Nbytes)); - - HIP_CHECK(hipGraphCreate(&graph, 0)); - HIP_CHECK(hipStreamCreate(&stream)); - - hipMemsetParams memsetParams{}; - memset(&memsetParams, 0, sizeof(memsetParams)); - memsetParams.dst = reinterpret_cast(devData); - memsetParams.value = 0; - memsetParams.pitch = 0; - memsetParams.elementSize = sizeof(char); - memsetParams.width = Nbytes; - memsetParams.height = 1; - HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0, - &memsetParams)); - HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); - HIP_CHECK(hipGraphLaunch(graphExec, stream)); - HIP_CHECK(hipStreamSynchronize(stream)); - - HIP_CHECK(hipStreamDestroy(stream)); - // Launch 
again after destroy stream - ret = hipGraphLaunch(graphExec, stream); - REQUIRE(hipSuccess != ret); - - HIP_CHECK(hipFree(devData)); - HIP_CHECK(hipGraphExecDestroy(graphExec)); - HIP_CHECK(hipGraphDestroy(graph)); + SECTION("graphExec is nullptr and stream is hipStreamPerThread") { + HIP_CHECK_ERROR(hipGraphLaunch(nullptr, hipStreamPerThread), hipErrorInvalidValue); } - SECTION("Destroy graph and try to launch respective executable graph") { - constexpr size_t Nbytes = 1024; - hipGraph_t graph; - hipGraphExec_t graphExec; - hipStream_t stream; - hipGraphNode_t memsetNode; - char *devData; - HIP_CHECK(hipMalloc(&devData, Nbytes)); + SECTION("graphExec is an empty object") { + hipGraphExec_t graph_exec{}; + HIP_CHECK_ERROR(hipGraphLaunch(graph_exec, hipStreamPerThread), hipErrorInvalidValue); + } - HIP_CHECK(hipGraphCreate(&graph, 0)); - HIP_CHECK(hipStreamCreate(&stream)); - - hipMemsetParams memsetParams{}; - memset(&memsetParams, 0, sizeof(memsetParams)); - memsetParams.dst = reinterpret_cast(devData); - memsetParams.value = 0; - memsetParams.pitch = 0; - memsetParams.elementSize = sizeof(char); - memsetParams.width = Nbytes; - memsetParams.height = 1; - HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0, - &memsetParams)); - HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); - HIP_CHECK(hipGraphLaunch(graphExec, stream)); - HIP_CHECK(hipStreamSynchronize(stream)); - - HIP_CHECK(hipGraphDestroy(graph)); - // Launch again after destroy graph - ret = hipGraphLaunch(graphExec, stream); - REQUIRE(hipSuccess == ret); - - HIP_CHECK(hipFree(devData)); - HIP_CHECK(hipGraphExecDestroy(graphExec)); - HIP_CHECK(hipStreamDestroy(stream)); + SECTION("graphExec is destroyed") { + int number = 5; + hipGraphExec_t graph_exec; + CreateTestExecutableGraph(&graph_exec, &number); + HIP_CHECK(hipGraphLaunch(graph_exec, hipStreamPerThread)); + HIP_CHECK(hipStreamSynchronize(hipStreamPerThread)); + REQUIRE(number == 1); + 
HIP_CHECK(hipGraphExecDestroy(graph_exec)); + HIP_CHECK_ERROR(hipGraphLaunch(graph_exec, hipStreamPerThread), hipErrorInvalidValue); } } - -TEST_CASE("Unit_hipGraphLaunch_Functional_hipStreamPerThread") { - constexpr size_t N = 1024; - constexpr size_t Nbytes = N * sizeof(char); - constexpr size_t val = 0; - constexpr size_t updateVal = 2; - char *A_d{nullptr}, *B_d{nullptr}, *C_d{nullptr}; - char *A_h{nullptr}, *B_h{nullptr}; - - HipTest::initArrays(&A_d, &B_d, &C_d, - &A_h, &B_h, nullptr, N, false); - - hipGraph_t graph; - hipGraphExec_t graphExec; - hipGraphNode_t memsetNode; - - HIP_CHECK(hipGraphCreate(&graph, 0)); - - hipMemsetParams memsetParams{}; - memset(&memsetParams, 0, sizeof(memsetParams)); - memsetParams.dst = reinterpret_cast(C_d); - memsetParams.value = val; - memsetParams.pitch = 0; - memsetParams.elementSize = sizeof(char); - memsetParams.width = Nbytes; - memsetParams.height = 1; - HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0, - &memsetParams)); - - std::vector dependencies; - dependencies.push_back(memsetNode); - - memset(&memsetParams, 0, sizeof(memsetParams)); - memsetParams.dst = reinterpret_cast(A_d); - memsetParams.value = updateVal; - memsetParams.pitch = 0; - memsetParams.elementSize = sizeof(char); - memsetParams.width = Nbytes; - memsetParams.height = 1; - HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, dependencies.data(), - dependencies.size(), &memsetParams)); - HIP_CHECK(hipGraphMemsetNodeSetParams(memsetNode, &memsetParams)); - dependencies.push_back(memsetNode); - - HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); - HIP_CHECK(hipGraphLaunch(graphExec, hipStreamPerThread)); - HIP_CHECK(hipStreamSynchronize(hipStreamPerThread)); - - HIP_CHECK(hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost)); - - // Validating the result - for (size_t i = 0; i < Nbytes; i++) { - if (A_h[i] != updateVal) { - WARN("Validation failed at- " << i << " A_h[i] " << A_h[i]); - REQUIRE(false); - } - } - - 
HipTest::freeArrays(A_d, B_d, C_d, - A_h, B_h, nullptr, false); - HIP_CHECK(hipGraphExecDestroy(graphExec)); - HIP_CHECK(hipGraphDestroy(graph)); -} - -static void hipGraphLaunch_test() { - constexpr size_t N = 1024; - constexpr size_t Nbytes = N * sizeof(char); - constexpr size_t val = 0; - constexpr size_t updateVal = 1; - char *A_d{nullptr}, *B_d{nullptr}, *C_d{nullptr}; - char *A_h{nullptr}, *B_h{nullptr}; - - HipTest::initArrays(&A_d, &B_d, &C_d, - &A_h, &B_h, nullptr, N, false); - - hipGraph_t graph; - hipGraphExec_t graphExec; - hipStream_t streamForGraph; - hipGraphNode_t memsetNode; - - HIP_CHECK(hipGraphCreate(&graph, 0)); - HIP_CHECK(hipStreamCreate(&streamForGraph)); - - hipMemsetParams memsetParams{}; - memset(&memsetParams, 0, sizeof(memsetParams)); - memsetParams.dst = reinterpret_cast(C_d); - memsetParams.value = val; - memsetParams.pitch = 0; - memsetParams.elementSize = sizeof(char); - memsetParams.width = Nbytes; - memsetParams.height = 1; - HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0, - &memsetParams)); - - std::vector dependencies; - dependencies.push_back(memsetNode); - - memset(&memsetParams, 0, sizeof(memsetParams)); - memsetParams.dst = reinterpret_cast(A_d); - memsetParams.value = updateVal; - memsetParams.pitch = 0; - memsetParams.elementSize = sizeof(char); - memsetParams.width = Nbytes; - memsetParams.height = 1; - HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, dependencies.data(), - dependencies.size(), &memsetParams)); - HIP_CHECK(hipGraphMemsetNodeSetParams(memsetNode, &memsetParams)); - dependencies.push_back(memsetNode); - - HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); - HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph)); - HIP_CHECK(hipStreamSynchronize(streamForGraph)); - - HIP_CHECK(hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost)); - - // Validating the result - for (size_t i = 0; i < Nbytes; i++) { - if (A_h[i] != updateVal) { - WARN("Validation failed at- " << i << " 
A_h[i] " << A_h[i]); - REQUIRE(false); - } - } - - HipTest::freeArrays(A_d, B_d, C_d, - A_h, B_h, nullptr, false); - HIP_CHECK(hipGraphExecDestroy(graphExec)); - HIP_CHECK(hipGraphDestroy(graph)); - HIP_CHECK(hipStreamDestroy(streamForGraph)); -} - -TEST_CASE("Unit_hipGraphLaunch_Functional_multidevice_test") { - int numDevices = 0; - HIP_CHECK(hipGetDeviceCount(&numDevices)); - - if (numDevices > 0) { - for (int i = 0; i < numDevices; i++) { - HIP_CHECK(hipSetDevice(i)); - hipGraphLaunch_test(); - } - } else { - SUCCEED("Skipped the testcase as there is no device to test."); - } -} - -// Function to fill input data -static void fillRandInpData(int *A1_h, int *A2_h, size_t N) { - unsigned int seed = time(nullptr); - for (size_t i = 0; i < N; i++) { - A1_h[i] = (HipTest::RAND_R(&seed) & 0xFF); - A2_h[i] = (HipTest::RAND_R(&seed) & 0xFF); - } -} -// Function to validate result -static void validateOutData(int *A1_h, int *A2_h, size_t N) { - for (size_t i = 0; i < N; i++) { - int result = (A1_h[i]*A1_h[i]); - REQUIRE(result == A2_h[i]); - } -} -/* - * 1.Create a graph with multiple nodes. Create an executable graph. - * Launch the executable graph 3 times in stream simultaneously. - * Wait for stream. Validate the output. No issues should be observed - * 2.Create a graph with multiple nodes. Create an executable graph. - * Verify if an executable graph be launched on null stream. 
-*/ -TEST_CASE("Unit_hipGraphLaunch_Functional_MultipleLaunch") { - size_t memSize = SIZE; - constexpr auto blocksPerCU = 6; // to hide latency - constexpr auto threadsPerBlock = 256; - unsigned blocks = HipTest::setNumBlocks(blocksPerCU, - threadsPerBlock, SIZE); - hipGraph_t graph; - std::vector nodeDependencies; - - HIP_CHECK(hipGraphCreate(&graph, 0)); - int *A_h{nullptr}, *A_d{nullptr}, *C_d{nullptr}, *C_h{nullptr}; - - HipTest::initArrays(&A_d, &C_d, nullptr, - &A_h, &C_h, nullptr, SIZE, false); - - hipGraphNode_t memcpyH2D, memcpyD2H, kernelNode; - - // Create memcpy H2D nodes - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph, nullptr, - 0, A_d, A_h, (sizeof(int)*SIZE), hipMemcpyHostToDevice)); - nodeDependencies.push_back(memcpyH2D); - // Creating kernel node - hipKernelNodeParams kerNodeParams; - void* kernelArgs[] = {reinterpret_cast(&A_d), - reinterpret_cast(&C_d), - reinterpret_cast(&memSize)}; - kerNodeParams.func = reinterpret_cast(HipTest::vector_square); - kerNodeParams.gridDim = dim3(blocks); - kerNodeParams.blockDim = dim3(threadsPerBlock); - kerNodeParams.sharedMemBytes = 0; - kerNodeParams.kernelParams = reinterpret_cast(kernelArgs); - kerNodeParams.extra = nullptr; - HIP_CHECK(hipGraphAddKernelNode(&kernelNode, graph, nodeDependencies.data(), - nodeDependencies.size(), &kerNodeParams)); - nodeDependencies.clear(); - nodeDependencies.push_back(kernelNode); - - // Create memcpy D2H nodes - HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H, graph, nodeDependencies.data(), - nodeDependencies.size(), C_h, C_d, (sizeof(int)*SIZE), - hipMemcpyDeviceToHost)); - nodeDependencies.clear(); - - // Create executable graph - hipStream_t streamForGraph; - hipGraphExec_t graphExec{nullptr}; - HIP_CHECK(hipStreamCreate(&streamForGraph)); - HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, - nullptr, 0)); - // Execute graph - SECTION("Multiple Graph Launch") { - for (int iter = 0; iter < TEST_LOOP_SIZE; iter++) { - fillRandInpData(A_h, C_h, SIZE); - 
HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph)); - HIP_CHECK(hipStreamSynchronize(streamForGraph)); - validateOutData(A_h, C_h, SIZE); - } - } - SECTION("Graph launch on Null stream") { - for (int iter = 0; iter < TEST_LOOP_SIZE; iter++) { - fillRandInpData(A_h, C_h, SIZE); - HIP_CHECK(hipGraphLaunch(graphExec, 0)); - HIP_CHECK(hipStreamSynchronize(0)); - validateOutData(A_h, C_h, SIZE); - } - } - - HIP_CHECK(hipGraphDestroy(graph)); - HIP_CHECK(hipGraphExecDestroy(graphExec)); - HIP_CHECK(hipStreamDestroy(streamForGraph)); - - // Free - HipTest::freeArrays(A_d, C_d, nullptr, A_h, C_h, nullptr, false); -} diff --git a/projects/hip-tests/catch/unit/graph/hipGraphLaunch_old.cc b/projects/hip-tests/catch/unit/graph/hipGraphLaunch_old.cc new file mode 100644 index 0000000000..21afeffce8 --- /dev/null +++ b/projects/hip-tests/catch/unit/graph/hipGraphLaunch_old.cc @@ -0,0 +1,412 @@ +/* +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include +#include +/* Test verifies hipGraphLaunch API +Negative scenarios - +1) Pass graphExec as nullptr and verify api returns error code. +2) Pass pGraphExec as nullptr and stream as hipStreamPerThread and verify api returns error code. +3) Pass pGraphExec as empty object and verify api returns error code. +4) Destroy executable graph and try to launch it. Make sure api should not crash and it should returns error code. +5) Destroy stream and try to launch respective executable graph. Make sure api should not crash and it should returns error code. +6) Destroy actual graph created and try to launch respective executable graph. + Check api should execute properly without crash or error code. +Functional Scenario - +1) Check basic functionality with stream as hipStreamPerThread +2) Test hipGraphLaunch call on multiple devices. +3) Create a graph with multiple nodes. Create an executable graph. + Launch the executable graph 3 times in stream simultaneously. + Wait for stream. Validate the output. No issues should be observed +4) Create a graph with multiple nodes. Create an executable graph. + Verify if an executable graph be launched on null stream. 
+*/ + +#define SIZE 1024 +#define TEST_LOOP_SIZE 3 + +TEST_CASE("Unit_hipGraphLaunch_Negative") { + hipError_t ret; + SECTION("Pass pGraphExec as nullptr") { + hipStream_t stream{}; + ret = hipGraphLaunch(nullptr, stream); + REQUIRE(hipErrorInvalidValue == ret); + } + SECTION("Pass pGraphExec as nullptr and stream as hipStreamPerThread") { + ret = hipGraphLaunch(nullptr, hipStreamPerThread); + REQUIRE(hipErrorInvalidValue == ret); + } + SECTION("Pass pGraphExec as empty object") { + hipGraphExec_t graphExec{}; + hipStream_t stream{}; + ret = hipGraphLaunch(graphExec, stream); + REQUIRE(hipErrorInvalidValue == ret); + } + SECTION("Destroy executable graph and try to launch it") { + constexpr size_t Nbytes = 1024; + hipGraph_t graph; + hipGraphExec_t graphExec; + hipStream_t stream; + hipGraphNode_t memsetNode; + + char *devData; + HIP_CHECK(hipMalloc(&devData, Nbytes)); + + HIP_CHECK(hipGraphCreate(&graph, 0)); + HIP_CHECK(hipStreamCreate(&stream)); + + hipMemsetParams memsetParams{}; + memset(&memsetParams, 0, sizeof(memsetParams)); + memsetParams.dst = reinterpret_cast(devData); + memsetParams.value = 0; + memsetParams.pitch = 0; + memsetParams.elementSize = sizeof(char); + memsetParams.width = Nbytes; + memsetParams.height = 1; + HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0, + &memsetParams)); + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + HIP_CHECK(hipGraphExecDestroy(graphExec)); + // Launch again after destroy graph exec object. 
+ ret = hipGraphLaunch(graphExec, stream); + REQUIRE(hipErrorInvalidValue == ret); + + HIP_CHECK(hipFree(devData)); + HIP_CHECK(hipGraphDestroy(graph)); + HIP_CHECK(hipStreamDestroy(stream)); + } +/* In this case in CUDA setup this api call is giving - unknown error (999) + So enabling this test for both AMD and CUDA by checking with hipSuccess */ + SECTION("Destroy stream and try to launch respective executable graph") { + constexpr size_t Nbytes = 1024; + hipGraph_t graph; + hipGraphExec_t graphExec; + hipStream_t stream; + hipGraphNode_t memsetNode; + + char *devData; + HIP_CHECK(hipMalloc(&devData, Nbytes)); + + HIP_CHECK(hipGraphCreate(&graph, 0)); + HIP_CHECK(hipStreamCreate(&stream)); + + hipMemsetParams memsetParams{}; + memset(&memsetParams, 0, sizeof(memsetParams)); + memsetParams.dst = reinterpret_cast(devData); + memsetParams.value = 0; + memsetParams.pitch = 0; + memsetParams.elementSize = sizeof(char); + memsetParams.width = Nbytes; + memsetParams.height = 1; + HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0, + &memsetParams)); + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + HIP_CHECK(hipStreamDestroy(stream)); + // Launch again after destroy stream + ret = hipGraphLaunch(graphExec, stream); + REQUIRE(hipSuccess != ret); + + HIP_CHECK(hipFree(devData)); + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipGraphDestroy(graph)); + } + SECTION("Destroy graph and try to launch respective executable graph") { + constexpr size_t Nbytes = 1024; + hipGraph_t graph; + hipGraphExec_t graphExec; + hipStream_t stream; + hipGraphNode_t memsetNode; + + char *devData; + HIP_CHECK(hipMalloc(&devData, Nbytes)); + + HIP_CHECK(hipGraphCreate(&graph, 0)); + HIP_CHECK(hipStreamCreate(&stream)); + + hipMemsetParams memsetParams{}; + memset(&memsetParams, 0, sizeof(memsetParams)); + memsetParams.dst = reinterpret_cast(devData); + 
memsetParams.value = 0; + memsetParams.pitch = 0; + memsetParams.elementSize = sizeof(char); + memsetParams.width = Nbytes; + memsetParams.height = 1; + HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0, + &memsetParams)); + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + HIP_CHECK(hipGraphLaunch(graphExec, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + + HIP_CHECK(hipGraphDestroy(graph)); + // Launch again after destroy graph + ret = hipGraphLaunch(graphExec, stream); + REQUIRE(hipSuccess == ret); + + HIP_CHECK(hipFree(devData)); + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipStreamDestroy(stream)); + } +} + +TEST_CASE("Unit_hipGraphLaunch_Functional_hipStreamPerThread") { + constexpr size_t N = 1024; + constexpr size_t Nbytes = N * sizeof(char); + constexpr size_t val = 0; + constexpr size_t updateVal = 2; + char *A_d{nullptr}, *B_d{nullptr}, *C_d{nullptr}; + char *A_h{nullptr}, *B_h{nullptr}; + + HipTest::initArrays(&A_d, &B_d, &C_d, + &A_h, &B_h, nullptr, N, false); + + hipGraph_t graph; + hipGraphExec_t graphExec; + hipGraphNode_t memsetNode; + + HIP_CHECK(hipGraphCreate(&graph, 0)); + + hipMemsetParams memsetParams{}; + memset(&memsetParams, 0, sizeof(memsetParams)); + memsetParams.dst = reinterpret_cast(C_d); + memsetParams.value = val; + memsetParams.pitch = 0; + memsetParams.elementSize = sizeof(char); + memsetParams.width = Nbytes; + memsetParams.height = 1; + HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0, + &memsetParams)); + + std::vector dependencies; + dependencies.push_back(memsetNode); + + memset(&memsetParams, 0, sizeof(memsetParams)); + memsetParams.dst = reinterpret_cast(A_d); + memsetParams.value = updateVal; + memsetParams.pitch = 0; + memsetParams.elementSize = sizeof(char); + memsetParams.width = Nbytes; + memsetParams.height = 1; + HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, dependencies.data(), + dependencies.size(), &memsetParams)); + 
HIP_CHECK(hipGraphMemsetNodeSetParams(memsetNode, &memsetParams)); + dependencies.push_back(memsetNode); + + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + HIP_CHECK(hipGraphLaunch(graphExec, hipStreamPerThread)); + HIP_CHECK(hipStreamSynchronize(hipStreamPerThread)); + + HIP_CHECK(hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost)); + + // Validating the result + for (size_t i = 0; i < Nbytes; i++) { + if (A_h[i] != updateVal) { + WARN("Validation failed at- " << i << " A_h[i] " << A_h[i]); + REQUIRE(false); + } + } + + HipTest::freeArrays(A_d, B_d, C_d, + A_h, B_h, nullptr, false); + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipGraphDestroy(graph)); +} + +static void hipGraphLaunch_test() { + constexpr size_t N = 1024; + constexpr size_t Nbytes = N * sizeof(char); + constexpr size_t val = 0; + constexpr size_t updateVal = 1; + char *A_d{nullptr}, *B_d{nullptr}, *C_d{nullptr}; + char *A_h{nullptr}, *B_h{nullptr}; + + HipTest::initArrays(&A_d, &B_d, &C_d, + &A_h, &B_h, nullptr, N, false); + + hipGraph_t graph; + hipGraphExec_t graphExec; + hipStream_t streamForGraph; + hipGraphNode_t memsetNode; + + HIP_CHECK(hipGraphCreate(&graph, 0)); + HIP_CHECK(hipStreamCreate(&streamForGraph)); + + hipMemsetParams memsetParams{}; + memset(&memsetParams, 0, sizeof(memsetParams)); + memsetParams.dst = reinterpret_cast(C_d); + memsetParams.value = val; + memsetParams.pitch = 0; + memsetParams.elementSize = sizeof(char); + memsetParams.width = Nbytes; + memsetParams.height = 1; + HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, nullptr, 0, + &memsetParams)); + + std::vector dependencies; + dependencies.push_back(memsetNode); + + memset(&memsetParams, 0, sizeof(memsetParams)); + memsetParams.dst = reinterpret_cast(A_d); + memsetParams.value = updateVal; + memsetParams.pitch = 0; + memsetParams.elementSize = sizeof(char); + memsetParams.width = Nbytes; + memsetParams.height = 1; + HIP_CHECK(hipGraphAddMemsetNode(&memsetNode, graph, 
dependencies.data(), + dependencies.size(), &memsetParams)); + HIP_CHECK(hipGraphMemsetNodeSetParams(memsetNode, &memsetParams)); + dependencies.push_back(memsetNode); + + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); + HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph)); + HIP_CHECK(hipStreamSynchronize(streamForGraph)); + + HIP_CHECK(hipMemcpy(A_h, A_d, Nbytes, hipMemcpyDeviceToHost)); + + // Validating the result + for (size_t i = 0; i < Nbytes; i++) { + if (A_h[i] != updateVal) { + WARN("Validation failed at- " << i << " A_h[i] " << A_h[i]); + REQUIRE(false); + } + } + + HipTest::freeArrays(A_d, B_d, C_d, + A_h, B_h, nullptr, false); + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipGraphDestroy(graph)); + HIP_CHECK(hipStreamDestroy(streamForGraph)); +} + +TEST_CASE("Unit_hipGraphLaunch_Functional_multidevice_test") { + int numDevices = 0; + HIP_CHECK(hipGetDeviceCount(&numDevices)); + + if (numDevices > 0) { + for (int i = 0; i < numDevices; i++) { + HIP_CHECK(hipSetDevice(i)); + hipGraphLaunch_test(); + } + } else { + SUCCEED("Skipped the testcase as there is no device to test."); + } +} + +// Function to fill input data +static void fillRandInpData(int *A1_h, int *A2_h, size_t N) { + unsigned int seed = time(nullptr); + for (size_t i = 0; i < N; i++) { + A1_h[i] = (HipTest::RAND_R(&seed) & 0xFF); + A2_h[i] = (HipTest::RAND_R(&seed) & 0xFF); + } +} +// Function to validate result +static void validateOutData(int *A1_h, int *A2_h, size_t N) { + for (size_t i = 0; i < N; i++) { + int result = (A1_h[i]*A1_h[i]); + REQUIRE(result == A2_h[i]); + } +} +/* + * 1.Create a graph with multiple nodes. Create an executable graph. + * Launch the executable graph 3 times in stream simultaneously. + * Wait for stream. Validate the output. No issues should be observed + * 2.Create a graph with multiple nodes. Create an executable graph. + * Verify if an executable graph be launched on null stream. 
+*/ +TEST_CASE("Unit_hipGraphLaunch_Functional_MultipleLaunch") { + size_t memSize = SIZE; + constexpr auto blocksPerCU = 6; // to hide latency + constexpr auto threadsPerBlock = 256; + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, + threadsPerBlock, SIZE); + hipGraph_t graph; + std::vector nodeDependencies; + + HIP_CHECK(hipGraphCreate(&graph, 0)); + int *A_h{nullptr}, *A_d{nullptr}, *C_d{nullptr}, *C_h{nullptr}; + + HipTest::initArrays(&A_d, &C_d, nullptr, + &A_h, &C_h, nullptr, SIZE, false); + + hipGraphNode_t memcpyH2D, memcpyD2H, kernelNode; + + // Create memcpy H2D nodes + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyH2D, graph, nullptr, + 0, A_d, A_h, (sizeof(int)*SIZE), hipMemcpyHostToDevice)); + nodeDependencies.push_back(memcpyH2D); + // Creating kernel node + hipKernelNodeParams kerNodeParams; + void* kernelArgs[] = {reinterpret_cast(&A_d), + reinterpret_cast(&C_d), + reinterpret_cast(&memSize)}; + kerNodeParams.func = reinterpret_cast(HipTest::vector_square); + kerNodeParams.gridDim = dim3(blocks); + kerNodeParams.blockDim = dim3(threadsPerBlock); + kerNodeParams.sharedMemBytes = 0; + kerNodeParams.kernelParams = reinterpret_cast(kernelArgs); + kerNodeParams.extra = nullptr; + HIP_CHECK(hipGraphAddKernelNode(&kernelNode, graph, nodeDependencies.data(), + nodeDependencies.size(), &kerNodeParams)); + nodeDependencies.clear(); + nodeDependencies.push_back(kernelNode); + + // Create memcpy D2H nodes + HIP_CHECK(hipGraphAddMemcpyNode1D(&memcpyD2H, graph, nodeDependencies.data(), + nodeDependencies.size(), C_h, C_d, (sizeof(int)*SIZE), + hipMemcpyDeviceToHost)); + nodeDependencies.clear(); + + // Create executable graph + hipStream_t streamForGraph; + hipGraphExec_t graphExec{nullptr}; + HIP_CHECK(hipStreamCreate(&streamForGraph)); + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, + nullptr, 0)); + // Execute graph + SECTION("Multiple Graph Launch") { + for (int iter = 0; iter < TEST_LOOP_SIZE; iter++) { + fillRandInpData(A_h, C_h, SIZE); + 
HIP_CHECK(hipGraphLaunch(graphExec, streamForGraph)); + HIP_CHECK(hipStreamSynchronize(streamForGraph)); + validateOutData(A_h, C_h, SIZE); + } + } + SECTION("Graph launch on Null stream") { + for (int iter = 0; iter < TEST_LOOP_SIZE; iter++) { + fillRandInpData(A_h, C_h, SIZE); + HIP_CHECK(hipGraphLaunch(graphExec, 0)); + HIP_CHECK(hipStreamSynchronize(0)); + validateOutData(A_h, C_h, SIZE); + } + } + + HIP_CHECK(hipGraphDestroy(graph)); + HIP_CHECK(hipGraphExecDestroy(graphExec)); + HIP_CHECK(hipStreamDestroy(streamForGraph)); + + // Free + HipTest::freeArrays(A_d, C_d, nullptr, A_h, C_h, nullptr, false); +} diff --git a/projects/hip-tests/catch/unit/graph/hipGraphUpload.cc b/projects/hip-tests/catch/unit/graph/hipGraphUpload.cc index 53b22e7e6f..f4db37c05f 100644 --- a/projects/hip-tests/catch/unit/graph/hipGraphUpload.cc +++ b/projects/hip-tests/catch/unit/graph/hipGraphUpload.cc @@ -261,9 +261,10 @@ TEST_CASE("Unit_hipGraphUpload_Functional_With_Priority_Stream") { 1) Pass graphExec node as nullptr. 
2) Pass graphExec node as uninitialize object 3) Pass stream as uninitialize object +4) Graphexec is destroyed before upload */ -TEST_CASE("Unit_hipGraphUpload_Negative_Argument_Check") { +TEST_CASE("Unit_hipGraphUpload_Negative_Parameters") { hipGraphExec_t graphExec{}; hipError_t ret; @@ -271,21 +272,30 @@ TEST_CASE("Unit_hipGraphUpload_Negative_Argument_Check") { HIP_CHECK(hipStreamCreate(&stream)); SECTION("Pass graphExec node as nullptr") { - ret = hipGraphUpload(nullptr, stream); - REQUIRE(hipErrorInvalidValue == ret); + HIP_CHECK_ERROR(hipGraphUpload(nullptr, stream), hipErrorInvalidValue); } SECTION("Pass graphExec node as uninitialize object") { - ret = hipGraphUpload(graphExec, stream); - REQUIRE(hipErrorInvalidValue == ret); + HIP_CHECK_ERROR(hipGraphUpload(graphExec, stream), hipErrorInvalidValue); } SECTION("Pass stream as uninitialize object") { hipStream_t stream1{}; hipGraph_t graph; HIP_CHECK(hipGraphCreate(&graph, 0)); - HIP_CHECK(hipGraphInstantiate(&graphExec, graph, NULL, NULL, 0)); + HIP_CHECK(hipGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0)); ret = hipGraphUpload(graphExec, stream1); REQUIRE(hipSuccess == ret); } + SECTION("graphExec is destroyed"){ + hipGraphExec_t graph_exec; + hipGraph_t graph; + + HIP_CHECK(hipGraphCreate(&graph, 0)); + HIP_CHECK(hipGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0)); + + HIP_CHECK(hipGraphUpload(graph_exec, hipStreamPerThread)); + HIP_CHECK(hipGraphExecDestroy(graph_exec)); + HIP_CHECK_ERROR(hipGraphUpload(graph_exec, hipStreamPerThread), hipErrorInvalidValue); + } HIP_CHECK(hipStreamDestroy(stream)); } diff --git a/projects/hip-tests/catch/unit/occupancy/CMakeLists.txt b/projects/hip-tests/catch/unit/occupancy/CMakeLists.txt index 2ad7eb5a6d..fbfd0dd90e 100644 --- a/projects/hip-tests/catch/unit/occupancy/CMakeLists.txt +++ b/projects/hip-tests/catch/unit/occupancy/CMakeLists.txt @@ -4,9 +4,23 @@ set(TEST_SRC hipOccupancyMaxActiveBlocksPerMultiprocessor_old.cc 
hipOccupancyMaxPotentialBlockSize.cc hipOccupancyMaxPotentialBlockSize_old.cc + hipModuleOccupancyMaxPotentialBlockSize.cc + hipModuleOccupancyMaxPotentialBlockSizeWithFlags.cc + hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.cc + hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.cc hipOccupancyMaxPotentialBlockSizeVariableSMemWithFlags.cc ) +add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/simple_kernel.code + COMMAND ${CMAKE_CXX_COMPILER} --genco --std=c++17 + ${CMAKE_CURRENT_SOURCE_DIR}/simple_kernel.cc + -o simple_kernel.code --rocm-path=${ROCM_PATH} + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/simple_kernel.cc) + +add_custom_target(simple_kernel ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/simple_kernel.code) + hip_add_exe_to_target(NAME OccupancyTest TEST_SRC ${TEST_SRC} TEST_TARGET_NAME build_tests) + +add_dependencies(OccupancyTest simple_kernel) diff --git a/projects/hip-tests/catch/unit/occupancy/hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.cc b/projects/hip-tests/catch/unit/occupancy/hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.cc new file mode 100644 index 0000000000..65e1b42ab3 --- /dev/null +++ b/projects/hip-tests/catch/unit/occupancy/hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.cc @@ -0,0 +1,92 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +/* +Testcase Scenarios : +Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_Positive_RangeValidation - Test correct +execution of hipModuleOccupancyMaxActiveBlocksPerMultiprocessor for diffrent parameter values +Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_Negative_Parameters - Test unsuccessful +execution of hipModuleOccupancyMaxActiveBlocksPerMultiprocessor api when parameters are invalid +*/ +#include "occupancy_common.hh" + +TEST_CASE("Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_Negative_Parameters") { + hipModule_t module; + hipFunction_t function; + int blockSize = 0; + int gridSize = 0; + + HIP_CHECK(hipFree(nullptr)); + + HIP_CHECK(hipModuleLoad(&module, "simple_kernel.code")); + HIPCHECK(hipModuleGetFunction(&function, module, "SimpleKernel")); + + // Get potential blocksize + HIP_CHECK(hipModuleOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, function, 0, 0)); + + // Common negative tests + MaxActiveBlocksPerMultiprocessorNegative( + [&function](int* numBlocks, int blockSize, size_t dynSharedMemPerBlk) { + return hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, function, blockSize, + dynSharedMemPerBlk); + }, + blockSize); + + HIP_CHECK(hipModuleUnload(module)); +} + +TEST_CASE("Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessor_Positive_RangeValidation") { + hipDeviceProp_t devProp; + hipModule_t module; + hipFunction_t function; + int blockSize = 0; + int gridSize = 0; + + HIP_CHECK(hipFree(nullptr)); + + HIP_CHECK(hipModuleLoad(&module, 
"simple_kernel.code")); + HIPCHECK(hipModuleGetFunction(&function, module, "SimpleKernel")); + + HIP_CHECK(hipGetDeviceProperties(&devProp, 0)); + + SECTION("dynSharedMemPerBlk = 0") { + // Get potential blocksize + HIP_CHECK(hipModuleOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, function, 0, 0)); + + MaxActiveBlocksPerMultiprocessor( + [blockSize, &function](int* numBlocks) { + return hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, function, blockSize, + 0); + }, + blockSize, devProp.maxThreadsPerMultiProcessor); + } + SECTION("dynSharedMemPerBlk = sharedMemPerBlock") { + // Get potential blocksize + HIP_CHECK(hipModuleOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, function, + devProp.sharedMemPerBlock, 0)); + + MaxActiveBlocksPerMultiprocessor( + [blockSize, devProp, &function](int* numBlocks) { + return hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, function, blockSize, + devProp.sharedMemPerBlock); + }, + blockSize, devProp.maxThreadsPerMultiProcessor); + } + + HIP_CHECK(hipModuleUnload(module)); +} diff --git a/projects/hip-tests/catch/unit/occupancy/hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.cc b/projects/hip-tests/catch/unit/occupancy/hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.cc new file mode 100644 index 0000000000..8df78e4481 --- /dev/null +++ b/projects/hip-tests/catch/unit/occupancy/hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.cc @@ -0,0 +1,103 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +/* +Testcase Scenarios : +Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_Positive_RangeValidation - Test +correct execution of hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags for diffrent +parameter values +Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_Negative_Parameters - Test +unsuccessful execution of hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags api when +parameters are invalid +*/ +#include "occupancy_common.hh" + +TEST_CASE("Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_Negative_Parameters") { + hipModule_t module; + hipFunction_t function; + int numBlocks = 0; + int blockSize = 0; + int gridSize = 0; + + HIP_CHECK(hipFree(nullptr)); + + HIP_CHECK(hipModuleLoad(&module, "simple_kernel.code")); + HIPCHECK(hipModuleGetFunction(&function, module, "SimpleKernel")); + + // Get potential blocksize + HIP_CHECK(hipModuleOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, function, 0, 0)); + + // Common negative tests + MaxActiveBlocksPerMultiprocessorNegative( + [&function](int* numBlocks, int blockSize, size_t dynSharedMemPerBlk) { + return hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + numBlocks, function, blockSize, dynSharedMemPerBlk, hipOccupancyDefault); + }, + blockSize); + + SECTION("Flag is invalid") { + // Only default flag is supported + HIP_CHECK_ERROR(hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + &numBlocks, function, blockSize, 0, 2), + hipErrorInvalidValue); + } + + HIP_CHECK(hipModuleUnload(module)); +} + +TEST_CASE( + "Unit_hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_Positive_RangeValidation") { + hipDeviceProp_t devProp; + hipModule_t module; + hipFunction_t function; + int blockSize = 0; + int gridSize = 0; + + HIP_CHECK(hipFree(nullptr)); + + HIP_CHECK(hipModuleLoad(&module, "simple_kernel.code")); + HIPCHECK(hipModuleGetFunction(&function, module, "SimpleKernel")); + + 
HIP_CHECK(hipGetDeviceProperties(&devProp, 0)); + + SECTION("dynSharedMemPerBlk = 0") { + // Get potential blocksize + HIP_CHECK(hipModuleOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, function, 0, 0)); + + MaxActiveBlocksPerMultiprocessor( + [blockSize, &function](int* numBlocks) { + return hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + numBlocks, function, blockSize, 0, hipOccupancyDefault); + }, + blockSize, devProp.maxThreadsPerMultiProcessor); + } + SECTION("dynSharedMemPerBlk = sharedMemPerBlock") { + // Get potential blocksize + HIP_CHECK(hipModuleOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, function, + devProp.sharedMemPerBlock, 0)); + + MaxActiveBlocksPerMultiprocessor( + [blockSize, devProp, &function](int* numBlocks) { + return hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + numBlocks, function, blockSize, devProp.sharedMemPerBlock, hipOccupancyDefault); + }, + blockSize, devProp.maxThreadsPerMultiProcessor); + } + + HIP_CHECK(hipModuleUnload(module)); +} diff --git a/projects/hip-tests/catch/unit/occupancy/hipModuleOccupancyMaxPotentialBlockSize.cc b/projects/hip-tests/catch/unit/occupancy/hipModuleOccupancyMaxPotentialBlockSize.cc new file mode 100644 index 0000000000..6f49b9efe8 --- /dev/null +++ b/projects/hip-tests/catch/unit/occupancy/hipModuleOccupancyMaxPotentialBlockSize.cc @@ -0,0 +1,75 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +/* +Testcase Scenarios : +Unit_hipModuleOccupancyMaxPotentialBlockSize_Positive_RangeValidation - Test correct execution of +hipModuleOccupancyMaxPotentialBlockSize for diffrent parameter values +Unit_hipModuleOccupancyMaxPotentialBlockSize_Negative_Parameters - Test unsuccessful execution of +hipModuleOccupancyMaxPotentialBlockSize api when parameters are invalid +*/ +#include "occupancy_common.hh" + +TEST_CASE("Unit_hipModuleOccupancyMaxPotentialBlockSize_Negative_Parameters") { + hipModule_t module; + hipFunction_t function; + + HIP_CHECK(hipFree(nullptr)); + + HIP_CHECK(hipModuleLoad(&module, "simple_kernel.code")); + HIPCHECK(hipModuleGetFunction(&function, module, "SimpleKernel")); + + // Common negative tests + MaxPotentialBlockSizeNegative([&function](int* gridSize, int* blockSize) { + return hipModuleOccupancyMaxPotentialBlockSize(gridSize, blockSize, function, 0, 0); + }); + + HIP_CHECK(hipModuleUnload(module)); +} + +TEST_CASE("Unit_hipModuleOccupancyMaxPotentialBlockSize_Positive_RangeValidation") { + hipDeviceProp_t devProp; + hipModule_t module; + hipFunction_t function; + + HIP_CHECK(hipFree(nullptr)); + + HIP_CHECK(hipModuleLoad(&module, "simple_kernel.code")); + HIPCHECK(hipModuleGetFunction(&function, module, "SimpleKernel")); + + HIP_CHECK(hipGetDeviceProperties(&devProp, 0)); + + SECTION("dynSharedMemPerBlk = 0, blockSizeLimit = 0") { + MaxPotentialBlockSize( + [&function](int* gridSize, int* blockSize) { + return hipModuleOccupancyMaxPotentialBlockSize(gridSize, blockSize, function, 0, 0); + }, + devProp.maxThreadsPerBlock); + } + + SECTION("dynSharedMemPerBlk = sharedMemPerBlock, blockSizeLimit = maxThreadsPerBlock") { + MaxPotentialBlockSize( + [&function, devProp](int* gridSize, int* blockSize) { + return hipModuleOccupancyMaxPotentialBlockSize( + gridSize, blockSize, function, devProp.sharedMemPerBlock, devProp.maxThreadsPerBlock); + }, + devProp.maxThreadsPerBlock); + } + + HIP_CHECK(hipModuleUnload(module)); +} diff --git 
a/projects/hip-tests/catch/unit/occupancy/hipModuleOccupancyMaxPotentialBlockSizeWithFlags.cc b/projects/hip-tests/catch/unit/occupancy/hipModuleOccupancyMaxPotentialBlockSizeWithFlags.cc new file mode 100644 index 0000000000..50107b465f --- /dev/null +++ b/projects/hip-tests/catch/unit/occupancy/hipModuleOccupancyMaxPotentialBlockSizeWithFlags.cc @@ -0,0 +1,87 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +/* +Testcase Scenarios : +Unit_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_Positive_RangeValidation - Test correct +execution of hipModuleOccupancyMaxPotentialBlockSizeWithFlags for diffrent parameter values +Unit_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_Negative_Parameters - Test unsuccessful +execution of hipModuleOccupancyMaxPotentialBlockSizeWithFlags api when parameters are invalid +*/ +#include "occupancy_common.hh" + +TEST_CASE("Unit_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_Negative_Parameters") { + hipModule_t module; + hipFunction_t function; + int blockSize = 0; + int gridSize = 0; + + HIP_CHECK(hipFree(nullptr)); + + HIP_CHECK(hipModuleLoad(&module, "simple_kernel.code")); + HIPCHECK(hipModuleGetFunction(&function, module, "SimpleKernel")); + + // Common negative tests + MaxPotentialBlockSizeNegative([&function](int* gridSize, int* blockSize) { + return hipModuleOccupancyMaxPotentialBlockSizeWithFlags(gridSize, blockSize, function, 0, 0, + hipOccupancyDefault); + }); + + SECTION("Flag is invalid") { + // Only default flag is supported + HIP_CHECK_ERROR( + hipModuleOccupancyMaxPotentialBlockSizeWithFlags(&gridSize, &blockSize, function, 0, 0, 2), + hipErrorInvalidValue); + } + + HIP_CHECK(hipModuleUnload(module)); +} + +TEST_CASE("Unit_hipModuleOccupancyMaxPotentialBlockSizeWithFlags_Positive_RangeValidation") { + hipDeviceProp_t devProp; + hipModule_t module; + hipFunction_t function; + + HIP_CHECK(hipFree(nullptr)); + + HIP_CHECK(hipModuleLoad(&module, "simple_kernel.code")); + HIPCHECK(hipModuleGetFunction(&function, module, "SimpleKernel")); + + HIP_CHECK(hipGetDeviceProperties(&devProp, 0)); + + SECTION("dynSharedMemPerBlk = 0, blockSizeLimit = 0") { + MaxPotentialBlockSize( + [&function](int* gridSize, int* blockSize) { + return hipModuleOccupancyMaxPotentialBlockSizeWithFlags(gridSize, blockSize, function, 0, + 0, hipOccupancyDefault); + }, + devProp.maxThreadsPerBlock); + } + + SECTION("dynSharedMemPerBlk = 
sharedMemPerBlock, blockSizeLimit = maxThreadsPerBlock") { + MaxPotentialBlockSize( + [&function, devProp](int* gridSize, int* blockSize) { + return hipModuleOccupancyMaxPotentialBlockSizeWithFlags( + gridSize, blockSize, function, devProp.sharedMemPerBlock, devProp.maxThreadsPerBlock, + hipOccupancyDefault); + }, + devProp.maxThreadsPerBlock); + } + + HIP_CHECK(hipModuleUnload(module)); +} diff --git a/projects/hip-tests/catch/unit/occupancy/occupancy_common.hh b/projects/hip-tests/catch/unit/occupancy/occupancy_common.hh index d03caad35b..5d71c7dac0 100644 --- a/projects/hip-tests/catch/unit/occupancy/occupancy_common.hh +++ b/projects/hip-tests/catch/unit/occupancy/occupancy_common.hh @@ -66,7 +66,5 @@ template void MaxActiveBlocksPerMultiprocessorNegative(F func, int SECTION("numBlocks is nullptr") { HIP_CHECK_ERROR(func(nullptr, blockSize, 0), hipErrorInvalidValue); } - SECTION("Block size is 0") { - HIP_CHECK_ERROR(func(&numBlocks, 0, 0), hipErrorInvalidValue); - } + SECTION("Block size is 0") { HIP_CHECK_ERROR(func(&numBlocks, 0, 0), hipErrorInvalidValue); } } diff --git a/projects/hip-tests/catch/unit/occupancy/simple_kernel.cc b/projects/hip-tests/catch/unit/occupancy/simple_kernel.cc new file mode 100644 index 0000000000..e1bf415e95 --- /dev/null +++ b/projects/hip-tests/catch/unit/occupancy/simple_kernel.cc @@ -0,0 +1,25 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "hip/hip_runtime.h" + +extern "C" __global__ void SimpleKernel(int* a, int* b) { + int tx = threadIdx.x; + b[tx] = a[tx]; +} diff --git a/projects/hip-tests/catch/unit/warp/CMakeLists.txt b/projects/hip-tests/catch/unit/warp/CMakeLists.txt new file mode 100644 index 0000000000..cd2ec5226b --- /dev/null +++ b/projects/hip-tests/catch/unit/warp/CMakeLists.txt @@ -0,0 +1,9 @@ +# Common Tests - Test independent of all platforms +set(TEST_SRC + warp_shfl_xor.cc + warp_shfl.cc +) + +hip_add_exe_to_target(NAME WarpTest + TEST_SRC ${TEST_SRC} + TEST_TARGET_NAME build_tests) diff --git a/projects/hip-tests/catch/unit/warp/warp_common.hh b/projects/hip-tests/catch/unit/warp/warp_common.hh new file mode 100644 index 0000000000..d09e96837e --- /dev/null +++ b/projects/hip-tests/catch/unit/warp/warp_common.hh @@ -0,0 +1,84 @@ +/* +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +#include +#include + +static __device__ bool deactivate_thread(const uint64_t* const active_masks) { + const auto warp = + cooperative_groups::tiled_partition(cooperative_groups::this_thread_block(), warpSize); + const auto block = cooperative_groups::this_thread_block(); + const auto warps_per_block = (block.size() + warpSize - 1) / warpSize; + const auto block_rank = (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x; + const auto idx = block_rank * warps_per_block + block.thread_rank() / warpSize; + + return !(active_masks[idx] & (static_cast(1) << warp.thread_rank())); +} + +static inline std::mt19937& GetRandomGenerator() { + static std::mt19937 mt(std::random_device{}()); + return mt; +} + +template static inline T GenerateRandomInteger(const T min, const T max) { + std::uniform_int_distribution dist(min, max); + return dist(GetRandomGenerator()); +} + +template static inline T GenerateRandomReal(const T min, const T max) { + std::uniform_real_distribution dist(min, max); + return dist(GetRandomGenerator()); +} + +inline int generate_width(int warp_size) { + int exponent = 0; + while (warp_size >>= 1) { + ++exponent; + } + + return GENERATE_COPY(map([](int e) { return 1 << e; }, range(1, exponent + 1))); +} + +inline uint64_t get_active_mask(unsigned int warp_id, unsigned int warp_size) { + uint64_t active_mask = 0; + switch (warp_id % 5) { + case 0: // even threads in the warp + active_mask = 0xAAAAAAAAAAAAAAAA; + break; + case 1: // odd threads in the warp + 
active_mask = 0x5555555555555555; + break; + case 2: // first half of the warp + for (int i = 0; i < warp_size / 2; i++) { + active_mask = active_mask | (static_cast(1) << i); + } + break; + case 3: // second half of the warp + for (int i = warp_size / 2; i < warp_size; i++) { + active_mask = active_mask | (static_cast(1) << i); + } + break; + case 4: // all threads + active_mask = 0xFFFFFFFFFFFFFFFF; + break; + } + return active_mask; +} diff --git a/projects/hip-tests/catch/unit/warp/warp_shfl.cc b/projects/hip-tests/catch/unit/warp/warp_shfl.cc new file mode 100644 index 0000000000..babb814fe4 --- /dev/null +++ b/projects/hip-tests/catch/unit/warp/warp_shfl.cc @@ -0,0 +1,121 @@ +/* +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include "warp_shfl_common.hh" + +#include + +/** + * @addtogroup shfl shfl + * @{ + * @ingroup DeviceLanguageTest + * `T __shfl(T var, int src_lane, int width = warpSize)` - + * Contains unit test for warp shfl function + */ + +namespace cg = cooperative_groups; + +template +__global__ void shfl(T* const out, const T* const in, const uint64_t* const active_masks, + const uint8_t* const src_lanes, const int width) { + if (deactivate_thread(active_masks)) { + return; + } + const auto grid = cg::this_grid(); + const auto block = cg::this_thread_block(); + T var = in[grid.thread_rank()]; + out[grid.thread_rank()] = __shfl(var, src_lanes[block.thread_rank() % width], width); +} + +template class WarpShfl : public WarpShflTest, T> { + public: + void launch_kernel(T* const arr_dev, T* const input_dev, const uint64_t* const active_masks) { + width_ = generate_width(this->warp_size_); + INFO("Width: " << width_); + const auto alloc_size = width_ * sizeof(uint8_t); + LinearAllocGuard src_lanes_dev(LinearAllocs::hipMalloc, alloc_size); + src_lanes_.resize(width_); + std::generate(src_lanes_.begin(), src_lanes_.end(), + [this] { return GenerateRandomInteger(0, static_cast(2 * width_)); }); + + HIP_CHECK(hipMemcpy(src_lanes_dev.ptr(), src_lanes_.data(), alloc_size, hipMemcpyHostToDevice)); + shfl<<grid_.grid_dim_, this->grid_.block_dim_>>>(arr_dev, input_dev, active_masks, + src_lanes_dev.ptr(), width_); + } + + void validate(const T* const arr, const T* const input) { + ArrayAllOf(arr, this->grid_.thread_count_, [this, &input](unsigned int i) -> std::optional { + const auto rank_in_block = this->grid_.thread_rank_in_block(i).value(); + const auto rank_in_warp = rank_in_block % this->warp_size_; + const auto rank_in_partition = rank_in_block % width_; + const int src_lane = src_lanes_[rank_in_partition] % width_; + const int src_offset = src_lane - rank_in_partition; + + const auto mask_idx = this->warps_in_block_ * (i / this->grid_.threads_in_block_count_) + + 
rank_in_block / this->warp_size_; + const std::bitset active_mask(this->active_masks_[mask_idx]); + + if (!active_mask.test(rank_in_warp) || (!active_mask.test((rank_in_warp + src_offset))) || + (rank_in_block + src_offset >= this->grid_.threads_in_block_count_)) { + return std::nullopt; + } + + return input[i + src_offset]; + }); + }; + + private: + std::vector src_lanes_; + int width_; +}; + +/** + * Test Description + * ------------------------ + * - Validates the warp shuffle behavior for all valid width sizes {2, 4, 8, 16, 32, + * 64(if supported)} for generated shuffle target lanes. The threads are deactivated based on the + * passed active mask. The test is run for all overloads of shfl. + * Test source + * ------------------------ + * - unit/warp/warp_shfl.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + * - Device supports warp shuffle + */ +TEMPLATE_TEST_CASE("Unit_Warp_Shfl_Positive_Basic", "", int, unsigned int, long, unsigned long, + long long, unsigned long long, float, double) { + int device; + hipDeviceProp_t device_properties; + HIP_CHECK(hipGetDevice(&device)); + HIP_CHECK(hipGetDeviceProperties(&device_properties, device)); + + if (!device_properties.arch.hasWarpShuffle) { + HipTest::HIP_SKIP_TEST("Device doesn't support Warp Shuffle!"); + return; + } + + SECTION("Shfl with specified active mask and input values") { + WarpShfl().run(false); + } + + SECTION("Shfl with random active mask and input values") { + WarpShfl().run(true); + } +} diff --git a/projects/hip-tests/catch/unit/warp/warp_shfl_common.hh b/projects/hip-tests/catch/unit/warp/warp_shfl_common.hh new file mode 100644 index 0000000000..97b2677f31 --- /dev/null +++ b/projects/hip-tests/catch/unit/warp/warp_shfl_common.hh @@ -0,0 +1,114 @@ +/* +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. 
+Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#pragma once + +#include "warp_common.hh" + +#include +#include +#include + +template class WarpShflTest { + public: + WarpShflTest() : warp_size_{get_warp_size()} {} + + void run(bool random = false) { + const auto blocks = GenerateBlockDimensionsForShuffle(); + INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z); + const auto threads = GenerateThreadDimensionsForShuffle(); + INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z); + grid_ = CPUGrid(blocks, threads); + + const auto alloc_size = grid_.thread_count_ * sizeof(T); + LinearAllocGuard input_dev(LinearAllocs::hipMalloc, alloc_size); + LinearAllocGuard input(LinearAllocs::hipHostMalloc, alloc_size); + LinearAllocGuard arr_dev(LinearAllocs::hipMalloc, alloc_size); + LinearAllocGuard arr(LinearAllocs::hipHostMalloc, alloc_size); + HIP_CHECK(hipMemset(arr_dev.ptr(), 0, alloc_size)); + + warps_in_block_ = (grid_.threads_in_block_count_ + warp_size_ - 1) / warp_size_; + const auto warps_in_grid = warps_in_block_ * grid_.block_count_; + LinearAllocGuard active_masks_dev(LinearAllocs::hipMalloc, + warps_in_grid * sizeof(uint64_t)); + active_masks_.resize(warps_in_grid); + + generate_input(input.ptr(), random); + + HIP_CHECK(hipMemcpy(active_masks_dev.ptr(), active_masks_.data(), + warps_in_grid * sizeof(uint64_t), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(input_dev.ptr(), input.ptr(), alloc_size, hipMemcpyHostToDevice)); + cast_to_derived().launch_kernel(arr_dev.ptr(), input_dev.ptr(), active_masks_dev.ptr()); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipMemcpy(arr.ptr(), arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipDeviceSynchronize()); + + cast_to_derived().validate(arr.ptr(), input.ptr()); + } + + private: + int get_warp_size() const { + int current_dev = -1; + HIP_CHECK(hipGetDevice(¤t_dev)); + int warp_size = 0u; + HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0)); + return 
warp_size; + } + + void generate_input(T* input, bool random) { + if (random) { + std::generate(active_masks_.begin(), active_masks_.end(), [] { + return GenerateRandomInteger(0ul, std::numeric_limits().max()); + }); + + if constexpr (std::is_same_v || std::is_same_v) { + std::generate_n(input, grid_.thread_count_, [] { + return static_cast( + GenerateRandomReal(std::numeric_limits().min(), std::numeric_limits().max())); + }); + } else { + std::generate_n(input, grid_.thread_count_, [] { + return static_cast(GenerateRandomInteger(std::numeric_limits().min(), + std::numeric_limits().max())); + }); + } + } else { + unsigned long long int i = 0; + std::generate(active_masks_.begin(), active_masks_.end(), + [this, &i]() { return get_active_mask(i++, warp_size_); }); + + i = 0; + std::generate_n(input, grid_.thread_count_, [&i]() { + if (static_cast(i) > std::numeric_limits().max()) + i = 0; + else + i++; + return static_cast(i); + }); + } + } + + Derived& cast_to_derived() { return reinterpret_cast(*this); } + + protected: + const int warp_size_; + CPUGrid grid_; + unsigned int warps_in_block_; + std::vector active_masks_; +}; diff --git a/projects/hip-tests/catch/unit/warp/warp_shfl_xor.cc b/projects/hip-tests/catch/unit/warp/warp_shfl_xor.cc new file mode 100644 index 0000000000..3edbca1b3a --- /dev/null +++ b/projects/hip-tests/catch/unit/warp/warp_shfl_xor.cc @@ -0,0 +1,118 @@ +/* +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. 
+Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include "warp_shfl_common.hh" + +#include + +/** + * @addtogroup shfl_xor shfl_xor + * @{ + * @ingroup DeviceLanguageTest + * `T __shfl_xor(T var, int lane_mask, int width = warpSize)` - + * Contains unit test for warp shfl_xor function + */ + +namespace cg = cooperative_groups; + +template +__global__ void shfl_xor(T* const out, const T* const in, const uint64_t* const active_masks, + const int lane_mask, const int width) { + if (deactivate_thread(active_masks)) { + return; + } + + const auto grid = cg::this_grid(); + T var = in[grid.thread_rank()]; + out[grid.thread_rank()] = __shfl_xor(var, lane_mask, width); +} + +template class WarpShflXOR : public WarpShflTest, T> { + public: + void launch_kernel(T* const arr_dev, T* const input_dev, const uint64_t* const active_masks) { + width_ = generate_width(this->warp_size_); + INFO("Width: " << width_); + lane_mask_ = GENERATE_COPY(range(0, this->warp_size_)); + INFO("Lane mask: " << lane_mask_); + shfl_xor<<grid_.grid_dim_, this->grid_.block_dim_>>>(arr_dev, input_dev, active_masks, + lane_mask_, width_); + } + + void validate(const T* const arr, const T* const input) { + ArrayAllOf(arr, this->grid_.thread_count_, [this, &input](unsigned int i) -> std::optional { + const auto rank_in_block = this->grid_.thread_rank_in_block(i).value(); + const auto rank_in_warp = rank_in_block % this->warp_size_; + const int warp_target = rank_in_warp ^ this->lane_mask_; + const int target_offset = warp_target - rank_in_warp; + const auto mask_idx = this->warps_in_block_ * (i / this->grid_.threads_in_block_count_) + + rank_in_block / this->warp_size_; + const std::bitset active_mask(this->active_masks_[mask_idx]); + + const auto target_partition = warp_target / width_; + const auto partition_rank = rank_in_warp / width_; + if (!active_mask.test(rank_in_warp) || + (target_partition <= partition_rank && !active_mask.test(rank_in_warp + target_offset)) || + (target_partition <= partition_rank && + rank_in_block + target_offset >= 
this->grid_.threads_in_block_count_)) { + return std::nullopt; + } + + return target_partition > partition_rank ? input[i] : input[i + target_offset]; + }); + }; + + private: + int lane_mask_; + int width_; +}; + +/** + * Test Description + * ------------------------ + * - Validates the warp shuffle xor behavior for all valid width sizes {2, 4, 8, 16, 32, + * 64(if supported)} for mask values of [0, width). The threads are deactivated based on the + * passed active mask. The test is run for all overloads of shfl_xor. + * Test source + * ------------------------ + * - unit/warp/warp_shfl_xor.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + * - Device supports warp shuffle + */ +TEMPLATE_TEST_CASE("Unit_Warp_Shfl_XOR_Positive_Basic", "", int, unsigned int, long, unsigned long, + long long, unsigned long long, float, double) { + int device; + hipDeviceProp_t device_properties; + HIP_CHECK(hipGetDevice(&device)); + HIP_CHECK(hipGetDeviceProperties(&device_properties, device)); + + if (!device_properties.arch.hasWarpShuffle) { + HipTest::HIP_SKIP_TEST("Device doesn't support Warp Shuffle!"); + return; + } + + SECTION("Shfl Xor with specified active mask and input values") { + WarpShflXOR().run(false); + } + + SECTION("Shfl Xor with random active mask and input values") { + WarpShflXOR().run(true); + } +} \ No newline at end of file